Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
Browse files
Merge pull request #185 from apache/python3
Update scripts for Python 3
  • Loading branch information
buggtb committed Mar 29, 2019
2 parents c77e553 + e4571a6 commit d0aae2f52b4feb95b11fd43fd1d4f14e4c1b9d82
Showing 5 changed files with 56 additions and 67 deletions.
@@ -21,18 +21,18 @@

def main(argv=None):
if len(argv) == 0:
print "No Repo details to dump"
print("No Repo details to dump")
sys.exit()

if os.getenv("DRAT_HOME")==None:
print "Please add DRAT_HOME environment variable and try again";
print("Please add DRAT_HOME environment variable and try again");
sys.exit()

default_repo_file_url = os.getenv("DRAT_HOME") + "/conf/repo.default.txt"
with open(default_repo_file_url,'rb')as repoFile:
data = ''
for line in repoFile:
data+=line
data+=line.strip().decode('utf-8')
rep = eval(data)

reponame = os.path.basename(os.path.normpath(argv[0]))
@@ -45,7 +45,7 @@ def main(argv=None):
file.write(json.dumps(rep))
file.close()

print rep
print(rep)

if __name__ == "__main__":
main(sys.argv[1:])
@@ -26,27 +26,23 @@
import time
import shutil
import datetime
import csv
import urllib2
from urllib.request import urlopen, Request
import json
import xmlrpclib
import getopt
import glob
import md5
import xmlrpc

# Check for environment variables
def check_env_var():
    """Exit with status 1 unless every required environment variable is set.

    Checks DRAT_HOME, JAVA_HOME, SOLR_DRAT_URL and WORKFLOW_URL in that
    order and prints which variable is missing before exiting, matching
    the original per-variable messages.
    """
    # Original duplicated this check four times (and still carried the
    # Python 2 `print` statements from the diff); a loop keeps the same
    # order, messages and exit behavior.
    for name in ("DRAT_HOME", "JAVA_HOME", "SOLR_DRAT_URL", "WORKFLOW_URL"):
        if os.getenv(name) is None:
            print("Environment variable $%s is not set." % name)
            sys.exit(1)


@@ -83,7 +79,7 @@ def help():

# Printing out on Console
def printnow(string):
    """Print *string* and flush stdout immediately.

    Used for live progress output so messages appear even when stdout
    is block-buffered (e.g. when piped to a log file).
    """
    # The diff left the Python 2 `print string` statement alongside the
    # Python 3 call; only the function form is valid here.
    print(string)
    sys.stdout.flush()


@@ -176,7 +172,7 @@ def drat_reset():
# Check if there are any pending PGE jobs in the queue
def job_in_queue(job_name):
status = "PGE EXEC"
server = xmlrpclib.ServerProxy(os.getenv("WORKFLOW_URL"), verbose=False)
server = xmlrpc.client.ServerProxy(os.getenv("WORKFLOW_URL"), verbose=False)


for x in range(0,6):
@@ -225,9 +221,9 @@ def parse_license(s):
# Index into Solr
def index_solr(json_data):
    """POST *json_data* (a JSON string) to the Solr statistics core.

    Builds a commit-on-update request against $SOLR_URL and posts the
    payload with a JSON content type.  NOTE(review): assumes $SOLR_URL
    is set -- os.getenv returns None otherwise and the concatenation
    would raise TypeError; confirm callers run check_env_var first.
    """
    printnow(json_data)
    request = Request(os.getenv("SOLR_URL") + "/statistics/update/json?commit=true")
    request.add_header('Content-type', 'application/json')
    # urlopen's data argument must be bytes on Python 3; the original
    # passed the str directly, which raises TypeError at runtime.
    urlopen(request, json_data.encode('utf-8'))


# Run DRAT and collect statistics
@@ -24,11 +24,12 @@

import sys
import json
import os
import getopt
import urllib2
import xmlrpclib
urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1))
import urllib
from urllib.request import urlopen, Request
from xmlrpc import client

#urllib.request.build_opener(urllib.HTTPHandler(debuglevel=1))
solrPostfix = "/select/?q=mimetype:$type&version=2.2&start=0&rows=10&indent=on&facet=on&facet.field=mimetype&wt=json&fl=filelocation,filename"
solrPostfixByPage = "/select/?q=mimetype:$type&version=2.2&start=$i&rows=$num&indent=on&facet=on&facet.field=mimetype&wt=json&fl=filelocation,filename"

@@ -41,26 +42,26 @@ def executeRatJobs(url, num, type, workflowUrl, taskIds):
if not url.endswith("/"):
url = url + "/"
solrUrl = url+solrPostfix.replace("$type", type)
print "GET "+solrUrl
print("GET "+solrUrl)
numFound = 0
req = urllib2.Request(solrUrl)
req = Request(solrUrl)
try:
f = urllib2.urlopen(req)
jsonResp = json.loads(f.read())
f = urlopen(req)
jsonResp = json.loads(f.read().decode('utf-8'))
numFound = int(jsonResp["response"]["numFound"])
except urllib2.HTTPError, (err):
print "HTTP error(%s)" % (err)
print "Aborting RAT execution"
except urllib.error.HTTPError as err:
print("HTTP error(%s)" % (err))
print("Aborting RAT execution")
return

wm = xmlrpclib.Server(workflowUrl)
wm = client.Server(workflowUrl)


for i in range(0, numFound, num):
ratSolrUrl = url + solrPostfixByPage.replace("$type", type).replace("$i", str(i)).replace("$num",str(num))
req = urllib2.Request(ratSolrUrl)
f = urllib2.urlopen(req)
jsonResp = json.loads(f.read())
req = Request(ratSolrUrl)
f = urlopen(req)
jsonResp = json.loads(f.read().decode('utf-8'))
docs = jsonResp["response"]["docs"]
metadata = {}
metadata["MimeType"] = type
@@ -75,13 +76,13 @@ def executeRatJobs(url, num, type, workflowUrl, taskIds):
metadata["InputFiles"] = []
metadata["InputFiles"].append(fullpath)

print "Metadata is "+str(metadata)
print("Metadata is "+str(metadata))
wm.workflowmgr.executeDynamicWorkflow([taskIds], metadata)


def get_mime_types(solrUrl):
neg_mimetype = ["image", "application", "text", "video", "audio", "message", "multipart"]
connection = urllib2.urlopen(solrUrl + "/select?q=*%3A*&rows=0&facet=true&facet.field=mimetype&wt=python&indent=true")
connection = urlopen(solrUrl + "/select?q=*%3A*&rows=0&facet=true&facet.field=mimetype&wt=python&indent=true")
response = eval(connection.read())
mime_count = response["facet_counts"]["facet_fields"]["mimetype"]
stats = {}
@@ -101,11 +102,11 @@ def main(argv):
try:
opts, args = getopt.getopt(argv,"hu:c:w:t:",["solrUrl=", "numFilesPerJob=", "workflowUrl=", "ratTaskId="])
except getopt.GetoptError:
print usage
print(usage)
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print usage
print(usage)
sys.exit()
elif opt in ("-u", "--solrUrl"):
solrUrl = arg
@@ -117,15 +118,15 @@ def main(argv):
ratTaskId = arg

if solrUrl == "" or numFilesPerJob == 0 or workflowUrl == "" or ratTaskId == "":
print usage
print(usage)
sys.exit()


print "Configured SOLR url: ["+solrUrl+"]"
print("Configured SOLR url: ["+solrUrl+"]")
mimeTypes = get_mime_types(solrUrl)

for type in mimeTypes:
print "Executing RAT for MIME: ["+type+"]: num files per job: ["+str(numFilesPerJob)+"]"
print("Executing RAT for MIME: ["+type+"]: num files per job: ["+str(numFilesPerJob)+"]")
executeRatJobs(solrUrl, numFilesPerJob, type, workflowUrl, ratTaskId)

if __name__ == "__main__":
@@ -25,31 +25,24 @@

import sys
import os
import getopt
import subprocess
import time
import shutil
import datetime
import csv
import urllib2

from urllib.request import urlopen, Request
import json
import xmlrpclib
import getopt
import glob
import md5
import hashlib
import requests


def parse_license(s):
li_dict = {'N': 'Notes', 'B': 'Binaries', 'A': 'Archives', 'AL': 'Apache', '!?????': 'Unknown'}
if s and not s.isspace():
arr = s.split("/", 1)
arr = s.split(b"/", 1)
li = arr[0].strip()
if li in li_dict:
li = li_dict[li]

if len(arr) > 1 and len(arr[1].split("/")) > 0:
return [arr[1].split("/")[-1], li]
if len(arr) > 1 and len(arr[1].split(b"/")) > 0:
return [arr[1].split(b"/")[-1], li]
else:
#print('split not correct during license parsing '+str(arr))
return ["/dev/null", li_dict['!?????']]
@@ -98,9 +91,9 @@ def count_num_files(path, exclude):

def index_solr(json_data):
    """POST *json_data* (a JSON string) to the Solr statistics core.

    Issues a commit-on-update request against $SOLR_URL with a JSON
    content type.  NOTE(review): assumes $SOLR_URL is set; os.getenv
    returns None otherwise and the concatenation raises TypeError.
    """
    #print(json_data)
    request = Request(os.getenv("SOLR_URL") + "/statistics/update/json?commit=true")
    request.add_header('Content-type', 'application/json')
    # urlopen requires a bytes body on Python 3, hence the encode.
    urlopen(request, json_data.encode('utf-8'))

def main(argv=None):
usage = 'rat_aggregator.py logfile1 logfile2 ... logfileN'
@@ -110,13 +103,13 @@ def main(argv=None):
with open(repo_file_url,'rb')as repoFile:
data = ''
for line in repoFile:
data+=line
data+=line.decode('utf-8')
rep = eval(data)

index_solr(json.dumps([rep]))

if len(argv) == 0:
print usage
print(usage)
sys.exit()

totalNotes = 0
@@ -193,7 +186,7 @@ def main(argv=None):

with open(filename, 'rb') as f:
for line in f:
if '*****************************************************' in line:
if b'*****************************************************' in line:
l = 0
h = 0
if cur_section == 'licenses':
@@ -204,9 +197,9 @@ def main(argv=None):
cur_file = ''
cur_header = ''
cur_section = ''
if line.startswith(' Files with Apache') and not parsedLicenses:
if line.startswith(b' Files with Apache') and not parsedLicenses:
cur_section = 'licenses'
if line.startswith(' Printing headers for ') and not parsedHeaders:
if line.startswith(b' Printing headers for ') and not parsedHeaders:
cur_section = 'headers'
if cur_section == 'licenses':
l += 1
@@ -218,12 +211,12 @@ def main(argv=None):
rat_license[li[0]] = li[1]
#print(li)
if cur_section == 'headers':
if '=====================================================' in line or '== File:' in line:
if b'=====================================================' in line or b'== File:' in line:
h += 1
if h == 2:
cur_file = line.split("/")[-1].strip()
cur_file = line.split(b"/")[-1].strip()
if h == 3:
cur_header += line
cur_header += line.decode('utf-8')
if h == 4:
rat_header[cur_file] = cur_header.split("\n", 1)[1]
cur_file = ''
@@ -248,8 +241,7 @@ def main(argv=None):
for doc in docs:
fdata = {}
fdata['id'] = os.path.join(doc['filelocation'][0], doc['filename'][0])
m = md5.new()
m.update(fdata['id'])
m = hashlib.md5(fdata['id'].encode('utf-8'))
hashId = m.hexdigest()
fileId = hashId+"-"+doc['filename'][0]

@@ -275,7 +267,7 @@ def main(argv=None):

# Copying data to Output Directory
print ("Notes,Binaries,Archives,Standards,Apache,Generated,Unknown")
print str(totalNotes)+","+str(totalBinaries)+","+str(totalArchives)+","+str(totalStandards)+","+str(totalApache)+" ,"+str(totalGenerated)+","+str(totalUnknown)
print(str(totalNotes)+","+str(totalBinaries)+","+str(totalArchives)+","+str(totalStandards)+","+str(totalApache)+" ,"+str(totalGenerated)+","+str(totalUnknown))

#print("\nData copied to Solr and Output Directory: OK\n")

@@ -9,7 +9,7 @@
<cmd>echo "Creating working dirs"</cmd>
<cmd>mkdir [JobInputDir] ; mkdir [JobOutputDir]; mkdir [JobLogDir]</cmd>
<cmd>echo "Running RAT aggregator"</cmd>
<cmd>[RatAggregatorScript] `python -c "print ' '.join('[InputFiles]'.split(','))"` > [JobOutputDir]/rat_aggregate_stats_[DateMilis].csv</cmd>
<cmd>[RatAggregatorScript] `python -c "print(' '.join('[InputFiles]'.split(',')))"` > [JobOutputDir]/rat_aggregate_stats_[DateMilis].csv</cmd>
</exe>

<!-- Files to ingest -->

0 comments on commit d0aae2f

Please sign in to comment.