Skip to content
This repository has been archived by the owner on May 12, 2021. It is now read-only.

Commit

Permalink
This is the code for breaking dratstats
Browse files: browse the repository at this point in the history
  • Loading branch information
ahmedifhaam committed Aug 13, 2018
1 parent d8df871 commit a24e05b
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 174 deletions.
177 changes: 4 additions & 173 deletions distribution/src/main/resources/bin/dratstats.py
Expand Up @@ -268,7 +268,7 @@ def run(repos_list, output_dir):
print("\nOODT Started: OK\n")

print('Adding repository: '+str(rep)+' to Solr')
index_solr(json.dumps([rep]))
# index_solr(json.dumps([rep]))


print("\nRunning DRAT on " + rep["repo"] + " ...\n")
Expand All @@ -295,178 +295,9 @@ def run(repos_list, output_dir):
wait_for_job("urn:drat:MimePartitioner")
wait_for_job("urn:drat:RatCodeAudit")
stats['map_end'] = current_datetime()

if retval:
time.sleep(5)
stats['reduce_start'] = current_datetime()

# Extract data from RatAggregate File
totalNotes = 0
totalBinaries = 0
totalArchives = 0
totalStandards = 0
totalApache = 0
totalGenerated = 0
totalUnknown = 0

rat_dir = os.getenv("DRAT_HOME") + "/data/archive/rat"

# Iterate over all RAT log files
for root, dirs, files in os.walk(rat_dir):
for filename in files:
if filename.endswith(".log"):
(notes, binaries, archives,standards,apachelicensed,generated,unknown) = parseFile(os.path.join(root, filename))
totalNotes = totalNotes + notes
totalBinaries = totalBinaries + binaries
totalArchives = totalArchives + archives
totalStandards = totalStandards + standards
totalApache = totalApache + apachelicensed
totalGenerated = totalGenerated + generated
totalUnknown = totalUnknown + unknown

stats["license_Notes"] = totalNotes
stats["license_Binaries"] = totalBinaries
stats["license_Archives"] = totalArchives
stats["license_Standards"] = totalStandards
stats["license_Apache"] = totalApache
stats["license_Generated"] = totalGenerated
stats["license_Unknown"] = totalUnknown

stats['reduce_end'] = current_datetime()
print "\nDRAT Scan Completed: OK\n"

time.sleep(5)

if retval:
# Copy Data with datetime variables above, extract output from RatAggregate file, extract data from Solr Core
printnow ("\nCopying data to Solr and Output Directory...\n")

# Extract data from Solr
neg_mimetype = ["image", "application", "text", "video", "audio", "message", "multipart"]
connection = urllib2.urlopen(os.getenv("SOLR_URL") + "/drat/select?q=*%3A*&rows=0&facet=true&facet.field=mimetype&wt=python&indent=true")

response = eval(connection.read())
mime_count = response["facet_counts"]["facet_fields"]["mimetype"]

for i in range(0, len(mime_count), 2):
if mime_count[i].split("/")[0] not in neg_mimetype:
stats["mime_" + mime_count[i]] = mime_count[i + 1]


# Count the number of files
stats["files"] = count_num_files(rep["repo"], ".git")

# Write data into Solr
stats["type"] = 'software'
stats_data = []
stats_data.append(stats)
json_data = json.dumps(stats_data)
index_solr(json_data)

# Parse RAT logs
rat_logs_dir = os.getenv("DRAT_HOME") + "/data/archive/rat/*/*.log"
rat_license = {}
rat_header = {}
for filename in glob.glob(rat_logs_dir):
#print('=' * 20)
l = 0
h = 0
cur_file = ''
cur_header = ''
cur_section = ''
parsedHeaders = False
parsedLicenses = False

with open(filename, 'rb') as f:
printnow('Parsing rat log: ['+filename+']')
for line in f:
if '*****************************************************' in line:
l = 0
h = 0
if cur_section == 'licenses':
parsedLicenses = True
if cur_section == 'headers':
parsedHeaders = True

cur_file = ''
cur_header = ''
cur_section = ''
if line.startswith(' Files with Apache') and not parsedLicenses:
cur_section = 'licenses'
if line.startswith(' Printing headers for ') and not parsedHeaders:
cur_section = 'headers'
if cur_section == 'licenses':
l += 1
if l > 4:
line = line.strip()
if line:
print("File: %s with License Line: %s" % (filename, line))
li = parse_license(line)
rat_license[li[0]] = li[1]
print(li)
if cur_section == 'headers':
if '=====================================================' in line or '== File:' in line:
h += 1
if h == 2:
cur_file = line.split("/")[-1].strip()
if h == 3:
cur_header += line
if h == 4:
rat_header[cur_file] = cur_header.split("\n", 1)[1]
cur_file = ''
cur_header = ''
h = 1
if h == 3:
rat_header[cur_file] = cur_header.split("\n", 1)[1]
parsedHeaders = True
parsedLicenses = True

# Index RAT logs into Solr
connection = urllib2.urlopen(os.getenv("SOLR_URL") +
"/drat/select?q=*%3A*&fl=filename%2Cfilelocation%2Cmimetype&wt=python&rows="
+ str(stats["files"]) +"&indent=true")
response = eval(connection.read())
docs = response['response']['docs']
file_data = []
batch = 100
dc = 0

for doc in docs:
fdata = {}
fdata['id'] = os.path.join(doc['filelocation'][0], doc['filename'][0])
m = md5.new()
m.update(fdata['id'])
hashId = m.hexdigest()
fileId = hashId+"-"+doc['filename'][0]

if fileId not in rat_license:
print "File: "+str(fdata['id'])+": ID: ["+fileId+"] not present in parsed licenses => Likely file copying issue. Skipping."
continue #handle issue with DRAT #93

fdata["type"] = 'file'
fdata['parent'] = rep["repo"]
fdata['mimetype'] = doc['mimetype'][0]
fdata['license'] = rat_license[fileId]
if fileId in rat_header:
fdata['header'] = rat_header[fileId]
file_data.append(fdata)
dc += 1
if dc % batch == 0:
json_data = json.dumps(file_data)
index_solr(json_data)
file_data = []
if dc % batch != 0:
json_data = json.dumps(file_data)
index_solr(json_data)

# Copying data to Output Directory
repos_out = output_dir + "/" + normalize_path(rep["repo"])
shutil.copytree(os.getenv("DRAT_HOME") + "/data", repos_out)
print("\nData copied to Solr and Output Directory: OK\n")

else:
print ("\nDRAT Scan Completed: Resulted in Error\n")

print ("\nwaiting for Rat Aggregator...\n")
wait_for_job("urn:drat:RatAggregator")


time.sleep(5)
print ("\nStopping OODT...\n")
Expand Down
3 changes: 3 additions & 0 deletions nohup.out
@@ -0,0 +1,3 @@
Started dynamic workflow with id '6453cca6-9f30-11e8-b99d-f5018c8e9233'

Navigate to http://localhost:8080/opsui/ to view the OODT browser and http://localhost:8080/solr to view the Solr catalog.
Expand Up @@ -156,7 +156,7 @@ the License.
return this.stat.crawledfiles/this.stat.numOfFiles *100;
},
indexingprogress(){
return this.stat.indexedfiles/this.stat.numberOfFiles * 100;
return this.stat.indexedfiles/this.stat.numOfFiles * 100;
}
}
}
Expand Down

0 comments on commit a24e05b

Please sign in to comment.