Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
Browse files
Merge branch 'gsoc-dratstat-java' of https://github.com/ahmedifhaam/drat into gsoc18
  • Loading branch information
chrismattmann committed Aug 6, 2018
2 parents 2620d43 + ea03bb7 commit dc59ba0ad9d37413559c0ba0efaedfe2514f81b6
Showing 7 changed files with 285 additions and 23 deletions.
@@ -344,6 +344,7 @@ def run(repos_list, output_dir):
# Extract data from Solr
neg_mimetype = ["image", "application", "text", "video", "audio", "message", "multipart"]
connection = urllib2.urlopen(os.getenv("SOLR_URL") + "/drat/select?q=*%3A*&rows=0&facet=true&facet.field=mimetype&wt=python&indent=true")

response = eval(connection.read())
mime_count = response["facet_counts"]["facet_fields"]["mimetype"]

@@ -26,6 +26,36 @@
import sys
import os
import getopt
import subprocess
import time
import shutil
import datetime
import csv
import urllib2
import json
import xmlrpclib
import getopt
import glob
import md5
import requests


def parse_license(s):
    """Parse one Apache RAT license line into ``[file_name, license_name]``.

    The expected format is ``<CODE>/<path/to/file>`` where CODE is a RAT
    license marker (N, B, A, AL, or the unknown marker). Returns
    ``["/dev/null", "Unknown"]`` for blank lines or lines without a path.
    """
    li_dict = {'N': 'Notes', 'B': 'Binaries', 'A': 'Archives', 'AL': 'Apache', '!?????': 'Unknown'}
    if s and not s.isspace():
        arr = s.split("/", 1)
        li = arr[0].strip()
        # Map the RAT marker to a readable license name when recognized;
        # otherwise keep the raw marker text unchanged.
        if li in li_dict:
            li = li_dict[li]

        # str.split always yields at least one element, so the original
        # extra check `len(arr[1].split("/")) > 0` was always true; only
        # the presence of a '/' separator matters here.
        if len(arr) > 1:
            return [arr[1].split("/")[-1], li]
        # Line had no '/' separator: no file to attribute the license to.
        return ["/dev/null", li_dict['!?????']]
    # Blank or whitespace-only input line.
    return ["/dev/null", li_dict['!?????']]


def parseFile(filepath):
@@ -55,15 +85,39 @@ def parseFile(filepath):
if line.find('Unknown Licenses') != -1:
unknown = unknown + int(line.split(' ')[0].strip())
return (notes, binaries,archives,standards,apachelicensed,generated,unknown)

return (-1,-1,-1,-1,-1,-1,-1)

def count_num_files(path, exclude):
    """Count files under *path* whose full path does not contain *exclude*.

    Walks the tree rooted at *path*; a file is skipped when the substring
    *exclude* (e.g. ".git") appears anywhere in its joined path.
    """
    total = 0
    for dirpath, _subdirs, filenames in os.walk(path):
        total += sum(
            1 for name in filenames
            if exclude not in os.path.join(dirpath, name)
        )
    return total

def index_solr(json_data):
    """POST a JSON payload to the Solr statistics core with an immediate commit.

    *json_data* is an already-serialized JSON string; SOLR_URL must be set
    in the environment.
    """
    update_url = os.getenv("SOLR_URL") + "/statistics/update/json?commit=true"
    req = urllib2.Request(update_url)
    req.add_header('Content-type', 'application/json')
    urllib2.urlopen(req, json_data)

def main(argv=None):
usage = 'rat_aggregator.py logfile1 logfile2 ... logfileN'
#print("starting rat aggregator")

repo_file_url = os.getenv("DRAT_HOME") + "/data/repo"
with open(repo_file_url,'rb')as repoFile:
data = ''
for line in repoFile:
data+=line
rep = eval(data)

index_solr(json.dumps([rep]))

if len(argv) == 0:
print usage
sys.exit()
print usage
sys.exit()

totalNotes = 0
totalBinaries = 0
@@ -83,9 +137,151 @@ def main(argv=None):
totalGenerated = totalGenerated + generated
totalUnknown = totalUnknown + unknown

print "Notes,Binaries,Archives,Standards,Apache,Generated,Unknown"
print str(totalNotes)+","+str(totalBinaries)+","+str(totalArchives)+","+str(totalStandards)+","+str(totalApache)+","+str(totalGenerated)+","+str(totalUnknown)
#Additionally
stats = {}
stats["license_Notes"] = totalNotes
stats["license_Binaries"] = totalBinaries
stats["license_Archives"] = totalArchives
stats["license_Standards"] = totalStandards
stats["license_Apache"] = totalApache
stats["license_Generated"] = totalGenerated
stats["license_Unknown"] = totalUnknown



stats['id'] =rep["repo"]
retVal = True

if retVal:
# Copy Data with datetime variables above, extract output from RatAggregate file, extract data from Solr Core
#print("\nCopying data to Solr and Output Directory...\n")

# Extract data from Solr
neg_mimetype = ["image", "application", "text", "video", "audio", "message", "multipart"]
connection = requests.get(os.getenv("SOLR_URL") + "/drat/select?q=*%3A*&rows=0&facet=true&facet.field=mimetype&wt=python&indent=true")

response = eval(connection.text)
mime_count = response["facet_counts"]["facet_fields"]["mimetype"]

for i in range(0, len(mime_count), 2):
if mime_count[i].split("/")[0] not in neg_mimetype:
stats["mime_" + mime_count[i]] = mime_count[i + 1]


# Count the number of files
stats["files"] = count_num_files(rep["repo"], ".git")

# Write data into Solr
stats["type"] = 'software'
stats_data = []
stats_data.append(stats)
json_data = json.dumps(stats_data)
index_solr(json_data)

# Parse RAT logs
rat_logs_dir = os.getenv("DRAT_HOME") + "/data/archive/rat/*/*.log"
rat_license = {}
rat_header = {}
for filename in glob.glob(rat_logs_dir):
l = 0
h = 0
cur_file = ''
cur_header = ''
cur_section = ''
parsedHeaders = False
parsedLicenses = False

with open(filename, 'rb') as f:
for line in f:
if '*****************************************************' in line:
l = 0
h = 0
if cur_section == 'licenses':
parsedLicenses = True
if cur_section == 'headers':
parsedHeaders = True

cur_file = ''
cur_header = ''
cur_section = ''
if line.startswith(' Files with Apache') and not parsedLicenses:
cur_section = 'licenses'
if line.startswith(' Printing headers for ') and not parsedHeaders:
cur_section = 'headers'
if cur_section == 'licenses':
l += 1
if l > 4:
line = line.strip()
if line:
#print("File: %s with License Line: %s" % (filename, line))
li = parse_license(line)
rat_license[li[0]] = li[1]
#print(li)
if cur_section == 'headers':
if '=====================================================' in line or '== File:' in line:
h += 1
if h == 2:
cur_file = line.split("/")[-1].strip()
if h == 3:
cur_header += line
if h == 4:
rat_header[cur_file] = cur_header.split("\n", 1)[1]
cur_file = ''
cur_header = ''
h = 1
if h == 3:
rat_header[cur_file] = cur_header.split("\n", 1)[1]
parsedHeaders = True
parsedLicenses = True

# Index RAT logs into Solr
connection = requests.get(os.getenv("SOLR_URL") +
"/drat/select?q=*%3A*&fl=filename%2Cfilelocation%2Cmimetype&wt=python&rows="
+ str(stats["files"]) +"&indent=true")

response = eval(connection.text)
docs = response['response']['docs']
file_data = []
batch = 100
dc = 0

for doc in docs:
fdata = {}
fdata['id'] = os.path.join(doc['filelocation'][0], doc['filename'][0])
m = md5.new()
m.update(fdata['id'])
hashId = m.hexdigest()
fileId = hashId+"-"+doc['filename'][0]

if fileId not in rat_license:
#print "File: "+str(fdata['id'])+": ID: ["+fileId+"] not present in parsed licenses => Likely file copying issue. Skipping."
continue #handle issue with DRAT #93

fdata["type"] = 'file'
fdata['parent'] = rep["repo"]
fdata['mimetype'] = doc['mimetype'][0]
fdata['license'] = rat_license[fileId]
if fileId in rat_header:
fdata['header'] = rat_header[fileId]
file_data.append(fdata)
dc += 1
if dc % batch == 0:
json_data = json.dumps(file_data)
index_solr(json_data)
file_data = []
if dc % batch != 0:
json_data = json.dumps(file_data)
index_solr(json_data)

# Copying data to Output Directory
print ("Notes,Binaries,Archives,Standards,Apache,Generated,Unknown")
print str(totalNotes)+","+str(totalBinaries)+","+str(totalArchives)+","+str(totalStandards)+","+str(totalApache)+" ,"+str(totalGenerated)+","+str(totalUnknown)

#print("\nData copied to Solr and Output Directory: OK\n")


if __name__ == "__main__":
main(sys.argv[1:])



@@ -29,6 +29,7 @@ public class FileConstants {
// Well-known file locations inside a DRAT deployment; all are built by
// buildDratSubdirectoryPath, which presumably prefixes DRAT_HOME — TODO confirm.
public static final String WORKFLOW_PATH = buildDratSubdirectoryPath("/deploy/workflow/bin/wmgr-client");
public static final String DRAT_PATH = buildDratSubdirectoryPath("/deploy/bin/drat");
public static final String DRAT_TEMP_UNZIPPED_PATH = buildDratSubdirectoryPath("/deploy/data/staging");
// Destination of DratRestResource.dumpToFile; NOTE(review): rat_aggregator.py
// reads $DRAT_HOME/data/repo — verify the two paths resolve to the same file.
public static final String CURRENT_REPO_DETAILS_FILE = buildDratSubdirectoryPath("/deploy/data/repo");
public static final String DRAT_TEMP_LOG_OUTPUT = buildDratSubdirectoryPath("/deploy/data/drat_output.log");
public static final String SOLR_INDEXER_CONFIG_PATH = buildDratSubdirectoryPath("/deploy/filemgr/etc/indexer.properties");

@@ -19,5 +19,13 @@

public class DratRequestWrapper {
//needed for JSON Requests
public String dirPath;
public String id;
public String repo;
public String name;
public String loc_url;
public String description;
public String type="project";



}
@@ -17,12 +17,12 @@

package drat.proteus.rest;

import java.io.File;
import java.io.IOException;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.logging.Logger;

import com.google.gson.Gson;
import org.wicketstuff.rest.annotations.MethodMapping;
import org.wicketstuff.rest.annotations.parameters.RequestBody;
import org.wicketstuff.rest.contenthandling.json.webserialdeserial.GsonWebSerialDeserial;
@@ -48,19 +48,20 @@ public DratRestResource() {

@MethodMapping(value = "/go", httpMethod = HttpMethod.POST)
/**
 * Starts a full DRAT run over the repository named in the request body.
 * Persists the request first so downstream tooling can read the repo metadata.
 */
public void go(@RequestBody DratRequestWrapper body) throws Exception {
    dumpToFile(body);
    // Fix: the stale setIndexablePath(body.dirPath) call was removed — its
    // result was immediately overwritten by the body.repo call below.
    dratWrapper.setIndexablePath(body.repo);
    dratWrapper.go();
}

@MethodMapping(value = "/index", httpMethod = HttpMethod.POST)
/**
 * Runs only the indexing phase for the repository named in the request body.
 */
public void index(@RequestBody DratRequestWrapper body) throws Exception {
    // Fix: removed the stale setIndexablePath(body.dirPath) call; it was
    // immediately overwritten by the body.repo call below.
    dratWrapper.setIndexablePath(body.repo);
    dratWrapper.index();
}

@MethodMapping(value = "/crawl", httpMethod = HttpMethod.POST)
/**
 * Runs only the crawl phase for the repository named in the request body.
 */
public void crawl(@RequestBody DratRequestWrapper body) throws Exception {
    // Fix: removed the stale setIndexablePath(body.dirPath) call; it was
    // immediately overwritten by the body.repo call below.
    dratWrapper.setIndexablePath(body.repo);
    dratWrapper.crawl();
}

@@ -93,4 +94,9 @@ public String getProcessLog() {
return "Log is empty!";
}
}

/**
 * Serializes the request body to JSON via Gson and writes it to
 * CURRENT_REPO_DETAILS_FILE. Files.write with no OpenOptions creates or
 * truncates the file, so each request replaces the previous contents.
 *
 * @param body the REST request payload to persist
 * @throws IOException if the file cannot be written
 */
public void dumpToFile(DratRequestWrapper body) throws IOException {
File repo = new File(FileConstants.CURRENT_REPO_DETAILS_FILE);
Files.write(repo.toPath(),new Gson().toJson(body).getBytes());
}
}
@@ -30,7 +30,7 @@ the License.
<v-list class="pa-0">
<v-list-tile avatar>
<v-list-tile-avatar>
<img src="https://randomuser.me/api/portraits/men/85.jpg">
<img src="./logo.png">
</v-list-tile-avatar>

<v-list-tile-content>

0 comments on commit dc59ba0

Please sign in to comment.