Merge pull request #93 from karanjeets/drat-ontosoft
Fixed issues and added a Visual Interface
chrismattmann committed Jul 18, 2017
2 parents d20ac0b + ca77ed1 commit 1c995942028f29b0a95a73ce5f2f81d083d5db23
Showing 159 changed files with 94,562 additions and 46 deletions.
@@ -148,7 +148,8 @@ function check_port {
# Commented 'lsof' - Using 'nc' which helps to check port on Remote server as well
# lsof -i tcp:$1
#nc -z ${OODT_HOST} $1 &> /dev/null
netstat -nlp | grep $1 &> /dev/null
if [ $? == 0 ]; then
    echo 0
else
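
Note that `netstat -nlp` only inspects sockets on the local machine, while the now-commented `nc` line (per the comment above it) could also probe a remote ${OODT_HOST}. For reference, a minimal Python sketch of an equivalent reachability check — the helper name is hypothetical and not part of this commit:

import socket

# Hypothetical helper: returns True when something accepts connections
# on host:port, similar in spirit to check_port above.
def port_in_use(host, port):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex() returns 0 on a successful connection
        return s.connect_ex((host, port)) == 0
    finally:
        s.close()

print port_in_use('localhost', 8080)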
@@ -30,6 +30,7 @@
import json
import xmlrpclib
import getopt
import glob

# Check for environment variables
def check_env_var():
@@ -89,34 +90,34 @@ def printnow(string):

# Parsing RAT log files
def parseFile(filepath):
    try:
        f = open(filepath, 'r')
        lines = f.readlines()
        f.close()
    except IOError:
        # Error sentinel when the log file cannot be read
        return (-1, -1, -1, -1, -1, -1, -1)
    notes = 0
    binaries = 0
    archives = 0
    standards = 0
    apachelicensed = 0
    generated = 0
    unknown = 0

    for line in lines:
        if line.startswith('Notes:'):
            notes = notes + int(line.split(':')[1].strip())
        if line.startswith('Binaries:'):
            binaries = binaries + int(line.split(':')[1].strip())
        if line.startswith('Archives:'):
            archives = archives + int(line.split(':')[1].strip())
        if line.startswith('Standards:'):
            standards = standards + int(line.split(':')[1].strip())
        if line.startswith('Apache Licensed:'):
            apachelicensed = apachelicensed + int(line.split(':')[1].strip())
        if line.startswith('Generated:'):
            generated = generated + int(line.split(':')[1].strip())
        if line.find('Unknown Licenses') != -1:
            unknown = unknown + int(line.split(' ')[0].strip())
    return (notes, binaries, archives, standards, apachelicensed, generated, unknown)
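
For context, parseFile() reads the summary counters of an Apache RAT report. A usage sketch — the sample text below is invented to mimic that summary, not real RAT output:

# Illustrative RAT-style summary with made-up counts.
sample = ("Notes: 2\n"
          "Binaries: 1\n"
          "Archives: 0\n"
          "Standards: 61\n"
          "Apache Licensed: 45\n"
          "Generated: 0\n"
          "3 Unknown Licenses\n")
with open('/tmp/rat_sample.log', 'w') as out:
    out.write(sample)
print parseFile('/tmp/rat_sample.log')
# -> (2, 1, 0, 61, 45, 0, 3)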


# OODT Process (start, stop)
@@ -139,7 +140,9 @@ def drat_process(command, repository):
    retval = True
    try:
        retcode = 0
if command == "crawl" or command == "index":
if command == "crawl":
retcode = subprocess.call("${DRAT_HOME}/bin/drat" + " " + command + " --exclude \"\\.git\" " + repository, shell=True)
elif command == "index":
retcode = subprocess.call("${DRAT_HOME}/bin/drat" + " " + command + " " + repository, shell=True)
elif command == "map" or command == "reduce":
retcode = subprocess.call("nohup ${DRAT_HOME}/bin/drat" + " " + command + " &", shell=True)
@@ -194,11 +197,32 @@ def wait_for_job(job_name):
        time.sleep(2)


# Parse license from RAT
def parse_license(s):
    li_dict = {'N': 'Notes', 'B': 'Binaries', 'A': 'Archives', 'AL': 'Apache', '!?????': 'Unknown'}
    arr = s.split("/", 1)
    li = arr[0].strip()
    if li in li_dict:
        li = li_dict[li]
    return [arr[1].split("/")[-1].strip().replace("_|_", "/"), li]
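
For example, given a RAT license line whose staged filename encodes the original path with "_|_" (input string illustrative):

print parse_license(' AL    /tmp/job/input/src_|_main_|_App.java')
# -> ['src/main/App.java', 'Apache']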


# Index into Solr
def index_solr(json_data):
    printnow(json_data)
    request = urllib2.Request(os.getenv("SOLR_URL") + "/statistics/update/json?commit=true")
    request.add_header('Content-type', 'application/json')
    urllib2.urlopen(request, json_data)
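
index_solr() posts a JSON array of documents to the statistics core's JSON update handler with commit=true, so each batch is searchable immediately. A usage sketch with illustrative field values:

import json

docs = [{'id': '/repo/src/App.java', 'type': 'file', 'license': 'Apache'}]
index_solr(json.dumps(docs))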


# Run DRAT and collect statistics
def run(repos_list, output_dir):
    with open(repos_list) as repositories:
        for repository in repositories:
            repository = repository.strip()
            if repository.startswith('#'):
                print('\nSkipping Repository: ' + repository[1:])
                continue
            printnow("\nVerifying repository path...\n")
            if not os.path.exists(repository):
                printnow("\nPath " + repository + " is not valid. Skipping and moving on...\n")
@@ -292,16 +316,89 @@ def run(repos_list, output_dir):


            # Count the number of files
stats["files"] = count_num_files(repository, ".svn")
stats["files"] = count_num_files(repository, ".git")

            # Write data into Solr
            stats["type"] = 'software'
            stats_data = []
            stats_data.append(stats)
            json_data = json.dumps(stats_data)
            printnow(json_data)
            index_solr(json_data)

            # Parse RAT logs
            rat_logs_dir = os.getenv("DRAT_HOME") + "/data/archive/rat/*/*.log"
            rat_license = {}
            rat_header = {}
            for filename in glob.glob(rat_logs_dir):
                # print('=' * 20)
                l = 0
                h = 0
                cur_file = ''
                cur_header = ''
                cur_section = ''
                with open(filename, 'rb') as f:
                    for line in f:
                        if '*****************************************************' in line:
                            l = 0
                            h = 0
                            cur_file = ''
                            cur_header = ''
                            cur_section = ''
                        if line.startswith(' Files with Apache'):
                            cur_section = 'licenses'
                        if line.startswith(' Printing headers for '):
                            cur_section = 'headers'
                        if cur_section == 'licenses':
                            l += 1
                            if l > 4:
                                line = line.strip()
                                if line:
                                    # print("File: %s with License Line: %s" % (filename, line))
                                    li = parse_license(line)
                                    rat_license[li[0]] = li[1]
                                    # print(li)
                        if cur_section == 'headers':
                            if '=====================================================' in line or '== File:' in line:
                                h += 1
                            if h == 2:
                                cur_file = line.split("/")[-1].strip().replace("_|_", "/")
                            if h == 3:
                                cur_header += line
                            if h == 4:
                                rat_header[cur_file] = cur_header.split("\n", 1)[1]
                                cur_file = ''
                                cur_header = ''
                                h = 1
                if h == 3:
                    rat_header[cur_file] = cur_header.split("\n", 1)[1]
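
The loop above is a small state machine over each RAT log: a '*****' line resets state, ' Files with Apache' opens the license section (whose first four heading lines the l counter skips), ' Printing headers for ' opens the header section, and '=====' / '== File:' separators bracket each file's header, with the trailing if h == 3 flushing the last header in the file. Roughly, it expects sections shaped like this (paths and contents illustrative, not real RAT output):

 Files with Apache License headers will be marked AL
  ...
  AL    /job/input/src_|_main_|_App.java
  !????? /job/input/docs_|_notes.txt

 Printing headers for files without AL header...
 =====================================================
 == File: /job/input/docs_|_notes.txt
 =====================================================
 <first lines of the file>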

            # Index RAT logs into Solr
            connection = urllib2.urlopen(os.getenv("SOLR_URL") +
                "/drat/select?q=*%3A*&fl=filename%2Cfilelocation%2Cmimetype&wt=python&rows="
                + str(stats["files"]) + "&indent=true")
            response = eval(connection.read())
            docs = response['response']['docs']
            file_data = []
            batch = 100
            dc = 0
            for doc in docs:
                fdata = {}
                fdata['id'] = os.path.join(doc['filelocation'][0], doc['filename'][0])
                fdata["type"] = 'file'
                fdata['parent'] = repository
                fdata['mimetype'] = doc['mimetype'][0]
                fdata['license'] = rat_license[fdata['id']]
                if fdata['id'] in rat_header:
                    fdata['header'] = rat_header[fdata['id']]
                file_data.append(fdata)
                dc += 1
                if dc % batch == 0:
                    json_data = json.dumps(file_data)
                    index_solr(json_data)
                    file_data = []
            if dc % batch != 0:
                json_data = json.dumps(file_data)
                index_solr(json_data)

            # Copying data to Output Directory
            repos_out = output_dir + "/" + normalize_path(repository)
@@ -74,10 +74,23 @@ def executeRatJobs(url, num, type, workflowUrl, taskIds):
if "InputFiles" not in metadata:
metadata["InputFiles"] = []
metadata["InputFiles"].append(fullpath)
metadata["InputFiles"] = "_|_".join(str(item) for item in metadata["InputFiles"])
print "Metadata is "+str(metadata)
wm.workflowmgr.executeDynamicWorkflow([taskIds], metadata)


def get_mime_types(solrUrl):
    neg_mimetype = ["image", "application", "text", "video", "audio", "message", "multipart"]
    connection = urllib2.urlopen(solrUrl + "/select?q=*%3A*&rows=0&facet=true&facet.field=mimetype&wt=python&indent=true")
    response = eval(connection.read())
    mime_count = response["facet_counts"]["facet_fields"]["mimetype"]
    stats = {}
    for i in range(0, len(mime_count), 2):
        if mime_count[i].split("/")[0] not in neg_mimetype:
            stats[mime_count[i]] = mime_count[i + 1]
    return stats.keys()
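
get_mime_types() consumes Solr's flattened facet list, which alternates term and count. A sketch of the filtering with an illustrative payload:

# Solr returns facets as [term, count, term, count, ...].
neg = ["image", "application", "text", "video", "audio", "message", "multipart"]
mime_count = ['x-java-source', 120, 'image/png', 30, 'x-python', 12]
kept = [mime_count[i] for i in range(0, len(mime_count), 2)
        if mime_count[i].split("/")[0] not in neg]
print kept  # ['x-java-source', 'x-python']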


def main(argv):
    solrUrl = ''
    numFilesPerJob = 0
@@ -109,9 +122,11 @@ def main(argv):


print "Configured SOLR url: ["+solrUrl+"]"
mimeTypes = ["x-java-source", "x-c", "javascript", "xml", "html", "css", \
"x-json", "x-sh", "x-fortran", "csv" "tab-separated-values", "x-tex", \
"x-asm", "x-diff", "x-python", "x-matlab"]
    mimeTypes = get_mime_types(solrUrl)
    # mimeTypes = ["x-java-source", "x-c", "javascript", "xml", "html", "css",
    #              "x-json", "x-sh", "x-fortran", "csv", "tab-separated-values", "x-tex",
    #              "x-asm", "x-diff", "x-python", "x-matlab"]

    for type in mimeTypes:
        print "Executing RAT for MIME: [" + type + "]: num files per job: [" + str(numFilesPerJob) + "]"
        executeRatJobs(solrUrl, numFilesPerJob, type, workflowUrl, ratTaskId)
@@ -0,0 +1,14 @@
FILELIST=${1}
FSEP=${2}
JOB_INPUT_DIR=${3}
sList=($(echo $FILELIST | sed -e 's/'"$FSEP"'/\n/g' | while read line; do echo $line | sed 's/[\t ]/'"$FSEP"'/g'; done))
for (( i = 0; i < ${#sList[@]}; i++ )); do sList[i]=$(echo ${sList[i]} | sed 's/'"$FSEP"'/ /g'); done
for (( i = 0; i < ${#sList[@]}; i++ )); do
    echo ${sList[i]}
    #file=`printf '%q' "${sList[i]}"`
    file="${sList[i]}"
    newFile=`echo ${file} | sed 's/\//_|_/g'`
    echo "From: $file"
    echo "To: $newFile"
    rsync -av --backup --suffix=_`date +"%m%d%Y_%H%M"` "${file}" "${JOB_INPUT_DIR}/${newFile}"
done
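
The script stages every input file into a single flat job directory, so each "/" in a source path is rewritten to the token "_|_"; parse_license() and the header parser above reverse that mapping. A minimal round-trip sketch (function names are illustrative):

# Hypothetical helpers mirroring the sed rewrite above.
def encode_path(path, sep="_|_"):
    return path.replace("/", sep)

def decode_path(name, sep="_|_"):
    return name.replace(sep, "/")

print encode_path("src/main/App.java")       # src_|_main_|_App.java
print decode_path("src_|_main_|_App.java")   # src/main/App.java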
@@ -10,10 +10,9 @@
<cmd>echo "Creating working dirs"</cmd>
<cmd>mkdir [JobInputDir] ; mkdir [JobOutputDir]; mkdir [JobLogDir]</cmd>
<cmd>echo "Staging input to [JobInputDir]"</cmd>
<cmd>FILELIST="[InputFiles]"</cmd>
<cmd>FSEP="_|_"</cmd>
<cmd>bash [DRAT_HOME]/pge/bin/rat_audit/copy_files.sh "${FILELIST}" "${FSEP}" "[JobInputDir]"</cmd>
<cmd>echo "Running Apache RAT on [JobInputDir]"</cmd>
<cmd>rat [JobInputDir] > [JobOutputDir]/rat_[MimeType]_[DateMilis].log</cmd>
</exe>
@@ -496,6 +496,7 @@

<!-- core CAS product attributes -->
<field name="id" type="string" indexed="true" stored="true" required="true" />
<field name="type" type="string" indexed="true" stored="true" />
<field name="crawl_start" type="date" indexed="true" stored="true" />
<field name="crawl_end" type="date" indexed="true" stored="true" />
<field name="index_start" type="date" indexed="true" stored="true" />
@@ -507,8 +508,19 @@
<field name="files" type="long" indexed="true" stored="true" />
<dynamicField name="license_*" type="long" indexed="true" stored="true" />
<dynamicField name="mime_*" type="long" indexed="true" stored="true" />

<!-- <field name="text" type="text_general" indexed="true" stored="true" required="false" multiValued="true"/> -->

<field name="parent" type="string" indexed="true" stored="true" />
<field name="mimetype" type="string" indexed="true" stored="true" />
<field name="license" type="string" indexed="true" stored="true" />
<field name="header" type="text_general" indexed="true" stored="true" />

<field name="name" type="text_general" indexed="true" stored="true" />
<field name="description" type="text_general" indexed="true" stored="true" />
<field name="loc_url" type="string" indexed="true" stored="true" />
<field name="drat_id" type="string" indexed="true" stored="true" />
<field name="repo" type="string" indexed="true" stored="true" />

<field name="text" type="text_general" indexed="true" required="false" multiValued="true"/>

<!-- all other fields are indexed and stored as-is and can have multiple values -->
<!-- <dynamicField name="*" type="string" indexed="true" stored="true" omitNorms="true" multiValued="true" /> -->
@@ -528,6 +540,7 @@
or to add multiple fields to the same field for easier/faster searching. -->

<!-- catch-all text fields for full free-text query -->
<!-- <copyField source="*" dest="text" /> -->
<copyField source="name" dest="text" />
<copyField source="description" dest="text" />

</schema>
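
With the new parent, mimetype, license, and header fields in place, the file-level documents written by index_solr() can be faceted by license. A query sketch, assuming the statistics core and the SOLR_URL conventions used by the scripts above:

import os
import urllib2

# Count file documents per license value (sketch only).
url = (os.getenv("SOLR_URL") +
       "/statistics/select?q=type%3Afile&rows=0&facet=true&facet.field=license&wt=python")
response = eval(urllib2.urlopen(url).read())
print response["facet_counts"]["facet_fields"]["license"]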