## Part - c

In [1]:
import psycopg2
import pandas as pd
import re

In [2]:
# Smoke test: query the Wikipedia search API once with a fixed phrase and
# print the title of the first hit.
import requests

import json

response = requests.get(
    'https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=radiation%20hardness&utf8=&format=json',
    timeout=10,  # requests applies no timeout by default; a stalled connection would hang the kernel
)
# Fail here on an HTTP error instead of later while indexing an error payload.
response.raise_for_status()

# response.json() already returns parsed Python objects, so the former
# json.dumps()/json.loads() round trip was redundant.
json_str = response.json()
y = json_str  # same parsed payload, kept under the name the original cell exposed

# Title of the first search hit.
print(y['query']['search'][0]['title'])
  

Radiation hardening


In [3]:
def get_json_snippet(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Args:
        url: Complete request URL, including the query string.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout unless one is passed, so an unresponsive
            server would otherwise block the notebook indefinitely.

    Returns:
        The response body parsed by ``response.json()``.

    Raises:
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx status.
        requests.exceptions.Timeout: no answer arrived within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors explicitly instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()

In [4]:
#!pip install rake_nltk

In [5]:
# NLTK data needed by Rake must be present before the extractor is built:
# - punkt: tokenizer models (important to download before using Rake)
# - stopwords: corpus behind Rake's default stopword list, read when Rake()
#   is constructed, so it has to be available before that cell runs.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/amolkale/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
from rake_nltk import Metric, Rake

# Shared keyword extractor: phrase length bounded by min_length=2 and
# max_length=4, ranked by the degree-to-frequency ratio, with
# include_repeated_phrases switched off.
r = Rake(
    min_length=2,
    max_length=4,
    ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO,
    include_repeated_phrases=False,
)

In [7]:
# Fetch the NLTK stopwords corpus; the call reports the package as
# already up-to-date when it is present locally.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amolkale/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
def getPSQLConn():
    """Open and return a new psycopg2 connection to the PostgreSQL database.

    Connection settings are read from the standard libpq environment
    variables (PGDATABASE, PGHOST, PGUSER, PGPASSWORD, PGPORT) so that
    credentials do not have to live in the notebook. When a variable is not
    set, the original local-development value is used, so existing setups
    keep working unchanged.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller is
        responsible for closing it.
    """
    import os  # local import keeps this cell independent of the import cell

    return psycopg2.connect(
        database=os.environ.get("PGDATABASE", "dse203"),
        host=os.environ.get("PGHOST", "localhost"),
        user=os.environ.get("PGUSER", "postgres"),
        # NOTE(review): fallback kept only for backward compatibility; prefer
        # setting PGPASSWORD and removing this default before sharing.
        password=os.environ.get("PGPASSWORD", "admin"),
        port=os.environ.get("PGPORT", "5432"),
    )

In [9]:
# Open the database connection and a cursor for the query cell below.
# NOTE: the query cell closes `conn`, so this cell must be re-run before
# the query cell can be executed again.
conn = getPSQLConn()
cursor = conn.cursor()

In [10]:
#Choosing company name "Department of Defense - Army" for key phrases
companyName = "Department of Defense - Army"

# The inner query derives a `funder` label: "Agency" alone when "Branch" is
# NULL, otherwise "Agency - Branch". The outer query keeps the abstracts that
# mention the search phrase for the chosen funder.
# Both filter values are bound as query parameters rather than concatenated
# into the SQL text, so a name containing a quote cannot break or alter the
# statement. The LIKE pattern is passed as a parameter too, because a literal
# '%' inside the SQL text would clash with psycopg2's %s placeholders.
sql = '''
    SELECT "Abstract"
    FROM (
        SELECT CASE
                   WHEN "Branch" IS NULL THEN "Agency"
                   ELSE "Agency" || ' - ' || "Branch"
               END AS funder,
               "Abstract"
        FROM award_data
    ) AS f_a
    WHERE "Abstract" LIKE %s
      AND funder = %s
'''
try:
    cursor.execute(sql, ('%radiation hardness%', companyName))
    result = cursor.fetchall()
finally:
    # Release the cursor and connection even when the query fails.
    cursor.close()
    conn.close()

In [11]:
#Data pruning
# The patterns are raw strings: the previous non-raw literals contained
# backslash sequences that are not valid string escapes, which Python
# reports with a warning. Compiling them once also avoids re-parsing the
# pattern for every abstract.
_EMAIL_RE = re.compile(r'\S*@\S*\s?')       # e-mail-like tokens plus one trailing whitespace
_WHITESPACE_RE = re.compile(r'\s+')          # newlines / repeated whitespace
# Digits, straight single quote, opening curly quote, closing parenthesis,
# hyphen and en dash. Deleting them in one pass gives the same text as the
# former one-substitution-per-character sequence.
# NOTE(review): the closing curly quote and the opening parenthesis are left
# in place, and deleting hyphens fuses compounds (e.g. "highperformance" in
# the ranked output) - confirm this is intended.
_NOISE_RE = re.compile(r"[0-9'“)\-–]")


def _clean_abstract(text):
    """Return one abstract with e-mails, extra whitespace and noise characters removed."""
    text = _EMAIL_RE.sub('', str(text))
    text = _WHITESPACE_RE.sub(' ', text)
    return _NOISE_RE.sub('', text)


# Each fetched row is a one-element tuple holding the "Abstract" column.
data = [_clean_abstract(row[0]) for row in result]

In [12]:
# Extract key phrases from all cleaned abstract texts; the extractor `r`
# is queried for the ranked results in the next cell.
r.extract_keywords_from_sentences(data)


In [13]:
# Get the key phrases along with their ranked scores as (score, phrase)
# tuples; the displayed output shows the highest scores first.
score_key_phrase =  r.get_ranked_phrases_with_scores()

In [14]:
# Display the first 10 (score, phrase) pairs.
score_key_phrase[:10]

[(16.0, 'crosslinked liquid crystal polymer'),
 (16.0, 'available cornerstone research group'),
 (15.666666666666666, 'jamming level laser threat'),
 (15.5, 'nanoscale magnetic tunnel junctions'),
 (15.5, 'achieved using recent advances'),
 (14.75, 'highperformance rejection filter centered'),
 (14.666666666666666, 'vehicleprotection tasks demand ultrafast'),
 (14.166666666666666, 'single crystal zno substrates'),
 (14.0, 'microsecond time scale measurements'),
 (14.0, 'host different army waveforms')]

In [15]:
# Keep only the phrase text of entries whose score is strictly above 8.0.
key_phrases = [phrase for score, phrase in score_key_phrase if score > 8.0]

In [16]:
# Display the first 10 key phrases that passed the score threshold.
key_phrases[:10]

['crosslinked liquid crystal polymer',
 'available cornerstone research group',
 'jamming level laser threat',
 'nanoscale magnetic tunnel junctions',
 'achieved using recent advances',
 'highperformance rejection filter centered',
 'vehicleprotection tasks demand ultrafast',
 'single crystal zno substrates',
 'microsecond time scale measurements',
 'host different army waveforms']

In [17]:
#This routine creates a wikipedia search url from a key phrase.
def create_wikipedia_url(text):
    """Build the Wikipedia search-API URL for ``text``.

    The phrase is percent-encoded in full rather than only having its spaces
    replaced, so characters such as ``&``, ``#``, ``%`` or ``+`` cannot
    terminate or corrupt the ``srsearch`` query parameter. Phrases made of
    letters and spaces produce exactly the same URL as before.

    Args:
        text: The key phrase to search for.

    Returns:
        The request URL as a string.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    # safe='' also encodes '/', which quote() leaves untouched by default.
    encoded = quote(text, safe='')
    return 'https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={0}&utf8=&format=json'.format(encoded)

In [18]:
def _first_search_hit(payload):
    """Return ``(title, snippet)`` of the first search hit in a Wikipedia API payload.

    Returns ``(None, None)`` when the search produced no hits or when the
    payload does not have the expected ``query -> search`` structure.
    """
    try:
        top_hit = payload['query']['search'][0]
    except IndexError:
        # The search ran but matched nothing.
        return None, None
    except (KeyError, TypeError):
        # e.g. an API error object without a 'query' section.
        print("Something else went wrong: unexpected Wikipedia response format")
        return None, None
    return top_hit.get('title'), top_hit.get('snippet')


# Collect one record per key phrase and build the frame once at the end.
# Growing a DataFrame row by row is quadratic, and DataFrame.append no longer
# exists in pandas 2.x.
records = []
for txt in key_phrases:
    full_json = get_json_snippet(create_wikipedia_url(txt))
    # Title and snippet are always taken from the same response, so a phrase
    # without hits gets None for both rather than the previous phrase's values.
    title, snippet = _first_search_hit(full_json)
    records.append({
        'CompanyName': companyName,
        'Keyphrase': txt,
        'Wikipedia title': title,
        'Wikipedia snippet': snippet,
    })

# Explicit columns keep the column order stable and give an empty result the
# same schema.
output_df = pd.DataFrame(
    records,
    columns=['CompanyName', 'Keyphrase', 'Wikipedia title', 'Wikipedia snippet'],
)
    

In [19]:
# Display the final output: one row per key phrase with the matched
# Wikipedia title and snippet.
output_df

Unnamed: 0,CompanyName,Keyphrase,Wikipedia title,Wikipedia snippet
0,Department of Defense - Army,crosslinked liquid crystal polymer,Liquid-crystal polymer,"<span class=""searchmatch"">Liquid</span> <span ..."
1,Department of Defense - Army,available cornerstone research group,Berkeley Research Group,"Analysis <span class=""searchmatch"">Group</span..."
2,Department of Defense - Army,jamming level laser threat,USSS Electronic Countermeasures Suburban,"barrage <span class=""searchmatch"">jamming</spa..."
3,Department of Defense - Army,nanoscale magnetic tunnel junctions,Nanotechnology,therefore common to see the plural form &quot;...
4,Department of Defense - Army,achieved using recent advances,Recent human evolution,"twenty-first-century <span class=""searchmatch""..."
...,...,...,...,...
57,Department of Defense - Army,efficient light emitter,OLED,"An organic <span class=""searchmatch"">light</sp..."
58,Department of Defense - Army,current materials used,Magnet,several other types of magnetism. Ferromagneti...
59,Department of Defense - Army,lowpower microwave signals,IEEE Kiyo Tomiyasu Award,"contributions to nano-materials, devices, circ..."
60,Department of Defense - Army,process technology necessary,Open Platform Communications,"reflects the applications of OPC <span class=""..."
