### Question 1

1. Using the Patentholders.csv data set, find names that match the following strings

    (a) "United States Navy"

    (b) "IBM"

    (c) "Univ. of Calif."

In [1]:
import py_stringmatching as sm 
import pandas as pd
import numpy as np 
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import Counter
import string
import itertools
import math
from statistics import mean, median
from __future__ import division

In [2]:
# Tokenizers: whitespace-delimited words and character 3-grams (q=3)
ws_tok = sm.WhitespaceTokenizer()
qg3_tok = sm.QgramTokenizer(qval=3)

# Similarity measure objects: Jaccard, Levenshtein and Cosine
jac, lev, cos = sm.Jaccard(), sm.Levenshtein(), sm.Cosine()

In [3]:
# Load Patentholders.csv as a single headerless column and lower-case every name
data = pd.read_csv(
    'Patentholders.csv',
    header=None,
    names=["string_patterns"],
)
lowercased_patterns = data["string_patterns"].str.lower()
data["string_patterns"] = lowercased_patterns
data

Unnamed: 0,string_patterns
0,signify holding bv
1,thyssenkrupp airport systems sa
2,eurenco
3,ei du pont de nemours and co
4,triatex international ag
...,...
7615,japan mathematical institute inc
7616,us departament of commerce
7617,stevens institute of technology
7618,j w medical systems ltd


In [4]:
# Load the navy ground-truth names (single headerless column) and lower-case them
navy_df = pd.read_excel(
    'navy.xlsx',
    header=None,
    names=['strngs'],
)
lowercased_navy_names = navy_df['strngs'].str.lower()
navy_df['strngs'] = lowercased_navy_names
navy_df

Unnamed: 0,strngs
0,united states department of the navy
1,"u s navy navy, secretary of"
2,untied states of america represented by secret...
3,chief of naval research office of counsel govt...
4,"navy united states, as the, secretary of"
...,...
366,"navy united states, secretary as represented by"
367,secretary of united states navy
368,navy secretary of united state
369,secretary of navy as represented by united sta...


In [5]:
def sanitize_string(string_text):
    """Return ``string_text`` without punctuation, lower-cased and trimmed.

    Every character listed in ``string.punctuation`` is deleted outright (it is
    not replaced by a space), so "BOOZ-ALLEN" becomes "boozallen". Leading and
    trailing whitespace is stripped afterwards.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return string_text.translate(punctuation_remover).lower().strip()

In [6]:
def is_label_true(text, ground_truth):
    """Return 1 when the lower-cased ``text`` is a member of ``ground_truth``, else 0.

    Only ``text`` is lower-cased here; the entries of ``ground_truth`` are
    compared as given, so callers should pass them already lower-cased.
    """
    return int(text.lower() in ground_truth)

In [7]:
def similarity_score(strings_to_match,str2,tokenizer,similarity_measure):
    """Return the best similarity between ``str2`` and any of ``strings_to_match``.

    Each string has the characters in ``string.punctuation`` removed and is
    lower-cased before being tokenized.

    Parameters:
        strings_to_match: iterable of query strings (a name and its synonyms).
        str2: the candidate string to compare against every query string.
        tokenizer: object exposing ``tokenize(str)``.
        similarity_measure: object exposing ``get_sim_score(tokens1, tokens2)``.

    Returns:
        The maximum score returned by ``similarity_measure`` over all queries.

    Raises:
        ValueError: if ``strings_to_match`` yields no strings.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)

    # The candidate does not change between synonyms, so normalise and
    # tokenize it once instead of once per synonym.
    tokens2 = tokenizer.tokenize(str2.translate(punctuation_remover).lower())

    scores = [
        similarity_measure.get_sim_score(
            tokenizer.tokenize(str1.translate(punctuation_remover).lower()),
            tokens2,
        )
        for str1 in strings_to_match
    ]
    if not scores:
        raise ValueError("strings_to_match must contain at least one string")
    return max(scores)

In [8]:
def match_str_in_dataset(strings_to_match,df, tokenizer, similarity_measure, threshold, ground_truth):
    """Score every row of ``df`` against the query strings and return the F1-score.

    Side effect: ``df`` is modified in place. The columns ``similarity``,
    ``true_label`` and ``pred_label`` are added (or overwritten) on each call.

    Parameters:
        strings_to_match: list of query strings (a name and its synonyms).
        df: DataFrame with a ``string_patterns`` column of candidate names.
        tokenizer: ``'whitespace'`` or ``'qg3'`` (3-gram tokenizer).
        similarity_measure: ``'jaccard'``, ``'levenshtein'`` or ``'cosine'``.
        threshold: rows with ``similarity >= threshold`` are predicted as matches.
        ground_truth: collection of lower-cased names that are true matches.

    Returns:
        ``f1_score`` of the predicted labels against the ground-truth labels.

    Raises:
        ValueError: if ``tokenizer`` or ``similarity_measure`` is not one of the
            supported names.
    """
    # Factories are lazy so that both names are validated before anything is
    # instantiated; an unknown name fails here instead of surfacing later as an
    # unbound-variable error inside the apply call.
    measure_factories = {
        'jaccard': lambda: sm.Jaccard(),
        # NOTE(review): py_stringmatching's Levenshtein appears to score raw
        # strings, while similarity_score passes token lists - confirm before use.
        'levenshtein': lambda: sm.Levenshtein(),
        'cosine': lambda: sm.Cosine(),
    }
    tokenizer_factories = {
        'whitespace': lambda: sm.WhitespaceTokenizer(),
        'qg3': lambda: sm.QgramTokenizer(qval=3),
    }

    if similarity_measure not in measure_factories:
        raise ValueError(
            "Unknown similarity_measure {!r}; expected one of {}".format(
                similarity_measure, sorted(measure_factories)))
    if tokenizer not in tokenizer_factories:
        raise ValueError(
            "Unknown tokenizer {!r}; expected one of {}".format(
                tokenizer, sorted(tokenizer_factories)))

    sim_measure = measure_factories[similarity_measure]()
    tok = tokenizer_factories[tokenizer]()

    # Only the string_patterns column is needed, so apply over that column
    # rather than building a row object for every record.
    patterns = df['string_patterns']
    df['similarity'] = patterns.apply(
        lambda pattern: similarity_score(strings_to_match, pattern, tok, sim_measure))
    df['true_label'] = patterns.apply(
        lambda pattern: is_label_true(pattern, ground_truth))
    df['pred_label'] = np.where(df['similarity'] >= threshold, 1, 0)
    return f1_score(df['true_label'], df['pred_label'])

In [9]:
def string_match(string_to_match_synonyms, ground_truth, df=None):
    """Print F1-scores for every similarity-measure / tokenizer / threshold combination.

    For each (measure, tokenizer) pair the thresholds 0.1 .. 0.9 are evaluated
    with ``match_str_in_dataset`` and printed as a table, followed by the best
    F1-score of that pair. A final summary names the overall best combination
    (the first one encountered when several tie).

    Parameters:
        string_to_match_synonyms: list of query strings (a name and its synonyms).
        ground_truth: collection of lower-cased names that are true matches.
        df: DataFrame with a ``string_patterns`` column. Defaults to the
            notebook-level ``data`` frame. It is modified in place by
            ``match_str_in_dataset``.

    Returns:
        None; all results are printed.
    """
    if df is None:
        df = data

    # 'levenshtein' is deliberately left out.
    # NOTE(review): presumably because py_stringmatching's Levenshtein scores raw
    # strings while similarity_score passes token lists - confirm.
    sim_measures_list = ['jaccard','cosine']
    tokenizers_list = ['whitespace','qg3']

    # np.arange accumulates float error (e.g. 0.30000000000000004). Round once so
    # that the threshold applied is exactly the threshold reported; otherwise a
    # score of exactly 0.3 would be rejected at the printed threshold "0.3".
    thresholds = [round(float(value), 1) for value in np.arange(0.1, 1.0, 0.1)]

    header = "{:<15} {:<10}".format('threshold', 'f1-score')
    rule = '-' * len(header)

    # Maps a description of each combination to its F1-score, in evaluation order.
    max_f1_score_dict = {}

    print("\n")
    for smt, tt in itertools.product(sim_measures_list, tokenizers_list):
        print("SIMILARITY MEASURE:",smt)
        print("TOKENIZER :",tt)
        print(rule)
        print(header)
        print(rule)
        f1_score_list = []
        for threshold in thresholds:
            f1score = round(
                match_str_in_dataset(string_to_match_synonyms, df, tt, smt, threshold, ground_truth),
                2,
            )
            f1_score_list.append(f1score)
            print("{:<15} {:<10}".format(threshold, f1score))
            max_f1_score_dict["similarity measure="+smt+", tokenizer="+tt+" and threshold="+str(threshold)] = f1score
        print("F1-score:")
        print("max: ", max(f1_score_list))

        print(rule)
    print(rule*3)
    f1_max = max(max_f1_score_dict, key=max_f1_score_dict.get)
    print("For string:",string_to_match_synonyms)
    print("Maximum [[[f1-score value: {0}]]]  with dist/tokenizer combination : {1}".format(max_f1_score_dict[f1_max],f1_max))
    print(rule*3)

#### United States Navy

In [10]:
# Ground truth for "United States Navy": every name in the navy sheet, lower-cased
ground_truth = [name.lower() for name in navy_df['strngs'].tolist()]
string_match(["United States Navy"], ground_truth)



SIMILARITY MEASURE: jaccard
TOKENIZER : whitespace
--------------------------
threshold       f1-score  
--------------------------
0.1             0.87      
0.2             0.86      
0.3             0.8       
0.4             0.5       
0.5             0.3       
0.6             0.09      
0.7             0.0       
0.8             0.0       
0.9             0.0       
F1-score:
max:  0.87
--------------------------
SIMILARITY MEASURE: jaccard
TOKENIZER : qg3
--------------------------
threshold       f1-score  
--------------------------
0.1             0.8       
0.2             0.84      
0.3             0.5       
0.4             0.12      
0.5             0.03      
0.6             0.0       
0.7             0.0       
0.8             0.0       
0.9             0.0       
F1-score:
max:  0.84
--------------------------
SIMILARITY MEASURE: cosine
TOKENIZER : whitespace
--------------------------
threshold       f1-score  
--------------------------
0.1             0.87      
0

In [11]:
# Hand-picked ground truth for "IBM", lower-cased to match the lower-cased data
ibm_names = ["International Business Machines Corp", "IBM Canada Ltd"]
ground_truth = [name.lower() for name in ibm_names]
string_match(
    string_to_match_synonyms=["IBM","International Business Machines"],
    ground_truth=ground_truth,
)




SIMILARITY MEASURE: jaccard
TOKENIZER : whitespace
--------------------------
threshold       f1-score  
--------------------------
0.1             0.03      
0.2             0.06      
0.3             1.0       
0.4             0.67      
0.5             0.67      
0.6             0.67      
0.7             0.67      
0.8             0.0       
0.9             0.0       
F1-score:
max:  1.0
--------------------------
SIMILARITY MEASURE: jaccard
TOKENIZER : qg3
--------------------------
threshold       f1-score  
--------------------------
0.1             0.01      
0.2             0.02      
0.3             0.22      
0.4             0.67      
0.5             0.67      
0.6             0.67      
0.7             0.67      
0.8             0.0       
0.9             0.0       
F1-score:
max:  0.67
--------------------------
SIMILARITY MEASURE: cosine
TOKENIZER : whitespace
--------------------------
threshold       f1-score  
--------------------------
0.1             0.02      
0.

In [12]:
# Hand-picked ground truth for "Univ. of Calif.", lower-cased to match the lower-cased data
uc_names = [
    "University of California",
    "University of California Berkeley",
    "CALIFORNIA A CORP OF, University of, Regents of",
    "University of Southern California USC",
]
ground_truth = [name.lower() for name in uc_names]
string_match(
    string_to_match_synonyms=["Univ. of Calif.","University Of California"],
    ground_truth=ground_truth,
)




SIMILARITY MEASURE: jaccard
TOKENIZER : whitespace
--------------------------
threshold       f1-score  
--------------------------
0.1             0.01      
0.2             0.04      
0.3             0.08      
0.4             0.1       
0.5             0.17      
0.6             0.86      
0.7             0.67      
0.8             0.4       
0.9             0.4       
F1-score:
max:  0.86
--------------------------
SIMILARITY MEASURE: jaccard
TOKENIZER : qg3
--------------------------
threshold       f1-score  
--------------------------
0.1             0.02      
0.2             0.04      
0.3             0.12      
0.4             0.24      
0.5             0.86      
0.6             0.67      
0.7             0.4       
0.8             0.4       
0.9             0.4       
F1-score:
max:  0.86
--------------------------
SIMILARITY MEASURE: cosine
TOKENIZER : whitespace
--------------------------
threshold       f1-score  
--------------------------
0.1             0.01      
0

### Question 2


*2. Using organization names from Patentholders.csv that have the string "Inc" (i.e., incorporated) find similar values in the orgNames.csv data set. Your output should be a table where the first column is a name from Patentholders.csv and the second column is a list of matching values from orgNames.csv.*

In [13]:
#Jaccard threshold. Picked from previous experience; change it to get different results.
t = 0.6

#Flag for a quick test on a small sample. toy_run=False means the full data is used.
toy_run = False

#Tokenizer name: 'whitespace' or 'qg3'.
#Whitespace is used because it gave the best results; other tokenizers can be tried.
tokenizer = 'whitespace'

#Build the tokenizer object. Fail loudly on an unknown name so that `tok` is
#never silently left undefined (or stale from an earlier run of this cell).
if(tokenizer=='whitespace'):
    tok = sm.WhitespaceTokenizer()
elif(tokenizer=='qg3'):
    tok = sm.QgramTokenizer(qval=3)
else:
    raise ValueError("Unknown tokenizer {!r}; expected 'whitespace' or 'qg3'".format(tokenizer))


#Using jaccard similarity measure
jac = sm.Jaccard()

In [14]:
#Load the organisation names, drop missing values and sanitize each name
orgs_data = pd.read_csv('orgNames.csv', header=None, names=["org_names_og"])
orgs_data = orgs_data.dropna()
orgs_data["org_names"] = orgs_data["org_names_og"].apply(sanitize_string)

#Filter out purely numeric names before tokenizing, using the vectorised string
#method instead of a Python-level row-by-row apply. The explicit copy makes the
#column assignments below operate on an independent frame.
orgs_data = orgs_data[~orgs_data['org_names'].str.isnumeric()].copy()

#Tokenize once up front so the matching loop does not repeat the work
orgs_data['org_tokens'] = orgs_data['org_names'].apply(tok.tokenize)

#Token count, used later for the size filter
orgs_data['size'] = orgs_data['org_tokens'].apply(len)

display(orgs_data)

Unnamed: 0,org_names_og,org_names,org_tokens,size
0,00 jackson state,00 jackson state,"[00, jackson, state]",3
2,01 trump,01 trump,"[01, trump]",2
7,02 father of merced,02 father of merced,"[02, father, of, merced]",4
8,02 meet,02 meet,"[02, meet]",2
12,03 fired paso robles high,03 fired paso robles high,"[03, fired, paso, robles, high]",5
...,...,...,...,...
443534,zyvex labs llc,zyvex labs llc,"[zyvex, labs, llc]",3
443535,zzazz productions,zzazz productions,"[zzazz, productions]",2
443536,zzk records,zzk records,"[zzk, records]",2
443537,zz top,zz top,"[zz, top]",2


In [15]:
inc_data = pd.read_csv('Patentholders.csv', header=None, names=["string_patterns_og"])

if(toy_run):
    inc_data = inc_data[:100].copy()

#Sanitize the string (removes punctuation, lower-cases, strips). No second
#punctuation pass is needed afterwards because sanitize_string already does it.
inc_data["string_patterns"] = inc_data["string_patterns_og"].apply(sanitize_string)

#Keep names that contain " inc"
#NOTE(review): this is a substring test, so it also keeps names such as
#"... income ..." and skips names that start with "inc" - confirm this is intended.
inc_data = inc_data[inc_data['string_patterns'].str.contains(' inc')]

#Drop rows whose original and sanitized names are both duplicated; copy so the
#column assignments below do not write into a slice of the unfiltered frame.
inc_data = inc_data.drop_duplicates().copy()

#Build the tokenizer from the configured name. The name must be compared with
#'whitespace' (the value set in the configuration cell), not 'ws'; otherwise no
#branch runs and `tok` is silently whatever an earlier cell left behind.
if(tokenizer=='whitespace'):
    tok = sm.WhitespaceTokenizer()
elif(tokenizer=='qg3'):
    tok = sm.QgramTokenizer(qval=3)
else:
    raise ValueError("Unknown tokenizer {!r}; expected 'whitespace' or 'qg3'".format(tokenizer))

#Keeping the tokens ready so that the matching loop does not recompute them
inc_data['tokens'] = inc_data["string_patterns"].apply(tok.tokenize)

#Lower and upper bound on the token count of a candidate match: for token sets
#x and y, Jaccard(x, y) >= t requires t*|x| <= |y| <= |x|/t. Rounding the upper
#bound up keeps the filter conservative.
#NOTE(review): the bounds use len(tokens), which counts repeated tokens, while
#Jaccard presumably works on sets - names with repeated words may be over-pruned.
inc_data['lower_size'] = inc_data["tokens"].apply(lambda x: math.ceil(t*len(x)))
inc_data['upper_size'] = inc_data["tokens"].apply(lambda x: math.ceil(1/t*len(x)))

display(inc_data)

Unnamed: 0,string_patterns_og,string_patterns,tokens,lower_size,upper_size
9,MACOM Technology Solutions Holdings Inc,macom technology solutions holdings inc,"[macom, technology, solutions, holdings, inc]",3,9
11,CONFIDENT TECHNOLOGIES Inc,confident technologies inc,"[confident, technologies, inc]",2,5
13,Bitsysoft Technology Inc,bitsysoft technology inc,"[bitsysoft, technology, inc]",2,5
17,Specialty Manufacturing Inc,specialty manufacturing inc,"[specialty, manufacturing, inc]",2,5
18,Bell Helicopter Michigan Inc,bell helicopter michigan inc,"[bell, helicopter, michigan, inc]",3,7
19,Autodesk Inc,autodesk inc,"[autodesk, inc]",2,4
26,METRETEK Inc A FL CORP,metretek inc a fl corp,"[metretek, inc, a, fl, corp]",3,9
29,Alert Logic Inc,alert logic inc,"[alert, logic, inc]",2,5
33,BOOZ-ALLEN HAMILTON Inc,boozallen hamilton inc,"[boozallen, hamilton, inc]",2,5
36,AFX Tech Group International Inc,afx tech group international inc,"[afx, tech, group, international, inc]",3,9


In [16]:
#Collect one record per "Inc" patent holder and build the DataFrame once at the
#end. Growing a DataFrame row by row is quadratic, and DataFrame.append no
#longer exists in pandas 2.x.
match_records = []

inc_columns = zip(
    inc_data['string_patterns_og'],
    inc_data['tokens'],
    inc_data['lower_size'],
    inc_data['upper_size'],
)
for inc_name_og, inc_tokens, lower_size, upper_size in inc_columns:

    #Size filter: only organisations whose token count lies within
    #[lower_size, upper_size] (both inclusive) are scored
    candidates = orgs_data[orgs_data['size'].between(lower_size, upper_size)]

    #Jaccard score of this patent holder against every remaining candidate
    jac_scores = candidates['org_tokens'].apply(
        lambda org_tokens: jac.get_sim_score(inc_tokens, org_tokens))

    #Keep candidates scoring strictly above the threshold
    #NOTE(review): Question 1 uses `>=` for its threshold; confirm `>` is intended here.
    matching_names = candidates.loc[jac_scores > t, 'org_names_og'].tolist()

    match_records.append({'inc_strings_og': inc_name_og,
                          'matching_org_names': matching_names})

#Explicit column order: patent holder name first, list of matches second
df_to_save = pd.DataFrame(match_records, columns=['inc_strings_og', 'matching_org_names'])

df_to_save


Unnamed: 0,inc_strings_og,matching_org_names
0,MACOM Technology Solutions Holdings Inc,"[macom technology solutions holdings inc, maco..."
1,CONFIDENT TECHNOLOGIES Inc,"[confident technologies inc, technologies inc..."
2,Bitsysoft Technology Inc,"[ technology inc, technology, inc]"
3,Specialty Manufacturing Inc,"[ manufacturing inc, manufacturing, inc, spec..."
4,Bell Helicopter Michigan Inc,[bell helicopter michigan inc]
5,Autodesk Inc,"[autodesk inc, autodesk, inc]"
6,METRETEK Inc A FL CORP,[metretek inc a fl corp]
7,Alert Logic Inc,[alert logic inc]
8,BOOZ-ALLEN HAMILTON Inc,"[booz-allen and hamilton inc, booz-allen hamil..."
9,AFX Tech Group International Inc,[afx tech group international inc]


In [18]:
#Write the final table to disk, one row per patent holder:
# inc_string_from_patent_holder,[matching_org_name1,matching_org_name2,matching_org_name3.....]
#The tokenizer name is part of the file name so runs with different tokenizers do not overwrite each other.
output_file_name = 'inc_matching_' + tokenizer + '.csv'
df_to_save.to_csv(output_file_name, index=False)