In [1]:
import numpy as np
import pandas as pd
import re
import time
from datasketch import MinHash, MinHashLSHForest
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import warnings
import model_report as mr
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
file_info = "C:\\Users\\blgai\\OneDrive\\Documents\\School\\SMU\\Courses\\Fall 2021\\Capstone A\Data\\cleaned_chunked_v2.csv"
df = pd.read_csv(file_info, sep='\t')

In [3]:
#balance classes
g = df.groupby('category')
df = pd.DataFrame(g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True)))

In [4]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,url,code,category,language,title,summary,key_words,content,combined,nouns,compounds,comp_nouns,flat_comp_nouns,clean_comp_nouns
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
bu,0,*://ATS-SERVER.DE,200,bu,en,"ADAMCO INC - Houston, Texas",Adamco Technology Services is providing Consul...,"HIGH STRENGTH, ALUMINUM alloys, Aeronautics, A...",Home US site Contact Welcome to ADAMCO Technol...,"ADAMCO INC - Houston, Texas Adamco Technology ...","[('ADAMCO', 0, 6, 'PROPN'), ('INC', 7, 10, 'PR...","[('ADAMCO INC -', 0, 12, 'COMPOUND'), ('Texas ...","{'modulus', 'Rights Reserved', 'ADAMCO Technol...","{'modulus', 'Rights Reserved', 'ADAMCO Technol...",modulus rights reserved adamco technology serv...
bu,1,*://feanalytics.com,200,bu,en,FE Analytics | Online Fund Research Tool,An award winning online financial planning too...,"fe analytics, fund research, financial plannin...",FE Analytics Login Home 10 Years Features Abou...,FE Analytics | Online Fund Research Tool An aw...,"[('FE', 0, 2, 'PROPN'), ('Analytics', 3, 12, '...","[('FE Analytics |', 0, 14, 'COMPOUND'), ('Onli...","{'Years', 'FE', 'Testimonials Testimonials Cas...","{'Years', 'FE', 'Testimonials Testimonials Cas...",years fe testimonial testimonials case studies...
bu,2,*://SEARECOVERY.COM,200,bu,en,Sea Recovery Global - Welcome,Sea Recovery is a global provider of marine an...,"marine watermakers, reverse osmosis systems, w...",Welcome to Sea Recovery Global | The World's L...,Sea Recovery Global - Welcome Sea Recovery is ...,"[('Sea', 0, 3, 'PROPN'), ('Recovery', 4, 12, '...","[('Recovery Global -', 0, 17, 'COMPOUND'), ('G...","{'Ideal', 'CAPACITY', 'boat', 'Tons', 'World',...","{'Ideal', 'CAPACITY', 'boat', 'Tons', 'World',...",ideal capacity boat tons world watermaker mega...
bu,3,*://WWW.DTIDC.IN,200,bu,en,Delhi Transport Infrastructure Development Cor...,Delhi Transport Infrastructure Development Cor...,"Dtidc, DTIDC",Our Services Interstate Bus Terminals Bus Que ...,Delhi Transport Infrastructure Development Cor...,"[('Delhi', 0, 5, 'PROPN'), ('Transport', 6, 15...",[('Delhi Transport Infrastructure Development ...,"{'Team', 'Maharana', 'stands', 'e', 'Executive...","{'Team', 'Maharana', 'stands', 'e', 'Executive...",team maharana stand e executive flow kashmiri ...
bu,4,*://CUSTOMSEALANDRUBBER.COM,200,bu,en,Welcome to Custom Seal and Rubber Products! | ...,Custom Seal and Rubber Products specializes in...,"poly, seal, rubber, polyurethane, custom, prod...",Navigation Home Molding Polyurethane Silicone ...,Welcome to Custom Seal and Rubber Products! | ...,"[('Custom', 11, 17, 'PROPN'), ('Seal', 18, 22,...","[('Custom Seal', 11, 22, 'COMPOUND'), ('Rubber...","{'Flexibility', 'precision', 'die cut', 'Syste...","{'Flexibility', 'precision', 'die cut', 'Syste...",flexibility precision die cut systems register...


In [5]:
df.groupby(df.category).size()

category
bu    1595
dr    1595
ed    1595
mk    1595
os    1595
sp    1595
sx    1595
dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
#get 10% holdout set
train_corpus, holdout_corpus, train_label, holdout_label = train_test_split(df['clean_comp_nouns'], 
                                                                           df['category'],
                                                                           test_size=.10,random_state=1234)

#get 80% train and 20% test sets
train_corpus, test_corpus, train_label, test_label = train_test_split(train_corpus,
                                                                      train_label,
                                                                      test_size=.2225,random_state=1234) #.9 * .2225 = .20

train_corpus.shape, test_corpus.shape, holdout_corpus.shape

((7812,), (2236,), (1117,))

In [7]:
df_train = pd.DataFrame(zip(train_corpus,train_label),columns=["clean_comp_nouns","label"])
df_test = pd.DataFrame(zip(test_corpus,test_label),columns=["clean_comp_nouns","label"])
df_holdout = pd.DataFrame(zip(holdout_corpus,holdout_label),columns=["clean_comp_nouns","label"])

In [8]:
df_train.head()

Unnamed: 0,clean_comp_nouns,label
0,resource world social industry network target ...,bu
1,baby vertical mount bed vegetables flowers org...,os
2,address faq check email address cancel post pr...,mk
3,rohre germany sales beteiligten von produktion...,bu
4,industry robert f kennedy tar sand robert home...,bu


In [9]:
#Number of Permutations
permutations = 256


In [10]:
def preprocess(text):
    text = re.sub(r'[^\w\s]','',text)
    tokens = text.lower()
    tokens = tokens.split()
    return tokens

In [11]:
def get_forest(data,perms):
    start_time = time.time()
    minhash = []
    for text in data['clean_comp_nouns']:
        tokens = preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
    
    forest.index()
    
    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

In [12]:
def get_cosine(list1,list2):
    from collections import Counter

    # count word occurrences
    a_vals = Counter(list1)
    b_vals = Counter(list2)

    # convert to word-vectors
    words  = list(a_vals.keys() | b_vals.keys())
    a_vect = [a_vals.get(word, 0) for word in words]        
    b_vect = [b_vals.get(word, 0) for word in words]        

    # find cosine
    len_a  = sum(av*av for av in a_vect) ** 0.5             
    len_b  = sum(bv*bv for bv in b_vect) ** 0.5             
    dot    = sum(av*bv for av,bv in zip(a_vect, b_vect))    
    cosine = dot / (len_a * len_b)                          
    
    return cosine

In [13]:
def get_similars(test_item, database, perms, num_results, forest):
        
    tokens = preprocess(test_item)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))
    
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None #if query is empty, return none
    
    #label = database.iloc[idx_array]['label']
    #text = database.iloc[idx_array]['clean_comp_nouns']
    df_results = pd.DataFrame(database.iloc[idx_array][['clean_comp_nouns','label']])
    
    return df_results

In [14]:
def predict(test_item, database, perms, num_results, forest):
    df_pred = get_similars(test_item,df_train,permutations,num_similars,forest)
    df_pred['cos_dist'] = df_pred.apply(lambda x: get_cosine(preprocess(x['clean_comp_nouns']),text),axis=1)
    
    return df_pred.sort_values('cos_dist',ascending=False)
    

In [15]:
def predict_one(test_item, database, perms, num_results, forest):
    df_pred_one = predict(test_item, database, perms, num_results, forest)
    #get most similar item based on cosine distance
    cat = None
    try:
        cat = df_pred_one.iloc[1, 1]
    except:
        cat = 'ed'
    
    return cat
    

In [49]:
def predict_mod(test_item, database, perms, num_results, forest):
    label = None
    try:
        df_pred_mod = get_similars(test_item,df_train,permutations,num_similars,forest)
        label = df_pred_mod['label'].value_counts()[:1].index.tolist()[0]
    except:
        return None
    return label

In [16]:
#build lsh forest using training data
forest = get_forest(df_train,permutations)

It took 62.27829074859619 seconds to build forest.


In [58]:
#Number of similars to return
num_similars = 50
test_item = 5
text = df_test['clean_comp_nouns'][test_item]
#query the forest
df_predict = predict(text,df_train,permutations,num_similars,forest)
print('actual label: ',df_test['label'][test_item])
print('mod similar: ',predict_mod(text,df_train,permutations,num_similars,forest))
print('closest similar: ',predict_one(text,df_train,permutations,num_similars,forest))
print('All similars: ',predict(text,df_train,permutations,num_similars,forest))
text

actual label:  ed
mod similar:  dr
closest similar:  dr
All similars:                                         clean_comp_nouns label  cos_dist
2386  smartpac smartpacglandpackings nd stem rings w...    bu  0.109114
342   un prochain eveneman faq montreal dans votre c...    dr  0.039704
176   space function motility citrus aurantium extra...    dr  0.033715
1106  cupidus seedling hills palmtree sphagnum livis...    mk  0.028030
449   v messages pls samuel compilers libraries math...    bu  0.026426
2591  smooth bpx dotmatrix feeders memory networking...    os  0.025162
4033  orchids thorns order aroids alocasia heliampho...    os  0.024658
6265  address strain cultivator sensi seeds pablo ma...    dr  0.021162
4483  address strain cultivator sensi seeds pablo ma...    dr  0.021023
4706  street purple lady cab mdma procaine lab test ...    dr  0.020988
2075  street purple lady cab mdma procaine lab test ...    dr  0.020988
941   bealeii airy shrub invincibelle spirit pp pink...    os  0.

'resource schuylkill center facebook twitter instagram rights reserved native history bird watcher team building programs wildlife fund exhibits water art program scee butterfly house exelon gifts haas trustees staff partnerships resources seedlings view detour birding departments nature preschool teacher membership toad paper peco trust geology jdb pa parent child fun green field exhibit us reptile envirothon rights initiatives resource conservation service problem museum service wellness game party art peco energy scout fees simpson paper company challenge trail panels season education amphitheater penn squirrel philadelphia envirothon overview competition topics past winners water department health camp haas company summer camp conservation association gardener center naturalist birds appreciation delaware kind birthday party bureau mclean rohm park commission gardener gallery workshop land past roxborough review pa department nature lover kindergarten teachers registration preserva

In [50]:
df_results = pd.DataFrame(list(zip(df_test['clean_comp_nouns'],df_test['label'])),columns=['terms','category'])
df_results['lsh_predict'] = df_results.apply(lambda x: predict_mod(x['terms'],df_train,permutations,num_similars,forest),axis=1)
df_results['match'] = np.where(df_results['category']==df_results['lsh_predict'],1,0)

In [51]:
df_results[df_results['lsh_predict'].isna()]

Unnamed: 0,terms,category,lsh_predict,match
347,mabuya angularis phelsumania tail lowland abbo...,ed,,0
777,sexfilme sexfilme amateursex amateurporno amat...,sx,,0


In [52]:
rpt = mr.generate_report(df_results)
rpt

{'totalSamples': 2236,
 'overallAccuracy': 0.49105545617173524,
 'byCategory': {'sx': {'totalSamples': 332, 'accuracy': 0.8162650602409639},
  'dr': {'totalSamples': 331, 'accuracy': 0.6827794561933535},
  'sp': {'totalSamples': 318, 'accuracy': 0.44339622641509435},
  'ed': {'totalSamples': 330, 'accuracy': 0.43333333333333335},
  'os': {'totalSamples': 314, 'accuracy': 0.40764331210191085},
  'bu': {'totalSamples': 311, 'accuracy': 0.3440514469453376},
  'mk': {'totalSamples': 300, 'accuracy': 0.2733333333333333}}}

In [None]:
from sklearn.metrics import confusion_matrix
cnf_matrix = confusion_matrix(df_results['category'],df_results['lsh_predict'],labels=['os','ed','dr','sp','mk','sx','bu'])
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
mr.plot_confusion_matrix(cnf_matrix, classes=['os','ed','dr','sp','mk','sx','bu'],
                      title='Confusion matrix, LSH Forest')

In [36]:
#Number of similars to return
num_similars = 15
test_item = 9
text = df_test['clean_comp_nouns'][test_item]
df_my_test_mod = predict(text,df_train,permutations,num_similars,forest)
print('actual label: ',df_test['label'][test_item])
df_my_test_mod['label'].value_counts()[:1].index.tolist()[0]

actual label:  ed


'os'