In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn import svm, linear_model
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_curve, auc,f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict
import unicodedata
import random
random.seed(37)
import gensim
from gensim.models import Word2Vec
from annoy import AnnoyIndex

In [2]:
pd.set_option('display.max_colwidth', -1)

In [3]:
path = "/home/akash/projects/data_analysis/ing/data/"

In [4]:
companies = pd.read_csv(path+"G.csv", delimiter="|")
train = pd.read_csv(path+"STrain.csv", delimiter="|")

In [5]:
companies.head()

Unnamed: 0,company_id,name
0,634022,PRIMCOM SA
1,324497,The David Isaacs Fund
2,280848,Bramor Enterprises Limited
3,432662,NAVEXIM S.A.
4,524224,Magal Group SA


In [6]:
companies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450256 entries, 0 to 450255
Data columns (total 2 columns):
company_id    450256 non-null int64
name          450256 non-null object
dtypes: int64(1), object(1)
memory usage: 6.9+ MB


In [7]:
train.head()

Unnamed: 0,train_index,name,company_id
0,0,TRATTAMENTO Ltd RIFIUTI METROPOLITANI SPA SIGLABILE TRM SPA,177358
1,1,A IRL Fuund,568472
2,2,BMR-500 Kendall LLC 1 Mezz GmbH,195692
3,3,Solich GmbH KG,-1
4,4,Drzyzzga Funds Logi. sp. z oo,404178


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
train_index    100000 non-null int64
name           100000 non-null object
company_id     100000 non-null int64
dtypes: int64(2), object(1)
memory usage: 2.3+ MB


In [9]:
test = pd.read_csv(path+"STest.csv", delimiter=",")
test.head()

Unnamed: 0,test_index,name
0,0,THEking'S ROYAL HUSSARS OFFI. TRUST' TRUST
1,1,Southern Powe rcompany SICAV
2,2,BMO S&P/TSX Ladde. Share ETF Index
3,3,PaI
4,4,Clearview Two


In [10]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
test_index    100000 non-null int64
name          99999 non-null object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [11]:
print train.columns
print test.columns

Index([u'train_index', u'name', u'company_id'], dtype='object')
Index([u'test_index', u'name'], dtype='object')


In [12]:
print train.shape
print train["name"].nunique()

(100000, 3)
99365


In [13]:
print test.shape
print test["name"].nunique()

(100000, 2)
99323


In [14]:
print len(set(test["name"]) & set(train["name"]))

1075


In [15]:
companies = companies.reset_index(drop=True)
companies.head()

Unnamed: 0,company_id,name
0,634022,PRIMCOM SA
1,324497,The David Isaacs Fund
2,280848,Bramor Enterprises Limited
3,432662,NAVEXIM S.A.
4,524224,Magal Group SA


#### Sample Data

In [16]:
# orig_size = len(companies)
# sample_size = 200000 
# companies = companies.sample(sample_size)

In [17]:
# train_0 = train.loc[train["company_id"]==-1]
# print train_0.shape

# train_1 = train.loc[train["company_id"]>=0]
# print train_1.shape

In [18]:
# train = train_1.loc[train_1["company_id"].isin(companies["company_id"])]
# print train.shape

In [19]:
# train0_sample = train_0.sample(int(sample_size/float(orig_size)*len(train_0)))
# print len(train0_sample)

In [20]:
# train = pd.concat([train,train0_sample])

In [21]:
train.shape

(100000, 3)

In [22]:
train = train.reset_index(drop=True)
train.head()

Unnamed: 0,train_index,name,company_id
0,0,TRATTAMENTO Ltd RIFIUTI METROPOLITANI SPA SIGLABILE TRM SPA,177358
1,1,A IRL Fuund,568472
2,2,BMR-500 Kendall LLC 1 Mezz GmbH,195692
3,3,Solich GmbH KG,-1
4,4,Drzyzzga Funds Logi. sp. z oo,404178


#### Make map of company_ids to Annoy_index_ids

In [23]:
# annoy_ids are from 0 to len(company_ids) -1 
# annoy_ids are stored in list. List index are annoy_ids and the value is teh correspodnign company_id
# Company_ids are stored in a dict. The key is company id and the value is annoy_id
def build_company_annoy_maps(company_ids):
    annoy2company = []
    company2annoy = defaultdict(lambda:0)
    for c_id in company_ids:
        if c_id not in company2annoy:
            annoy2company.append(c_id)
            company2annoy[c_id] = len(annoy2company)-1
    return annoy2company, company2annoy  
        

In [24]:
annoy2company, company2annoy = build_company_annoy_maps(companies["company_id"].values)

In [25]:
print len(annoy2company)
print len(company2annoy)

450256
450256


In [26]:
import random
for i in range(5):
    annoy_id = random.randint(0,50000)
    assert annoy_id ==  company2annoy[annoy2company[annoy_id]]
    print annoy_id, annoy2company[annoy_id], company2annoy[annoy2company[annoy_id]]

34100 198710 34100
4580 484544 4580
30891 320028 30891
42096 89805 42096
41728 232717 41728


In [27]:
def remove_accents(df,**kw_args):
    old_col =  kw_args["old_col"]
    new_col =  kw_args["new_col"]
    def remove_accents_inner(input_str):
        nfkd_form = unicodedata.normalize('NFKD', unicode(input_str, 'utf8'))
        return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
    
    df[new_col] = df[old_col].apply(remove_accents_inner)
    return df

def make_accent_transformer(old_col, new_col):
    return FunctionTransformer(remove_accents, validate=False,
                                         kw_args={"old_col":old_col,"new_col":new_col})

In [28]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/akash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
import re
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]|@,;\-\\\+#~!$%^]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z ]')
STOPWORDS = set(stopwords.words('english'))
#-#+_!`~\\

In [30]:
s = "for from india and add+g-ddc.b.g,c\g-hh ak|ash#sin/gh dee$pt^a su%d"
s=re.sub(REPLACE_BY_SPACE_RE," ",s)
s= re.sub(BAD_SYMBOLS_RE,"",s)
s= ' '.join(word for word in s.split() if word not in STOPWORDS)
print s

india add g ddcbg c g hh ak ash sin gh dee pt su


In [31]:
def clean_name(df,**kw_args):
    old_col =  kw_args["old_col"]
    new_col =  kw_args["new_col"]
    
    def regex_clean(text):
        text = re.sub(REPLACE_BY_SPACE_RE, " ", text)
        text = re.sub(BAD_SYMBOLS_RE, "", text)
#         #add shorthand
#         longhand = ' '.join(word for word in text.split() if not any(c.isdigit() for c in word))
#         shorthand = "".join(word[0] for word in re.findall("\w+", longhand))
#         text = text+" "+shorthand
        return text
    
    df[new_col] = df[old_col].str.lower().str.strip()
    df[new_col] = df[old_col].apply(regex_clean)
    
    return df

def make_clean_name_transformer(old_col, new_col):
    return FunctionTransformer(clean_name, validate=False, kw_args={"old_col":old_col,"new_col":new_col})

In [32]:
name_pipeline = make_pipeline(make_accent_transformer("name", "clean_name"),
                                    make_clean_name_transformer("clean_name", "clean_name"))
companies = name_pipeline.transform(companies)

In [33]:
companies.sample(10)

Unnamed: 0,company_id,name,clean_name
72291,245672,Redwood Capital Bank,redwood capital bank
46731,547523,W.J. Thijn Stichting,wj thijn stichting
174399,283043,"Washington Gardens Plantation, LLC",washington gardens plantation llc
98868,379243,GENEDRIVE PLC,genedrive plc
301109,543762,PYME VALENCIA 1 FTA,pyme valencia 1 fta
93697,537995,LEXELL,lexell
205431,32903,TEHATTA HOLDINGS LIMITED,tehatta holdings limited
416788,409946,"Petersen Bros., Inc.",petersen bros inc
307423,446826,Aarslev Skovgård/Laust Bie,aarslev skovgard laust bie
346428,412894,DISMAG,dismag


In [34]:
print companies.shape
print companies["clean_name"].nunique()

(450256, 3)
448231


In [35]:
train.head()

Unnamed: 0,train_index,name,company_id
0,0,TRATTAMENTO Ltd RIFIUTI METROPOLITANI SPA SIGLABILE TRM SPA,177358
1,1,A IRL Fuund,568472
2,2,BMR-500 Kendall LLC 1 Mezz GmbH,195692
3,3,Solich GmbH KG,-1
4,4,Drzyzzga Funds Logi. sp. z oo,404178


In [36]:
print train.shape
train.sample(10)

(100000, 3)


Unnamed: 0,train_index,name,company_id
57053,57053,NNL,608558
50544,50544,Sarak Fastigheter AB,533371
72321,72321,Providence TMT Special Fundsituations L.P.,-1
75318,75318,Focus UAB,167145
87871,87871,"""uBASF POLSKA"" SPÓŁKA Z ODPO.",484642
10321,10321,CFORA METAL Global,567081
810,810,MetLifereinsurance Carolina of South Inc Company,461217
16725,16725,"UNIV., THE CMI GLOBAL NETWORK FUND - CMI GERMAN EQUITY SUB-FUND",533991
60269,60269,POWSZ. TOWA RZYSTWOinwestycyjne S.A.,442384
65646,65646,Cnnell Bros. CHKL International,320341


In [37]:
from collections import Counter
count_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1)
# X matrix where the row represents sentences and column is our one-hot vector for each token in our vocabulary
X = count_vectorizer.fit_transform(companies["clean_name"])

# Vocabulary
vocab = list(count_vectorizer.get_feature_names())

# Column-wise sum of the X matrix.
# It's some crazy numpy syntax that looks horribly unpythonic
# For details, see http://stackoverflow.com/questions/3337301/numpy-matrix-to-array
# and http://stackoverflow.com/questions/13567345/how-to-calculate-the-sum-of-all-columns-of-a-2d-numpy-array-efficiently
counts = X.sum(axis=0).A1

freq_distribution = Counter(dict(zip(vocab, counts)))

In [38]:
print (freq_distribution.most_common(100))

[(u'llc', 44288), (u'fund', 38551), (u'limited', 34803), (u'gmbh', 29272), (u'inc', 24952), (u'bv', 21019), (u'srl', 20646), (u'trust', 20440), (u'ltd', 16548), (u'sa', 15202), (u'co', 12984), (u'lp', 11681), (u'global', 11272), (u'de', 10292), (u'the', 10220), (u'company', 10159), (u'funds', 9906), (u'kg', 9865), (u'holding', 9770), (u'ab', 9459), (u'investment', 8335), (u'sl', 7666), (u'capital', 7485), (u'international', 7283), (u'equity', 6886), (u'di', 6700), (u'holdings', 6683), (u'of', 6680), (u'spa', 6663), (u'group', 6212), (u'as', 6092), (u'ii', 6061), (u'sicav', 5883), (u'management', 5723), (u'investments', 5456), (u'aps', 5192), (u'portfolio', 4959), (u'bond', 4854), (u'partners', 4848), (u'societa', 4605), (u'invest', 4526), (u'and', 4211), (u'oy', 4152), (u'sro', 4143), (u'bank', 3981), (u'plc', 3861), (u'services', 3851), (u'income', 3848), (u'master', 3799), (u'corporation', 3747), (u'nv', 3573), (u'beheer', 3568), (u'mbh', 3509), (u'spoka', 3506), (u'ag', 3479), (u'se

In [39]:
train = name_pipeline.transform(train)

In [40]:
train.sample(10)

Unnamed: 0,train_index,name,company_id,clean_name
75813,75813,ABERDEEN PRIVATE EQUITY MANAGERS LIMITED,614554,aberdeen private equity managers limited
87288,87288,INVERNIZZI LUCIANO & FvIGLI S.P.A.,600698,invernizzi luciano fvigli spa
9251,9251,Stahlgruber Ottog. AG,-1,stahlgruber ottog ag
22655,22655,GAoERIE ENRICO NAVARRA,-1,gaoerie enrico navarra
9301,9301,VCI,188349,vci
24876,24876,DAS AHUS S.R.L.,436081,das ahus srl
88339,88339,MI-FONDS F95,138767,mi fonds f95
96882,96882,Aretea Special Inc Marit imeenterprise,-1,aretea special inc marit imeenterprise
49862,49862,Cb Holding ApS Hanstholm,-1,cb holding aps hanstholm
98963,98963,"2020, L.C.",382712,2020 lc


In [41]:
import gensim
assert gensim.models.doc2vec.FAST_VERSION > -1

In [42]:
companies.head(10)

Unnamed: 0,company_id,name,clean_name
0,634022,PRIMCOM SA,primcom sa
1,324497,The David Isaacs Fund,the david isaacs fund
2,280848,Bramor Enterprises Limited,bramor enterprises limited
3,432662,NAVEXIM S.A.,navexim sa
4,524224,Magal Group SA,magal group sa
5,513585,Marly SPF S.A.,marly spf sa
6,354496,I.T APPARELS LIMITED,it apparels limited
7,381944,VX 30.141 ApS,vx 30141 aps
8,526057,Rydex ETF Trust - Guggenheim S&P 500 Equal Weight ETF,rydex etf trust guggenheim sp 500 equal weight etf
9,34381,Rydex Series Funds - Retailing Fund,rydex series funds retailing fund


In [43]:
b='student ddg dfs'
n=4
print [b[i:i+n] for i in range(len(b)-n+1)]

['stud', 'tude', 'uden', 'dent', 'ent ', 'nt d', 't dd', ' ddg', 'ddg ', 'dg d', 'g df', ' dfs']


#### SETTINGS

In [44]:
n_grams=[2,3]
dim =400
window =5
min_count=5
workers=4
num_trees_annoy=300

In [45]:
def make_corpus(names, n_grams= [3]):
    corpus = []
    for name in names:
        name_grams = []
        for n in n_grams:
            name_grams.extend([name[i:i+n] for i in range(len(name)-n+1)])
        corpus.append(name_grams)
    return corpus

In [46]:
def make_name_vectors(corpus):
    name_vectors = np.empty(shape=(len(corpus),dim))
    for idx, name in enumerate(corpus):
        name_vector = name_to_vec(name, model.wv)
        name_vectors[idx,:] = name_vector
    return name_vectors

In [47]:
def name_to_vec(name, embeddings, dim=dim):
    """
        name: a string
        embeddings: dict where the key is a word and a value is its' embedding
        dim: size of the representation

        result: vector representation for the question
    """
    
    result = np.zeros(shape=dim)
    count = 0
    for word in name:
        if word in embeddings:
            result = result + embeddings[word]
            count+=1
    
    if count>0:
        result = result/count
        
    return result

In [48]:
company_corpus = make_corpus(companies["clean_name"], n_grams)

In [49]:
len(company_corpus)

450256

In [50]:
companies.head()

Unnamed: 0,company_id,name,clean_name
0,634022,PRIMCOM SA,primcom sa
1,324497,The David Isaacs Fund,the david isaacs fund
2,280848,Bramor Enterprises Limited,bramor enterprises limited
3,432662,NAVEXIM S.A.,navexim sa
4,524224,Magal Group SA,magal group sa


In [51]:
company_corpus[0:5]

[[u'pr',
  u'ri',
  u'im',
  u'mc',
  u'co',
  u'om',
  u'm ',
  u' s',
  u'sa',
  u'pri',
  u'rim',
  u'imc',
  u'mco',
  u'com',
  u'om ',
  u'm s',
  u' sa'],
 [u'th',
  u'he',
  u'e ',
  u' d',
  u'da',
  u'av',
  u'vi',
  u'id',
  u'd ',
  u' i',
  u'is',
  u'sa',
  u'aa',
  u'ac',
  u'cs',
  u's ',
  u' f',
  u'fu',
  u'un',
  u'nd',
  u'the',
  u'he ',
  u'e d',
  u' da',
  u'dav',
  u'avi',
  u'vid',
  u'id ',
  u'd i',
  u' is',
  u'isa',
  u'saa',
  u'aac',
  u'acs',
  u'cs ',
  u's f',
  u' fu',
  u'fun',
  u'und'],
 [u'br',
  u'ra',
  u'am',
  u'mo',
  u'or',
  u'r ',
  u' e',
  u'en',
  u'nt',
  u'te',
  u'er',
  u'rp',
  u'pr',
  u'ri',
  u'is',
  u'se',
  u'es',
  u's ',
  u' l',
  u'li',
  u'im',
  u'mi',
  u'it',
  u'te',
  u'ed',
  u'bra',
  u'ram',
  u'amo',
  u'mor',
  u'or ',
  u'r e',
  u' en',
  u'ent',
  u'nte',
  u'ter',
  u'erp',
  u'rpr',
  u'pri',
  u'ris',
  u'ise',
  u'ses',
  u'es ',
  u's l',
  u' li',
  u'lim',
  u'imi',
  u'mit',
  u'ite',
  u'ted'],
 

In [52]:
print gensim.models.doc2vec.FAST_VERSION

1


In [53]:
model = Word2Vec(company_corpus, size=dim, window=window, min_count=min_count, workers=workers)
model.wv

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f8f0efd5950>

In [54]:
model.wv

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f8f0efd5950>

In [55]:
model.wv["llc"].size

300

In [56]:
company_vectors = make_name_vectors(company_corpus)

In [57]:
company_vectors[0:5]

array([[ 0.29744164, -0.31892795,  0.02504915, ..., -0.02284963,
        -0.33021586,  0.03254006],
       [ 0.00308102,  0.14935247,  0.31973143, ...,  0.22377759,
        -0.34552417,  0.21090331],
       [ 0.3201684 ,  0.43908448,  0.23285988, ..., -0.02601147,
        -0.35725095, -0.01262979],
       [-0.05307632,  0.14373445,  0.30242449, ..., -0.41301112,
        -0.2799158 ,  0.17310317],
       [ 0.54007183,  0.09744463,  0.20262157, ..., -0.08723755,
        -0.10712737, -0.54897497]])

In [58]:
index_size = dim
table = AnnoyIndex(index_size)

for i in range(company_vectors.shape[0]):
    if i%20000==0:
        print "indexed %s items"%i
    table.add_item(i,company_vectors[i])

indexed 0 items
indexed 20000 items
indexed 40000 items
indexed 60000 items
indexed 80000 items
indexed 100000 items
indexed 120000 items
indexed 140000 items
indexed 160000 items
indexed 180000 items
indexed 200000 items
indexed 220000 items
indexed 240000 items
indexed 260000 items
indexed 280000 items
indexed 300000 items
indexed 320000 items
indexed 340000 items
indexed 360000 items
indexed 380000 items
indexed 400000 items
indexed 420000 items
indexed 440000 items


In [59]:
table.build(num_trees_annoy)

True

In [60]:
table.save("annoy_w2vec_index.ann")

True

In [61]:
table = AnnoyIndex(index_size)
table.load("annoy_w2vec_index.ann")

True

#### Reteiveal

In [62]:
train = train.merge(companies, how="left", on="company_id", suffixes=('','_truth'))

In [63]:
train.head()

Unnamed: 0,train_index,name,company_id,clean_name,name_truth,clean_name_truth
0,0,TRATTAMENTO Ltd RIFIUTI METROPOLITANI SPA SIGLABILE TRM SPA,177358,trattamento ltd rifiuti metropolitani spa siglabile trm spa,TRATTAMENTO RIFIUTI METROPOLITANI S.P.A. SIGLABILE TRM S.P.A.,trattamento rifiuti metropolitani spa siglabile trm spa
1,1,A IRL Fuund,568472,a irl fuund,ITALIAN CREDIT SOCIETA' A RESPONSABILITA' LIMITATA,italian credit societa a responsabilita limitata
2,2,BMR-500 Kendall LLC 1 Mezz GmbH,195692,bmr 500 kendall llc 1 mezz gmbh,BRE-BMR-500 Kendall Mezz 1 LLC,bre bmr 500 kendall mezz 1 llc
3,3,Solich GmbH KG,-1,solich gmbh kg,,
4,4,Drzyzzga Funds Logi. sp. z oo,404178,drzyzzga funds logi sp z oo,Drzyzga Logistics Group sp. z o.o.,drzyzga logistics group sp z oo


In [64]:
train_corpus = make_corpus(train["clean_name"], n_grams)

In [65]:
train_corpus[0:5]

[[u'tr',
  u'ra',
  u'at',
  u'tt',
  u'ta',
  u'am',
  u'me',
  u'en',
  u'nt',
  u'to',
  u'o ',
  u' l',
  u'lt',
  u'td',
  u'd ',
  u' r',
  u'ri',
  u'if',
  u'fi',
  u'iu',
  u'ut',
  u'ti',
  u'i ',
  u' m',
  u'me',
  u'et',
  u'tr',
  u'ro',
  u'op',
  u'po',
  u'ol',
  u'li',
  u'it',
  u'ta',
  u'an',
  u'ni',
  u'i ',
  u' s',
  u'sp',
  u'pa',
  u'a ',
  u'  ',
  u' s',
  u'si',
  u'ig',
  u'gl',
  u'la',
  u'ab',
  u'bi',
  u'il',
  u'le',
  u'e ',
  u' t',
  u'tr',
  u'rm',
  u'm ',
  u' s',
  u'sp',
  u'pa',
  u'tra',
  u'rat',
  u'att',
  u'tta',
  u'tam',
  u'ame',
  u'men',
  u'ent',
  u'nto',
  u'to ',
  u'o l',
  u' lt',
  u'ltd',
  u'td ',
  u'd r',
  u' ri',
  u'rif',
  u'ifi',
  u'fiu',
  u'iut',
  u'uti',
  u'ti ',
  u'i m',
  u' me',
  u'met',
  u'etr',
  u'tro',
  u'rop',
  u'opo',
  u'pol',
  u'oli',
  u'lit',
  u'ita',
  u'tan',
  u'ani',
  u'ni ',
  u'i s',
  u' sp',
  u'spa',
  u'pa ',
  u'a  ',
  u'  s',
  u' si',
  u'sig',
  u'igl',
  u'gla',
  u'lab',

In [66]:
#train_vectors = tfidf_vectorizer.transform(train["clean_name"])
train_vectors = make_name_vectors(train_corpus)

In [67]:
train_vectors[0:5]

array([[ 0.31575099,  0.30591132, -0.14162671, ..., -0.34862647,
        -0.31480219, -0.05502205],
       [ 0.69992434, -0.04381567,  0.0968002 , ...,  0.25603873,
         0.02132178,  0.65427645],
       [-0.40363997,  0.10854598, -0.08780978, ...,  0.13526529,
        -0.21051361,  0.29527278],
       [-0.02022164,  0.4876231 ,  0.2092411 , ...,  0.58622823,
        -0.1294124 ,  0.46231079],
       [-0.05229993,  0.33571559, -0.02351667, ...,  0.38758845,
         0.16435868,  0.27660724]])

In [68]:
def get_nearest_neighbours(vectors, num_neighbours=1, search_nodes=-1, include_distances=True):
    neighbours = np.empty(shape=(len(vectors),num_neighbours), dtype=np.int32)
    distances = np.empty(shape=(len(vectors),num_neighbours))
#     print len(vectors)
#     print neighbours.shape
#     print distances.shape
    for idx,v in enumerate(vectors):
        annoy_ids, annoy_distances = table.get_nns_by_vector(v, n=num_neighbours, search_k=search_nodes, include_distances=include_distances)
        neighbours[idx,:] = annoy_ids
        distances[idx,:] = annoy_distances
    return neighbours, distances

In [69]:
neighbours, distances = get_nearest_neighbours(train_vectors)

In [70]:
neigbours =np.vectorize(lambda x: annoy2company[x])(neighbours)

In [71]:
rank_1 = neigbours[:,0]
rank_1

array([177358,  24825, 195692, ..., 377115, 192951, 386428])

In [72]:
train["rank1_id"] = rank_1

In [73]:
train= train.merge(companies[['company_id', 'clean_name']], left_on="rank1_id", 
                   right_on="company_id", suffixes=("","_rank1"))

In [74]:
train

Unnamed: 0,train_index,name,company_id,clean_name,name_truth,clean_name_truth,rank1_id,company_id_rank1,clean_name_rank1
0,0,TRATTAMENTO Ltd RIFIUTI METROPOLITANI SPA SIGLABILE TRM SPA,177358,trattamento ltd rifiuti metropolitani spa siglabile trm spa,TRATTAMENTO RIFIUTI METROPOLITANI S.P.A. SIGLABILE TRM S.P.A.,trattamento rifiuti metropolitani spa siglabile trm spa,177358,177358,trattamento rifiuti metropolitani spa siglabile trm spa
1,1,A IRL Fuund,568472,a irl fuund,ITALIAN CREDIT SOCIETA' A RESPONSABILITA' LIMITATA,italian credit societa a responsabilita limitata,24825,24825,fqa fund
2,2347,PS2A Fund,-1,ps2a fund,,,24825,24825,fqa fund
3,5361,IHL FUND.,-1,ihl fund,,,24825,24825,fqa fund
4,16823,BNL FUND,560940,bnl fund,LLOYDS BANK NOMINEES LIMITED,lloyds bank nominees limited,24825,24825,fqa fund
5,17491,YTL Fund,438121,ytl fund,"YBOR Tropical, LLC",ybor tropical llc,24825,24825,fqa fund
6,17525,X STL Fund,516763,x stl fund,Spire X Trading LLC,spire x trading llc,24825,24825,fqa fund
7,23518,a HaTB FUND,-1,a hatb fund,,,24825,24825,fqa fund
8,25932,SAL Fund,466656,sal fund,Svitzer Australia Pty Ltd,svitzer australia pty ltd,24825,24825,fqa fund
9,35389,hMGA FUND,632543,hmga fund,MGA,mga,24825,24825,fqa fund


In [75]:
print len(train)
print train.loc[train["company_id"]==train["rank1_id"]].shape
print train.loc[train["company_id"]!=train["rank1_id"]].shape

100000
(36403, 9)
(63597, 9)


In [76]:
train_sub = train.loc[train["company_id"]!=-1]
print np.sum(train_sub["company_id"] == train_sub["rank1_id"])/float(len(train_sub))

0.5219517091075935


##### Accuracy = 0.5219
* n_grams=[2,3] 
* dim =300
* window =5
* min_count=5
* workers=4
* num_trees_annoy=300


