# Company Name Matching
### Approach 
1. Preprocess to noramlize names: remove spaces, characters, lowercase, 
2. Build name embeddings using [StarSpace](https://github.com/facebookresearch/StarSpace)
3. Index Using [Annoy](https://github.com/spotify/annoy)
4. Evaluate Using accuracy on non -1 companies.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn import svm, linear_model
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_curve, auc,f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict
import unicodedata
import random
random.seed(37)
import gensim
from gensim.models import Word2Vec
from annoy import AnnoyIndex

In [2]:
pd.set_option('display.max_colwidth', -1)

In [3]:
data_path = "data/"
file_path = "files/"

In [4]:
companies = pd.read_csv(data_path+"G.csv", delimiter="|")
train = pd.read_csv(data_path+"STrain.csv", delimiter="|")

In [5]:
companies.head()

Unnamed: 0,company_id,name
0,634022,PRIMCOM SA
1,324497,The David Isaacs Fund
2,280848,Bramor Enterprises Limited
3,432662,NAVEXIM S.A.
4,524224,Magal Group SA


In [6]:
companies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450256 entries, 0 to 450255
Data columns (total 2 columns):
company_id    450256 non-null int64
name          450256 non-null object
dtypes: int64(1), object(1)
memory usage: 6.9+ MB


In [7]:
companies["company_id"].nunique()

450256

In [8]:
train.head()

Unnamed: 0,train_index,name,company_id
0,0,TRATTAMENTO Ltd RIFIUTI METROPOLITANI SPA SIGLABILE TRM SPA,177358
1,1,A IRL Fuund,568472
2,2,BMR-500 Kendall LLC 1 Mezz GmbH,195692
3,3,Solich GmbH KG,-1
4,4,Drzyzzga Funds Logi. sp. z oo,404178


In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
train_index    100000 non-null int64
name           100000 non-null object
company_id     100000 non-null int64
dtypes: int64(2), object(1)
memory usage: 2.3+ MB


In [10]:
print train.columns

Index([u'train_index', u'name', u'company_id'], dtype='object')


In [11]:
print train.shape
print train["name"].nunique()

(100000, 3)
99365


In [12]:
companies = companies.reset_index(drop=True)
companies.head()

Unnamed: 0,company_id,name
0,634022,PRIMCOM SA
1,324497,The David Isaacs Fund
2,280848,Bramor Enterprises Limited
3,432662,NAVEXIM S.A.
4,524224,Magal Group SA


In [13]:
train.shape

(100000, 3)

In [14]:
train = train.reset_index(drop=True)
train.head()

Unnamed: 0,train_index,name,company_id
0,0,TRATTAMENTO Ltd RIFIUTI METROPOLITANI SPA SIGLABILE TRM SPA,177358
1,1,A IRL Fuund,568472
2,2,BMR-500 Kendall LLC 1 Mezz GmbH,195692
3,3,Solich GmbH KG,-1
4,4,Drzyzzga Funds Logi. sp. z oo,404178


### Pre-process data

In [15]:
def remove_accents(df,**kw_args):
    old_col =  kw_args["old_col"]
    new_col =  kw_args["new_col"]
    
    def remove_accents_inner(input_str):
        try:
            nfkd_form = unicodedata.normalize('NFKD', unicode(input_str, 'utf8'))
        except UnicodeDecodeError:
#             print input_str
#             input_str = input_str.encode('utf-8')
#             nfkd_form = unicodedata.normalize('NFKD', unicode(input_str, 'utf8'))
            return None
        return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
    
    df[new_col] = df[old_col].apply(remove_accents_inner)
    return df

def make_accent_transformer(old_col, new_col):
    return FunctionTransformer(remove_accents, validate=False,
                                         kw_args={"old_col":old_col,"new_col":new_col})

In [16]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/akash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
import re
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]|@,;\-\\\+#~!$%^]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z ]')
STOPWORDS = set(stopwords.words('english'))
#-#+_!`~\\

In [18]:
s = "for from india and add+g-ddc.b.g,c\g-hh ak|ash#sin/gh dee$pt^a su%d"
s=re.sub(REPLACE_BY_SPACE_RE," ",s)
s= re.sub(BAD_SYMBOLS_RE,"",s)
s= ' '.join(word for word in s.split() if word not in STOPWORDS)
print s

india add g ddcbg c g hh ak ash sin gh dee pt su


In [19]:
def clean_name(df,**kw_args):
    old_col =  kw_args["old_col"]
    new_col =  kw_args["new_col"]
    
    def regex_clean(text):
        try:
            text = re.sub(REPLACE_BY_SPACE_RE, " ", text)
            text = re.sub(BAD_SYMBOLS_RE, "", text)
#         #add shorthand
#         longhand = ' '.join(word for word in text.split() if not any(c.isdigit() for c in word))
#         shorthand = "".join(word[0] for word in re.findall("\w+", longhand))
#         text = text+" "+shorthand
        except TypeError:
            print text
            
        return text
    
    def strip_extra_spaces(text):
        return " ".join(text.split())
    
    df[new_col] = df[old_col].str.lower().str.strip()
    df[new_col] = df[new_col].apply(regex_clean)
    df[new_col] = df[new_col].apply(strip_extra_spaces)
    
    return df

def make_clean_name_transformer(old_col, new_col):
    return FunctionTransformer(clean_name, validate=False, kw_args={"old_col":old_col,"new_col":new_col})

#### Make processing pipeline

In [20]:
name_pipeline = make_pipeline(make_accent_transformer("name", "clean_name"),
                                    make_clean_name_transformer("clean_name", "clean_name"))

#### Transform company names

In [21]:
companies = name_pipeline.transform(companies)
print companies.shape
print companies["clean_name"].nunique()

(450256, 3)
448050


#### Check top occurring words

In [22]:
from collections import Counter
count_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1)
# X matrix where the row represents sentences and column is our one-hot vector for each token in our vocabulary
X = count_vectorizer.fit_transform(companies["clean_name"])

# Vocabulary
vocab = list(count_vectorizer.get_feature_names())

# Column-wise sum of the X matrix.
# It's some crazy numpy syntax that looks horribly unpythonic
# For details, see http://stackoverflow.com/questions/3337301/numpy-matrix-to-array
# and http://stackoverflow.com/questions/13567345/how-to-calculate-the-sum-of-all-columns-of-a-2d-numpy-array-efficiently
counts = X.sum(axis=0).A1

freq_distribution = Counter(dict(zip(vocab, counts)))

In [23]:
print (freq_distribution.most_common(100))

[(u'llc', 44288), (u'fund', 38551), (u'limited', 34803), (u'gmbh', 29272), (u'inc', 24952), (u'bv', 21019), (u'srl', 20646), (u'trust', 20440), (u'ltd', 16548), (u'sa', 15202), (u'co', 12984), (u'lp', 11681), (u'global', 11272), (u'de', 10292), (u'the', 10220), (u'company', 10159), (u'funds', 9906), (u'kg', 9865), (u'holding', 9770), (u'ab', 9459), (u'investment', 8335), (u'sl', 7666), (u'capital', 7485), (u'international', 7283), (u'equity', 6886), (u'di', 6700), (u'holdings', 6683), (u'of', 6680), (u'spa', 6663), (u'group', 6212), (u'as', 6092), (u'ii', 6061), (u'sicav', 5883), (u'management', 5723), (u'investments', 5456), (u'aps', 5192), (u'portfolio', 4959), (u'bond', 4854), (u'partners', 4848), (u'societa', 4605), (u'invest', 4526), (u'and', 4211), (u'oy', 4152), (u'sro', 4143), (u'bank', 3981), (u'plc', 3861), (u'services', 3851), (u'income', 3848), (u'master', 3799), (u'corporation', 3747), (u'nv', 3573), (u'beheer', 3568), (u'mbh', 3509), (u'spoka', 3506), (u'ag', 3479), (u'se

#### Pre-process train

In [24]:
train = name_pipeline.transform(train)

In [25]:
train.head()

Unnamed: 0,train_index,name,company_id,clean_name
0,0,TRATTAMENTO Ltd RIFIUTI METROPOLITANI SPA SIGLABILE TRM SPA,177358,trattamento ltd rifiuti metropolitani spa siglabile trm spa
1,1,A IRL Fuund,568472,a irl fuund
2,2,BMR-500 Kendall LLC 1 Mezz GmbH,195692,bmr 500 kendall llc 1 mezz gmbh
3,3,Solich GmbH KG,-1,solich gmbh kg
4,4,Drzyzzga Funds Logi. sp. z oo,404178,drzyzzga funds logi sp z oo


In [26]:
# Settings for embeddings and indexing model
DIM = 100
NUM_TREES_ANNOY = 200
N_GRAMS =[3]

In [27]:
# function to make training name corpus using n grams
def make_corpus(names, n_grams):
    corpus = []
    for name in names:
        name_grams = []
        for n in n_grams:
            grams = [name[i:i+n] for i in range(len(name)-n+1)]
            name_grams.extend(grams)
        name_grams = [ gram.replace(" ","_") for gram in name_grams]
        corpus.append(" ".join(name_grams))
    return corpus

#### Make corpus of ngrams

In [28]:
company_corpus =  make_corpus(companies["clean_name"], N_GRAMS)

In [29]:
companies["n_grams"] = company_corpus
companies

Unnamed: 0,company_id,name,clean_name,n_grams
0,634022,PRIMCOM SA,primcom sa,pri rim imc mco com om_ m_s _sa
1,324497,The David Isaacs Fund,the david isaacs fund,the he_ e_d _da dav avi vid id_ d_i _is isa saa aac acs cs_ s_f _fu fun und
2,280848,Bramor Enterprises Limited,bramor enterprises limited,bra ram amo mor or_ r_e _en ent nte ter erp rpr pri ris ise ses es_ s_l _li lim imi mit ite ted
3,432662,NAVEXIM S.A.,navexim sa,nav ave vex exi xim im_ m_s _sa
4,524224,Magal Group SA,magal group sa,mag aga gal al_ l_g _gr gro rou oup up_ p_s _sa
5,513585,Marly SPF S.A.,marly spf sa,mar arl rly ly_ y_s _sp spf pf_ f_s _sa
6,354496,I.T APPARELS LIMITED,it apparels limited,it_ t_a _ap app ppa par are rel els ls_ s_l _li lim imi mit ite ted
7,381944,VX 30.141 ApS,vx 30141 aps,vx_ x_3 _30 301 014 141 41_ 1_a _ap aps
8,526057,Rydex ETF Trust - Guggenheim S&P 500 Equal Weight ETF,rydex etf trust guggenheim sp 500 equal weight etf,ryd yde dex ex_ x_e _et etf tf_ f_t _tr tru rus ust st_ t_g _gu gug ugg gge gen enh nhe hei eim im_ m_s _sp sp_ p_5 _50 500 00_ 0_e _eq equ qua ual al_ l_w _we wei eig igh ght ht_ t_e _et etf
9,34381,Rydex Series Funds - Retailing Fund,rydex series funds retailing fund,ryd yde dex ex_ x_s _se ser eri rie ies es_ s_f _fu fun und nds ds_ s_r _re ret eta tai ail ili lin ing ng_ g_f _fu fun und


In [30]:
companies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450256 entries, 0 to 450255
Data columns (total 4 columns):
company_id    450256 non-null int64
name          450256 non-null object
clean_name    450256 non-null object
n_grams       450256 non-null object
dtypes: int64(1), object(3)
memory usage: 13.7+ MB


In [31]:
train["n_grams"] = make_corpus(train["clean_name"], N_GRAMS)

In [32]:
train

Unnamed: 0,train_index,name,company_id,clean_name,n_grams
0,0,TRATTAMENTO Ltd RIFIUTI METROPOLITANI SPA SIGLABILE TRM SPA,177358,trattamento ltd rifiuti metropolitani spa siglabile trm spa,tra rat att tta tam ame men ent nto to_ o_l _lt ltd td_ d_r _ri rif ifi fiu iut uti ti_ i_m _me met etr tro rop opo pol oli lit ita tan ani ni_ i_s _sp spa pa_ a_s _si sig igl gla lab abi bil ile le_ e_t _tr trm rm_ m_s _sp spa
1,1,A IRL Fuund,568472,a irl fuund,a_i _ir irl rl_ l_f _fu fuu uun und
2,2,BMR-500 Kendall LLC 1 Mezz GmbH,195692,bmr 500 kendall llc 1 mezz gmbh,bmr mr_ r_5 _50 500 00_ 0_k _ke ken end nda dal all ll_ l_l _ll llc lc_ c_1 _1_ 1_m _me mez ezz zz_ z_g _gm gmb mbh
3,3,Solich GmbH KG,-1,solich gmbh kg,sol oli lic ich ch_ h_g _gm gmb mbh bh_ h_k _kg
4,4,Drzyzzga Funds Logi. sp. z oo,404178,drzyzzga funds logi sp z oo,drz rzy zyz yzz zzg zga ga_ a_f _fu fun und nds ds_ s_l _lo log ogi gi_ i_s _sp sp_ p_z _z_ z_o _oo
5,5,BERK ELEY LLP,438615,berk eley llp,ber erk rk_ k_e _el ele ley ey_ y_l _ll llp
6,6,SCHNEIDER ELECTRIC PROTECTION & CONTROLE,507569,schneider electric protection controle,sch chn hne nei eid ide der er_ r_e _el ele lec ect ctr tri ric ic_ c_p _pr pro rot ote tec ect cti tio ion on_ n_c _co con ont ntr tro rol ole
7,7,DL ISL Inc,-1,dl isl inc,dl_ l_i _is isl sl_ l_i _in inc
8,8,Arendicom GmbH,495046,arendicom gmbh,are ren end ndi dic ico com om_ m_g _gm gmb mbh
9,9,RBPA Leeuw,526042,rbpa leeuw,rbp bpa pa_ a_l _le lee eeu euw


### Prepare training data for learning embeddings.
*  Word2vec learns embeddings based on the cotext of words in corpus. Starspace can be trained to learn  embeddings (word, images etc.) based on specific task. Thus it can learn better embeddings. In our case the task is name similarity.
* Starspace training format needs pairs of similar setences. So we use names from the train set with a m,atching company id in companies set
* <b>Hack</b>: Training set has only a fraction of all the company names. For company names not present in train set we add make a pair using the true company name.

In [33]:
train_data = companies.merge(train[["clean_name", "n_grams","company_id"]], how="left", on="company_id", suffixes=('','_train'))

In [34]:
train_data["company_id"].value_counts()

32434     5
425226    5
165777    4
634977    4
407946    4
440786    4
88538     4
209368    4
177140    4
295600    4
638323    3
610823    3
207022    3
172728    3
269318    3
618417    3
198931    3
110194    3
204786    3
365550    3
306655    3
499321    3
603660    3
595973    3
530204    3
170350    3
328231    3
196758    3
224132    3
186928    3
         ..
80243     1
65908     1
55631     1
51533     1
250158    1
49484     1
252207    1
205104    1
209202    1
211251    1
196916    1
203063    1
223545    1
227643    1
213308    1
215357    1
217406    1
41280     1
43329     1
45378     1
47427     1
33092     1
35141     1
37190     1
39239     1
57672     1
59721     1
61770     1
63819     1
0         1
Name: company_id, Length: 450256, dtype: int64

In [35]:
train_data["clean_name_train"] = train_data["clean_name_train"].fillna(train_data["clean_name"])
train_data["n_grams_train"] = train_data["n_grams_train"].fillna(train_data["n_grams"])

In [36]:
train_data.head()

Unnamed: 0,company_id,name,clean_name,n_grams,clean_name_train,n_grams_train
0,634022,PRIMCOM SA,primcom sa,pri rim imc mco com om_ m_s _sa,primcom sa,pri rim imc mco com om_ m_s _sa
1,324497,The David Isaacs Fund,the david isaacs fund,the he_ e_d _da dav avi vid id_ d_i _is isa saa aac acs cs_ s_f _fu fun und,the david isaacs fund,the he_ e_d _da dav avi vid id_ d_i _is isa saa aac acs cs_ s_f _fu fun und
2,280848,Bramor Enterprises Limited,bramor enterprises limited,bra ram amo mor or_ r_e _en ent nte ter erp rpr pri ris ise ses es_ s_l _li lim imi mit ite ted,bramor inves tment enterprises limited,bra ram amo mor or_ r_i _in inv nve ves es_ s_t _tm tme men ent nt_ t_e _en ent nte ter erp rpr pri ris ise ses es_ s_l _li lim imi mit ite ted
3,432662,NAVEXIM S.A.,navexim sa,nav ave vex exi xim im_ m_s _sa,navexim sa,nav ave vex exi xim im_ m_s _sa
4,524224,Magal Group SA,magal group sa,mag aga gal al_ l_g _gr gro rou oup up_ p_s _sa,magal group sa,mag aga gal al_ l_g _gr gro rou oup up_ p_s _sa


In [37]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 455388 entries, 0 to 455387
Data columns (total 6 columns):
company_id          455388 non-null int64
name                455388 non-null object
clean_name          455388 non-null object
n_grams             455388 non-null object
clean_name_train    455388 non-null object
n_grams_train       455388 non-null object
dtypes: int64(1), object(5)
memory usage: 24.3+ MB


In [38]:
#train_data = train_data[["clean_name", "clean_name_train"]]
train_data = train_data[["n_grams", "n_grams_train"]]

In [39]:
train_data.to_csv(file_path+"star_train_ngrams.tsv",sep="\t",header=False)

### Learn Embeddings

In [40]:
#/home/akash/projects/Starspace/starspace train -trainFile star_train_ngrams.tsv -model star_space_ngram_embeddings -trainMode 3 -adagrad true -ngrams 1 -epoch 5 -dim 100 -similarity "cosine" -minCount 2 -verbose true -fileFormat labelDoc -negSearchLimit 10 -lr 0.05 -thread 4

#### Build star space embeddings for ngrams

In [41]:
star_space_dict = {}
with open(file_path+"/star_space_ngram_embeddings.tsv") as f:
    for line in f:
        line = line.split("\t")
        token = line[0]
        embedding = line[1:]
        embedding = np.asarray(embedding).astype(np.float32)     
        star_space_dict[token] = embedding      
starspace_embeddings = star_space_dict

In [42]:
print (starspace_embeddings['pri'])

[-0.119208    0.0991629   0.0938104   0.0660154   0.0747938   0.070145
  0.130213   -0.00070346  0.0130067  -0.0293467   0.0686699   0.0810226
 -0.012915   -0.0572339  -0.0317828   0.0658     -0.0950785   0.113955
  0.0173498   0.0279843  -0.0506483   0.125173   -0.0619245  -0.0726998
  0.0572543  -0.0402238  -0.0666396  -0.152028    0.0115009   0.121052
  0.0560702   0.0278294  -0.0976862  -0.026822   -0.00935354 -0.0404374
 -0.0716217   0.167094   -0.0235991   0.109166   -0.0032483   0.0235811
 -0.0627361  -0.0391092  -0.00491249 -0.102261   -0.0518779   0.0241284
 -0.032624   -0.125606    0.120422    0.101578   -0.0462269   0.0371669
 -0.0469588   0.103301   -0.0593593   0.0787304  -0.0110581   0.104398
 -0.0594812  -0.0300357  -0.119987    0.0919375   0.00581149  0.0338233
 -0.076168    0.0405324   0.0383356  -0.0534815  -0.038259    0.0862937
 -0.107805    0.0741736  -0.0283083  -0.0437005  -0.0513222  -0.00458159
  0.116911   -0.0208172   0.0741671   0.016341   -0.0341246  -0.077

In [43]:
def name_to_vec(name, embeddings):
    """
        name: a company name/sentence
        embeddings: dict where the key is a word and a value is its' embedding
        dim: size of the representation

        result: vector representation for the name/sentence
    """
    
    result = np.zeros(shape=DIM, dtype=float)
    count = 0
    #print name
    for word in name.split():
        #print word
        if word in embeddings:
            #print "word %s in embedding"%word
            result = result + embeddings[word]
            count+=1
    if count>0:
        result = result/count
        
    return result

In [44]:
def make_name_vectors(corpus, embeddings):
    name_vectors = np.empty(shape=(len(corpus),DIM))
    for idx, name in enumerate(corpus):
        name_vector = name_to_vec(name, embeddings)
        name_vectors[idx,:] = name_vector
    return name_vectors

In [45]:
len(starspace_embeddings)

22004

#### Make embedding vectors of company names by taking the mean of ngram embeddings

In [46]:
company_vectors = make_name_vectors(companies["n_grams"], starspace_embeddings)

In [47]:
company_vectors[0:1]

array([[ 0.01709783,  0.00773632, -0.0021093 ,  0.01139055, -0.02135037,
         0.0146049 ,  0.03187227,  0.01187479,  0.00035386, -0.02018384,
        -0.00339857,  0.03907295, -0.01720327, -0.02469894,  0.02020706,
         0.00131685, -0.06940149,  0.07446843, -0.00120021,  0.00548082,
        -0.01596069, -0.00300824, -0.01614329, -0.04761717,  0.04952533,
        -0.02928773,  0.01583691, -0.03834475, -0.00712968,  0.01294641,
        -0.02833813, -0.00174684,  0.0097028 , -0.00415216,  0.01987893,
        -0.01534769,  0.03831299,  0.06816379,  0.0051133 ,  0.06606996,
        -0.00863443, -0.00949614, -0.01702975, -0.02929292, -0.01829027,
        -0.06338784, -0.0129171 , -0.04590585,  0.02071874, -0.03460466,
         0.04835812,  0.04926606,  0.01057489,  0.04920899,  0.02093529,
        -0.02822798,  0.02639537,  0.02562038, -0.0001477 ,  0.0099464 ,
        -0.00158709, -0.04102737, -0.01340589, -0.01744403, -0.01889571,
         0.0188464 , -0.04703097,  0.01120369, -0.0

### Indexing using Annoy.
Approximate nearest neigbour using random projections. Indexes once built are static files and can be used across processes.

#### Make map of company_ids to annoy_index_ids

In [48]:
# annoy_ids are from 0 to len(company_ids) -1 
# annoy_ids are stored in list. List index are annoy_ids and the value is teh correspodnign company_id
# Company_ids are stored in a dict. The key is company id and the value is annoy_id
def build_company_annoy_maps(company_ids):
    annoy2company = []
    company2annoy = defaultdict(lambda:0)
    for c_id in company_ids:
        if c_id not in company2annoy:
            annoy2company.append(c_id)
            company2annoy[c_id] = len(annoy2company)-1
    return annoy2company, company2annoy  
        

In [49]:
annoy2company, company2annoy = build_company_annoy_maps(companies["company_id"].values)

In [50]:
print len(annoy2company)
print len(company2annoy)

450256
450256


In [51]:
import random
for i in range(5):
    annoy_id = random.randint(0,50000)
    assert annoy_id ==  company2annoy[annoy2company[annoy_id]]
    print annoy_id, annoy2company[annoy_id], company2annoy[annoy2company[annoy_id]]

34100 198710 34100
4580 484544 4580
30891 320028 30891
42096 89805 42096
41728 232717 41728


#### Build annoy Index
Uncoment below to build the index

In [52]:
# index_size = DIM
# table = AnnoyIndex(index_size)

# for i in range(company_vectors.shape[0]):
#     if i%20000==0:
#         print "indexed %s items"%i
#     table.add_item(i,company_vectors[i])


#table.build(NUM_TREES_ANNOY)
#table.save(file_path+"annoy_starspace_ngram_index.ann")

###  Load Index and search

In [53]:
index_size = DIM
table = AnnoyIndex(index_size)
table.load(file_path+"annoy_starspace_ngram_index.ann")

True

###  Find neigbhours of train set


#### Build training name vectors

In [54]:
# function to fetch nearest neighbours
def get_nearest_neighbours(vectors, num_neighbours=1, search_nodes=-1, include_distances=True):
    neighbours = np.empty(shape=(len(vectors),num_neighbours), dtype=np.int32)
    distances = np.empty(shape=(len(vectors),num_neighbours))
#     print len(vectors)
#     print neighbours.shape
#     print distances.shape
    for idx,v in enumerate(vectors):
        annoy_ids, annoy_distances = table.get_nns_by_vector(v, n=num_neighbours, search_k=search_nodes, include_distances=include_distances)
        neighbours[idx,:] = annoy_ids
        distances[idx,:] = annoy_distances
    return neighbours, distances

#### We can retrieve n nearest neighborus, but since we need find only the ground truth I fetch only the nearest neighbour 


In [55]:
train.shape

(100000, 5)

In [56]:
%%time
train_vectors = make_name_vectors(train["n_grams"],starspace_embeddings)
neighbours, distances = get_nearest_neighbours(train_vectors)
neighbour_company_ids =np.vectorize(lambda x: annoy2company[x])(neighbours)

CPU times: user 26.4 s, sys: 35.7 ms, total: 26.5 s
Wall time: 26.5 s


#### Set nearest neighbour to be the prediction

In [57]:
rank_1 = neighbour_company_ids[:,0]
train["predicted_id"] = rank_1

In [58]:
train= train.merge(companies[['company_id', 'name']], left_on="predicted_id", 
                   right_on="company_id", suffixes=("","_predicted"))

In [59]:
train.head()

Unnamed: 0,train_index,name,company_id,clean_name,n_grams,predicted_id,company_id_predicted,name_predicted
0,0,TRATTAMENTO Ltd RIFIUTI METROPOLITANI SPA SIGLABILE TRM SPA,177358,trattamento ltd rifiuti metropolitani spa siglabile trm spa,tra rat att tta tam ame men ent nto to_ o_l _lt ltd td_ d_r _ri rif ifi fiu iut uti ti_ i_m _me met etr tro rop opo pol oli lit ita tan ani ni_ i_s _sp spa pa_ a_s _si sig igl gla lab abi bil ile le_ e_t _tr trm rm_ m_s _sp spa,177358,177358,TRATTAMENTO RIFIUTI METROPOLITANI S.P.A. SIGLABILE TRM S.P.A.
1,1,A IRL Fuund,568472,a irl fuund,a_i _ir irl rl_ l_f _fu fuu uun und,223012,223012,IMPRESA EDILE CAMBREA ROCCO S.R.L
2,2,BMR-500 Kendall LLC 1 Mezz GmbH,195692,bmr 500 kendall llc 1 mezz gmbh,bmr mr_ r_5 _50 500 00_ 0_k _ke ken end nda dal all ll_ l_l _ll llc lc_ c_1 _1_ 1_m _me mez ezz zz_ z_g _gm gmb mbh,195692,195692,BRE-BMR-500 Kendall Mezz 1 LLC
3,3,Solich GmbH KG,-1,solich gmbh kg,sol oli lic ich ch_ h_g _gm gmb mbh bh_ h_k _kg,497655,497655,Fröhlich GmbH
4,4,Drzyzzga Funds Logi. sp. z oo,404178,drzyzzga funds logi sp z oo,drz rzy zyz yzz zzg zga ga_ a_f _fu fun und nds ds_ s_l _lo log ogi gi_ i_s _sp sp_ p_z _z_ z_o _oo,331552,331552,SITS Sp. z o.o.


In [60]:
train[["train_index", "name", "company_id", "predicted_id", "name_predicted"]]

Unnamed: 0,train_index,name,company_id,predicted_id,name_predicted
0,0,TRATTAMENTO Ltd RIFIUTI METROPOLITANI SPA SIGLABILE TRM SPA,177358,177358,TRATTAMENTO RIFIUTI METROPOLITANI S.P.A. SIGLABILE TRM S.P.A.
1,1,A IRL Fuund,568472,223012,IMPRESA EDILE CAMBREA ROCCO S.R.L
2,2,BMR-500 Kendall LLC 1 Mezz GmbH,195692,195692,BRE-BMR-500 Kendall Mezz 1 LLC
3,3,Solich GmbH KG,-1,497655,Fröhlich GmbH
4,4,Drzyzzga Funds Logi. sp. z oo,404178,331552,SITS Sp. z o.o.
5,36093,Aurepio TRUST Sp. z o.o.,-1,331552,SITS Sp. z o.o.
6,5,BERK ELEY LLP,438615,20860,"Wesley Woods at New Albany, LLC"
7,6,SCHNEIDER ELECTRIC PROTECTION & CONTROLE,507569,507569,SCHNEIDER ELECTRIC PROTECTION & CONTROLE
8,7,DL ISL Inc,-1,256603,"Colonial Oil Industries, Inc."
9,8,Arendicom GmbH,495046,495046,Arendicom GmbH


### Evaluate
#### Since we do not handle -1 cases we only report accuracy.

In [61]:
total = len(train)
correct = train.loc[train["company_id"]==train["predicted_id"]].shape[0]
incorrect = train.loc[train["company_id"]!=train["predicted_id"]].shape[0]
print "Total: %d"%total
print "Correct Predictions %f"%(correct/float(total))
print "Incorrect Predictions %f"%(incorrect/float(total))

Total: 100000
Correct Predictions 0.372670
Incorrect Predictions 0.627330


### On excluding -1 cases accuracy is 53.4%

In [62]:
train_sub = train.loc[train["company_id"]!=-1]
print np.sum(train_sub["company_id"] == train_sub["predicted_id"])/float(len(train_sub))

0.5343398715301675


### Accuracy = 0.5343
* n_grams=[3] 
* dim =100
* num_trees_annoy=200

Total: 100000
Correct Predictions 0.372670
Incorrect Predictions 0.627330




### Generate Test predictions

In [63]:
test = pd.read_csv(data_path+"STest.csv", delimiter=",")
test.tail()

Unnamed: 0,test_index,name
99995,99995,Spó�jama�ka z ograniczon� Investmexnt� odpowiedzialnością
99996,99996,Banbrico LLC Limited
99997,99997,The NF
99998,99998,OLZ Holding - EWD Equity (ex CH)
99999,99999,VÆ299 99-NGE


In [64]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
test_index    100000 non-null int64
name          99999 non-null object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [65]:
test.isnull().sum()

test_index    0
name          1
dtype: int64

In [66]:
#### Test set has missing names as well as incorrectly encoded names. We drp these

In [67]:
test = test.loc[test["name"].notnull()]

In [68]:
test = name_pipeline.transform(test)

None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


AttributeError: 'NoneType' object has no attribute 'split'

In [69]:
test.isnull().sum()

test_index    0   
name          0   
clean_name    2138
dtype: int64

In [70]:
test.loc[test["clean_name"].isnull()]

Unnamed: 0,test_index,name,clean_name
120,120,ASSISTEC SOCIETATE CU RL�,
124,124,RAMEXA ll� Hold ings TermékelőáIpari�s Zárűen működő rétKrs�nytársaság,
184,184,WARSAW GAS TRADING SPÓŁKA Z OGRAN.� ODPOWIEDZIALNOŚCIĄ,
191,191,Eurogalva - Galvanização e Metal.�nia S.A.,
208,208,"Chemgas Schiffahrts UG (haftungsbeschr�nkt) & Co. MT ""GASCHEM RHONE"" KG",
232,232,"""M"" S FÉRMMR� KORLÁTOLT FELELŐSSÉGŰ TÁRSASÁG",
237,237,Harald FUND Ullman Förs�k ringsrätt AB,
248,248,Arionn Sp. z o.o. 3 Spó� Fund�ka koman dytowo-akcyjna,
283,283,MA rs� & FöKonsult�ljning Equity AB,
320,320,Budap. Alap�r Rövid Kötvény Doll�,


In [71]:
test = test.loc[test["clean_name"].notnull()]

In [72]:
test["n_grams"] = make_corpus(test["clean_name"], N_GRAMS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [73]:
%%time
test_vectors = make_name_vectors(test["n_grams"],starspace_embeddings)
test_neighbours, test_distances = get_nearest_neighbours(test_vectors)
test_neigbours =np.vectorize(lambda x: annoy2company[x])(test_neighbours)

CPU times: user 25.6 s, sys: 43.9 ms, total: 25.7 s
Wall time: 25.7 s


In [74]:
print test_vectors.shape

(97861, 100)


In [75]:
print test_neighbours

[[ 78025]
 [445548]
 [174579]
 ...
 [ 82655]
 [370937]
 [ 62907]]


#### Set nearest neighbour to be the prediction

In [76]:
test["predicted_id"] = test_neigbours[:,0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [77]:
test= test.merge(companies[['company_id', 'name']], left_on="predicted_id", 
                   right_on="company_id", suffixes=("","_predicted"))

In [78]:
test

Unnamed: 0,test_index,name,clean_name,n_grams,predicted_id,company_id,name_predicted
0,0,THEking'S ROYAL HUSSARS OFFI. TRUST' TRUST,thekings royal hussars offi trust trust,the hek eki kin ing ngs gs_ s_r _ro roy oya yal al_ l_h _hu hus uss ssa sar ars rs_ s_o _of off ffi fi_ i_t _tr tru rus ust st_ t_t _tr tru rus ust,412751,412751,THE M&D TODD TRUST
1,1,Southern Powe rcompany SICAV,southern powe rcompany sicav,sou out uth the her ern rn_ n_p _po pow owe we_ e_r _rc rco com omp mpa pan any ny_ y_s _si sic ica cav,414072,414072,Southern Power Company
2,2,BMO S&P/TSX Ladde. Share ETF Index,bmo sp tsx ladde share etf index,bmo mo_ o_s _sp sp_ p_t _ts tsx sx_ x_l _la lad add dde de_ e_s _sh sha har are re_ e_e _et etf tf_ f_i _in ind nde dex,595336,595336,iShares V Public Limited Company - iShares S&P 500 Health Care Sector UCITS ETF USD (Acc)
3,12512,ProShares S&P 500 ex-Health Care ETF,proshares sp 500 ex health care etf,pro ros osh sha har are res es_ s_s _sp sp_ p_5 _50 500 00_ 0_e _ex ex_ x_h _he hea eal alt lth th_ h_c _ca car are re_ e_e _et etf,595336,595336,iShares V Public Limited Company - iShares S&P 500 Health Care Sector UCITS ETF USD (Acc)
4,3,PaI,pai,pai,108681,108681,UNTL
5,21903,uNTL,untl,unt ntl,108681,108681,UNTL
6,4,Clearview Two,clearview two,cle lea ear arv rvi vie iew ew_ w_t _tw two,563173,563173,Clearview Two
7,5,"CARTERA OkPTIMA MODERADA, FI",cartera okptima moderada fi,car art rte ter era ra_ a_o _ok okp kpt pti tim ima ma_ a_m _mo mod ode der era rad ada da_ a__ __f _fi,500381,500381,"CARTERA OPTIMA MODERADA, FI"
8,6,HOBART 85 LIMITED GmbH,hobart 85 limited gmbh,hob oba bar art rt_ t_8 _85 85_ 5_l _li lim imi mit ite ted ed_ d_g _gm gmb mbh,441687,441687,MRT GmbH
9,7,SOC MAX MODEL,soc max model,soc oc_ c_m _ma max ax_ x_m _mo mod ode del,440339,440339,SOC MAX MODEL


#### Save test predictions

In [79]:
test[["test_index", "company_id"]].to_csv(data_path+"test_predictions.csv", sep="|", index=False)

In [80]:
test.shape

(97861, 7)

That is all!