In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn import svm, linear_model
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_curve, auc,f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict
import unicodedata
import random
random.seed(37)
import gensim
from gensim.models import Word2Vec
from annoy import AnnoyIndex

In [2]:
pd.set_option('display.max_colwidth', -1)

In [3]:
path = "/home/akash/projects/data_analysis/company_name_matching/data/"

In [4]:
companies = pd.read_csv(path+"G.csv", delimiter="|")
train = pd.read_csv(path+"STrain.csv", delimiter="|")

In [5]:
companies.head()

Unnamed: 0,company_id,name
0,634022,PRIMCOM SA
1,324497,The David Isaacs Fund
2,280848,Bramor Enterprises Limited
3,432662,NAVEXIM S.A.
4,524224,Magal Group SA


In [6]:
companies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450256 entries, 0 to 450255
Data columns (total 2 columns):
company_id    450256 non-null int64
name          450256 non-null object
dtypes: int64(1), object(1)
memory usage: 6.9+ MB


In [7]:
train.head()

Unnamed: 0,train_index,name,company_id
0,0,TRATTAMENTO Ltd RIFIUTI METROPOLITANI SPA SIGLABILE TRM SPA,177358
1,1,A IRL Fuund,568472
2,2,BMR-500 Kendall LLC 1 Mezz GmbH,195692
3,3,Solich GmbH KG,-1
4,4,Drzyzzga Funds Logi. sp. z oo,404178


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
train_index    100000 non-null int64
name           100000 non-null object
company_id     100000 non-null int64
dtypes: int64(2), object(1)
memory usage: 2.3+ MB


In [9]:
test = pd.read_csv(path+"STest.csv", delimiter=",")
test.head()

Unnamed: 0,test_index,name
0,0,THEking'S ROYAL HUSSARS OFFI. TRUST' TRUST
1,1,Southern Powe rcompany SICAV
2,2,BMO S&P/TSX Ladde. Share ETF Index
3,3,PaI
4,4,Clearview Two


In [10]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
test_index    100000 non-null int64
name          99999 non-null object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [11]:
print train.columns
print test.columns

Index([u'train_index', u'name', u'company_id'], dtype='object')
Index([u'test_index', u'name'], dtype='object')


In [12]:
print train.shape
print train["name"].nunique()

(100000, 3)
99365


In [13]:
print test.shape
print test["name"].nunique()

(100000, 2)
99323


In [14]:
print len(set(test["name"]) & set(train["name"]))

1075


In [15]:
companies = companies.reset_index(drop=True)
companies.head()

Unnamed: 0,company_id,name
0,634022,PRIMCOM SA
1,324497,The David Isaacs Fund
2,280848,Bramor Enterprises Limited
3,432662,NAVEXIM S.A.
4,524224,Magal Group SA


#### Sample Data

In [16]:
# orig_size = len(companies)
# sample_size = 200000 
# companies = companies.sample(sample_size)

In [17]:
# train_0 = train.loc[train["company_id"]==-1]
# print train_0.shape

# train_1 = train.loc[train["company_id"]>=0]
# print train_1.shape

In [18]:
# train = train_1.loc[train_1["company_id"].isin(companies["company_id"])]
# print train.shape

In [19]:
# train0_sample = train_0.sample(int(sample_size/float(orig_size)*len(train_0)))
# print len(train0_sample)

In [20]:
# train = pd.concat([train,train0_sample])

In [21]:
train.shape

(100000, 3)

In [22]:
train = train.reset_index(drop=True)
train.head()

Unnamed: 0,train_index,name,company_id
0,0,TRATTAMENTO Ltd RIFIUTI METROPOLITANI SPA SIGLABILE TRM SPA,177358
1,1,A IRL Fuund,568472
2,2,BMR-500 Kendall LLC 1 Mezz GmbH,195692
3,3,Solich GmbH KG,-1
4,4,Drzyzzga Funds Logi. sp. z oo,404178


#### Make map of company_ids to Annoy_index_ids

In [23]:
# annoy_ids are from 0 to len(company_ids) -1 
# annoy_ids are stored in a list: the list index is the annoy_id and the value is the corresponding company_id
# Company_ids are stored in a dict. The key is company id and the value is annoy_id
def build_company_annoy_maps(company_ids):
    """Build the two lookup tables between company ids and Annoy item ids.

    Annoy item ids are assigned in order of first appearance, starting at 0;
    repeated company ids keep the id they were given the first time.

    Parameters
    ----------
    company_ids : iterable of hashable ids.

    Returns
    -------
    (annoy2company, company2annoy)
        annoy2company : list where position ``i`` holds the company id of
            Annoy item ``i``.
        company2annoy : defaultdict mapping company id -> Annoy item id.
            Caution: being a ``defaultdict`` with a ``0`` factory, looking up
            an unknown company id returns 0 (and inserts that key) instead of
            raising ``KeyError``.
    """
    annoy2company = []
    company2annoy = defaultdict(lambda: 0)
    for company_id in company_ids:
        if company_id in company2annoy:
            # already indexed; keep the first assignment
            continue
        # the next free Annoy id is the current length of the list
        company2annoy[company_id] = len(annoy2company)
        annoy2company.append(company_id)
    return annoy2company, company2annoy
        

In [24]:
annoy2company, company2annoy = build_company_annoy_maps(companies["company_id"].values)

In [25]:
print len(annoy2company)
print len(company2annoy)

450256
450256


In [26]:
import random
for i in range(5):
    annoy_id = random.randint(0,50000)
    assert annoy_id ==  company2annoy[annoy2company[annoy_id]]
    print annoy_id, annoy2company[annoy_id], company2annoy[annoy2company[annoy_id]]

34100 198710 34100
4580 484544 4580
30891 320028 30891
42096 89805 42096
41728 232717 41728


In [27]:
def remove_accents(df,**kw_args):
    """Strip diacritics from a text column and write the result to another column.

    Intended to be wrapped in a ``FunctionTransformer``; the column names
    arrive through ``kw_args``.

    Parameters
    ----------
    df : DataFrame holding the column to process. It is modified in place
        (``new_col`` is added or overwritten) and also returned.
    kw_args : must contain
        ``old_col`` - name of the column to read,
        ``new_col`` - name of the column to write.

    Returns
    -------
    The same ``df`` object, with ``new_col`` holding accent-free text.

    Raises
    ------
    KeyError if ``old_col``/``new_col`` is missing from ``kw_args`` or
    ``old_col`` is not a column of ``df``.
    """
    old_col =  kw_args["old_col"]
    new_col =  kw_args["new_col"]
    text_type = type(u"")  # `unicode` on Python 2, `str` on Python 3

    def remove_accents_inner(input_str):
        if isinstance(input_str, bytes):
            # raw UTF-8 bytes (what Python 2 pandas yields for object columns)
            text = input_str.decode('utf8')
        elif isinstance(input_str, text_type):
            # already decoded; decoding again would raise
            text = input_str
        elif pd.isnull(input_str):
            # missing name: return an empty string so later string steps don't fail
            return u""
        else:
            text = text_type(input_str)
        # NFKD splits accented characters into base character + combining
        # mark; dropping the combining marks leaves the unaccented base.
        nfkd_form = unicodedata.normalize('NFKD', text)
        return u"".join(c for c in nfkd_form if not unicodedata.combining(c))

    df[new_col] = df[old_col].apply(remove_accents_inner)
    return df

def make_accent_transformer(old_col, new_col):
    """Return a FunctionTransformer that applies ``remove_accents``.

    old_col : column the transformer reads from.
    new_col : column the transformer writes the accent-free text to.
    """
    column_args = {"old_col": old_col, "new_col": new_col}
    # validate=False: the input is a DataFrame of strings, not a numeric array
    return FunctionTransformer(remove_accents, validate=False, kw_args=column_args)

In [28]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/akash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
import re
# Raw strings keep the regex escapes (\[ \] \- \\) intact without relying on
# Python passing unknown string escapes through unchanged.
# Separator-like symbols (including backslash and '+') that are replaced by a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]|@,;\-\\+#~!$%^]')
# Anything that is not a digit, a lowercase ASCII letter or a space is dropped.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z ]')
STOPWORDS = set(stopwords.words('english'))
#-#+_!`~\\

In [30]:
s = "for from india and add+g-ddc.b.g,c\g-hh ak|ash#sin/gh dee$pt^a su%d"
s=re.sub(REPLACE_BY_SPACE_RE," ",s)
s= re.sub(BAD_SYMBOLS_RE,"",s)
s= ' '.join(word for word in s.split() if word not in STOPWORDS)
print s

india add g ddcbg c g hh ak ash sin gh dee pt su


In [31]:
def clean_name(df,**kw_args):
    """Normalise a name column: lowercase, remove symbols, collapse whitespace.

    Intended to be wrapped in a ``FunctionTransformer``.

    df : DataFrame to process; modified in place and returned.
    kw_args : must contain ``old_col`` (column to read) and ``new_col``
        (column to write; may be the same as ``old_col``).

    Returns the same ``df`` with ``new_col`` holding the cleaned names.
    """
    source_col = kw_args["old_col"]
    target_col = kw_args["new_col"]

    def normalise(text):
        # Separator-like symbols become spaces first so the tokens they
        # separate are not glued together when other symbols are dropped.
        spaced = REPLACE_BY_SPACE_RE.sub(" ", text)
        kept = BAD_SYMBOLS_RE.sub("", spaced)
        # NOTE: an experiment that appended an initials "shorthand" of the
        # non-numeric words to the name was tried here and is disabled.
        # split()/join collapses runs of whitespace and trims both ends
        return " ".join(kept.split())

    df[target_col] = df[source_col].str.lower().str.strip()
    df[target_col] = df[target_col].apply(normalise)
    return df

def make_clean_name_transformer(old_col, new_col):
    """Return a FunctionTransformer that applies ``clean_name``.

    old_col : column the transformer reads from.
    new_col : column the transformer writes the cleaned names to.
    """
    column_args = {"old_col": old_col, "new_col": new_col}
    # validate=False: the input is a DataFrame of strings, not a numeric array
    return FunctionTransformer(clean_name, validate=False, kw_args=column_args)

In [32]:
name_pipeline = make_pipeline(make_accent_transformer("name", "clean_name"),
                                    make_clean_name_transformer("clean_name", "clean_name"))
companies = name_pipeline.transform(companies)

In [33]:
companies.sample(10)

Unnamed: 0,company_id,name,clean_name
116022,531186,EVI East Grand Forks LLC,evi east grand forks llc
281337,318011,Linden Logistics (Lux) S.à r.l.,linden logistics lux sa rl
348395,612231,Boryszewo Wind Invest spółka z ograniczoną odpowiedzialnością,boryszewo wind invest spoka z ograniczona odpowiedzialnoscia
368424,82576,BASF Coatings Services S.A.,basf coatings services sa
242055,28080,PREFA PRAHA a.s.,prefa praha as
51627,570672,Gesellschaft für staatsbürgerliche Bildung Saar mbH,gesellschaft fur staatsburgerliche bildung saar mbh
144787,108161,Fidelity Funds - Solutions SMART Defensive Pool,fidelity funds solutions smart defensive pool
145193,264180,"HUNTSMAN P&A SPAIN, S.L",huntsman pa spain sl
243976,408730,LOGICOR (CURVE) WE EINS GMBH,logicor curve we eins gmbh
299347,602125,Graben 19 Immobilien GmbH,graben 19 immobilien gmbh


In [34]:
print companies.shape
print companies["clean_name"].nunique()

(450256, 3)
448050


In [35]:
train.head()

Unnamed: 0,train_index,name,company_id
0,0,TRATTAMENTO Ltd RIFIUTI METROPOLITANI SPA SIGLABILE TRM SPA,177358
1,1,A IRL Fuund,568472
2,2,BMR-500 Kendall LLC 1 Mezz GmbH,195692
3,3,Solich GmbH KG,-1
4,4,Drzyzzga Funds Logi. sp. z oo,404178


In [36]:
print train.shape
train.sample(10)

(100000, 3)


Unnamed: 0,train_index,name,company_id
87480,87480,Aberd een GA Property Share Fdund,514345
83155,83155,Inter. Capital SEC,-1
89655,89655,"VM, LC Holdings",-1
61920,61920,HIDROTECNICA IMMOBILIARE SRL,-1
91770,91770,ASSETalliane LIMITED LEASING,368975
12387,12387,"RESOURCES DAYTALspain, SL",161446
86495,86495,Landeskirche/LB AM 95 ZVK NROD Hanno.,380644
27839,27839,Jona,627708
93834,93834,"300 Rke Drive, LLC",-1
76313,76313,VantageTrust Master Coll ectiveiii Funds Trust - VT Vantagepoint Emerging Markets Fund,-1


In [37]:
from collections import Counter
count_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1)
# X matrix where the row represents sentences and column is our one-hot vector for each token in our vocabulary
X = count_vectorizer.fit_transform(companies["clean_name"])

# Vocabulary
vocab = list(count_vectorizer.get_feature_names())

# Column-wise sum of the X matrix.
# It's some crazy numpy syntax that looks horribly unpythonic
# For details, see http://stackoverflow.com/questions/3337301/numpy-matrix-to-array
# and http://stackoverflow.com/questions/13567345/how-to-calculate-the-sum-of-all-columns-of-a-2d-numpy-array-efficiently
counts = X.sum(axis=0).A1

freq_distribution = Counter(dict(zip(vocab, counts)))

In [38]:
print (freq_distribution.most_common(100))

[(u'llc', 44288), (u'fund', 38551), (u'limited', 34803), (u'gmbh', 29272), (u'inc', 24952), (u'bv', 21019), (u'srl', 20646), (u'trust', 20440), (u'ltd', 16548), (u'sa', 15202), (u'co', 12984), (u'lp', 11681), (u'global', 11272), (u'de', 10292), (u'the', 10220), (u'company', 10159), (u'funds', 9906), (u'kg', 9865), (u'holding', 9770), (u'ab', 9459), (u'investment', 8335), (u'sl', 7666), (u'capital', 7485), (u'international', 7283), (u'equity', 6886), (u'di', 6700), (u'holdings', 6683), (u'of', 6680), (u'spa', 6663), (u'group', 6212), (u'as', 6092), (u'ii', 6061), (u'sicav', 5883), (u'management', 5723), (u'investments', 5456), (u'aps', 5192), (u'portfolio', 4959), (u'bond', 4854), (u'partners', 4848), (u'societa', 4605), (u'invest', 4526), (u'and', 4211), (u'oy', 4152), (u'sro', 4143), (u'bank', 3981), (u'plc', 3861), (u'services', 3851), (u'income', 3848), (u'master', 3799), (u'corporation', 3747), (u'nv', 3573), (u'beheer', 3568), (u'mbh', 3509), (u'spoka', 3506), (u'ag', 3479), (u'se

In [39]:
train = name_pipeline.transform(train)

In [40]:
train.sample(10)

Unnamed: 0,train_index,name,company_id,clean_name
59665,59665,ASH FARMS LD.,432998,ash farms ld
9267,9267,SCAT,519237,scat
68974,68974,Summit Real-Estate Omega GmbH,313570,summit real estate omega gmbh
66352,66352,CEGELEC EE,390949,cegelec ee
52567,52567,Indium V (Mauritius) Hold ings Limited TRUST,419594,indium v mauritius hold ings limited trust
96532,96532,2250 Fourth Avenue Fund Partnership,147223,2250 fourth avenue fund partnership
76669,76669,Belfius Select Portf olio Capital SA - World Balanced 40,80502,belfius select portf olio capital sa world balanced 40
17103,17103,"Wildwood FUND IP, LLC",-1,wildwood fund ip llc
24518,24518,TFC,426935,tfc
59208,59208,Wolk Holdings Aktiengesellschaft,1666,wolk holdings aktiengesellschaft


In [41]:
import gensim
assert gensim.models.doc2vec.FAST_VERSION > -1

In [42]:
companies.head(10)

Unnamed: 0,company_id,name,clean_name
0,634022,PRIMCOM SA,primcom sa
1,324497,The David Isaacs Fund,the david isaacs fund
2,280848,Bramor Enterprises Limited,bramor enterprises limited
3,432662,NAVEXIM S.A.,navexim sa
4,524224,Magal Group SA,magal group sa
5,513585,Marly SPF S.A.,marly spf sa
6,354496,I.T APPARELS LIMITED,it apparels limited
7,381944,VX 30.141 ApS,vx 30141 aps
8,526057,Rydex ETF Trust - Guggenheim S&P 500 Equal Weight ETF,rydex etf trust guggenheim sp 500 equal weight etf
9,34381,Rydex Series Funds - Retailing Fund,rydex series funds retailing fund


In [43]:
companies.loc[companies["company_id"] == 526057,"clean_name"]

8    rydex etf trust guggenheim sp 500 equal weight etf
Name: clean_name, dtype: object

In [44]:
companies.clean_name.values[0:9]

array([u'primcom sa', u'the david isaacs fund',
       u'bramor enterprises limited', u'navexim sa', u'magal group sa',
       u'marly spf sa', u'it apparels limited', u'vx 30141 aps',
       u'rydex etf trust guggenheim sp 500 equal weight etf'],
      dtype=object)

In [45]:
b='student ddg dfs'
n=4
print [b[i:i+n] for i in range(len(b)-n+1)]

['stud', 'tude', 'uden', 'dent', 'ent ', 'nt d', 't dd', ' ddg', 'ddg ', 'dg d', 'g df', ' dfs']


#### SETTINGS

In [46]:
n_grams=[2,3]
dim = 400
window =5
min_count=5
workers=4
num_trees_annoy=300

In [None]:
def make_corpus(names, n_grams=[3]):
    """Turn each name into its list of character n-grams.

    names : iterable of strings.
    n_grams : n-gram sizes to extract; this list is only read, never modified.

    Returns a list with one entry per name. Each entry lists every
    overlapping character n-gram (spaces included), grouped by size in the
    order given by ``n_grams``. A name shorter than ``n`` contributes no
    n-grams of that size.
    """
    def char_ngrams(text):
        return [text[start:start + size]
                for size in n_grams
                for start in range(len(text) - size + 1)]

    return [char_ngrams(name) for name in names]

In [None]:
def make_name_vectors(corpus, embeddings=None, vector_dim=None):
    """Embed every tokenised name of a corpus as one row of a matrix.

    Parameters
    ----------
    corpus : sequence of token lists, as produced by ``make_corpus``.
    embeddings : token -> vector lookup supporting ``in`` and ``[]``.
        Defaults to the notebook-level ``model.wv``.
    vector_dim : length of each embedding vector. Defaults to the
        notebook-level ``dim`` setting.

    Returns
    -------
    Array of shape ``(len(corpus), vector_dim)``; row ``i`` is
    ``name_to_vec(corpus[i], ...)``.
    """
    if embeddings is None:
        embeddings = model.wv
    if vector_dim is None:
        vector_dim = dim
    name_vectors = np.empty(shape=(len(corpus), vector_dim))
    for idx, name in enumerate(corpus):
        # pass the size explicitly so the rows and the matrix always agree,
        # whatever default name_to_vec was defined with
        name_vectors[idx, :] = name_to_vec(name, embeddings, vector_dim)
    return name_vectors

In [None]:
def name_to_vec(name, embeddings, dim=None):
    """
        Average the embeddings of a name's tokens into a single vector.

        name: iterable of tokens (e.g. the character n-grams of one name)
        embeddings: token -> vector lookup supporting ``in`` and ``[]``
        dim: length of the embedding vectors; when omitted, the notebook-level
             ``dim`` setting is read at call time, so re-running the SETTINGS
             cell takes effect without redefining this function

        result: array of length ``dim`` holding the mean of the embeddings of
                the tokens found in ``embeddings``; all zeros if none is found
    """
    if dim is None:
        # the parameter shadows the global of the same name
        dim = globals()["dim"]

    result = np.zeros(shape=dim)
    count = 0
    for word in name:
        if word in embeddings:
            result += embeddings[word]
            count += 1

    if count > 0:
        result /= count

    return result

In [None]:
company_corpus = make_corpus(companies["clean_name"], n_grams)

In [None]:
len(company_corpus)

In [None]:
companies.head()

In [None]:
company_corpus[0:5]

In [None]:
print gensim.models.doc2vec.FAST_VERSION

In [None]:
model = Word2Vec(company_corpus, size=dim, window=window, min_count=min_count, workers=workers)
model.wv

In [None]:
model.wv

In [None]:
model.wv["llc"].size

In [None]:
company_vectors = make_name_vectors(company_corpus)

In [None]:
company_vectors[0:5]

In [None]:
# index_size = dim
# table = AnnoyIndex(index_size)

# for i in range(company_vectors.shape[0]):
#     if i%20000==0:
#         print "indexed %s items"%i
#     table.add_item(i,company_vectors[i])

In [None]:
# table.build(num_trees_annoy)

In [None]:
# table.save("annoy_w2vec_index.ann")

In [None]:
# `index_size` only exists in the commented-out build cell, so size the index
# from the `dim` setting (the value `index_size` was assigned there).
# NOTE(review): the saved file must have been built from vectors of length
# `dim` with the same metric - confirm it matches the current SETTINGS cell.
table = AnnoyIndex(dim, 'angular')
table.load("annoy_w2vec_index.ann")

#### Retrieval

In [None]:
train = train.merge(companies, how="left", on="company_id", suffixes=('','_truth'))

In [None]:
train.head()

In [None]:
train_corpus = make_corpus(train["clean_name"], n_grams)

In [None]:
train_corpus[0:5]

In [None]:
#train_vectors = tfidf_vectorizer.transform(train["clean_name"])
train_vectors = make_name_vectors(train_corpus)

In [None]:
train_vectors[0:5]

In [None]:
def get_nearest_neighbours(vectors, num_neighbours=1, search_nodes=-1, include_distances=True, index=None):
    """Query the Annoy index for the nearest neighbours of each vector.

    Parameters
    ----------
    vectors : sequence of query vectors (one per row).
    num_neighbours : number of neighbours requested per query.
    search_nodes : forwarded to Annoy as ``search_k``.
    include_distances : whether to also fetch the distances.
    index : object exposing ``get_nns_by_vector``; defaults to the
        notebook-level ``table``.

    Returns
    -------
    (neighbours, distances)
        neighbours : int32 array of shape ``(len(vectors), num_neighbours)``
            holding the item ids returned by the index.
        distances : float array of the same shape, or ``None`` when
            ``include_distances`` is False.
    """
    if index is None:
        index = table
    neighbours = np.empty(shape=(len(vectors), num_neighbours), dtype=np.int32)
    distances = np.empty(shape=(len(vectors), num_neighbours)) if include_distances else None
    for idx, v in enumerate(vectors):
        result = index.get_nns_by_vector(v, n=num_neighbours, search_k=search_nodes,
                                         include_distances=include_distances)
        if include_distances:
            # with distances the index returns an (ids, distances) pair ...
            annoy_ids, annoy_distances = result
            distances[idx, :] = annoy_distances
        else:
            # ... otherwise just the ids, which must not be tuple-unpacked
            annoy_ids = result
        neighbours[idx, :] = annoy_ids
    return neighbours, distances

In [None]:
neighbours, distances = get_nearest_neighbours(train_vectors)

In [None]:
neigbours =np.vectorize(lambda x: annoy2company[x])(neighbours)

In [None]:
rank_1 = neigbours[:,0]
rank_1

In [None]:
train["rank1_id"] = rank_1

In [None]:
train= train.merge(companies[['company_id', 'clean_name']], left_on="rank1_id", 
                   right_on="company_id", suffixes=("","_rank1"))

In [None]:
train

In [None]:
print len(train)
print train.loc[train["company_id"]==train["rank1_id"]].shape
print train.loc[train["company_id"]!=train["rank1_id"]].shape

In [None]:
train_sub = train.loc[train["company_id"]!=-1]
print np.sum(train_sub["company_id"] == train_sub["rank1_id"])/float(len(train_sub))

##### Accuracy = 0.5219
* n_grams=[2,3] 
* dim =300
* window =5
* min_count=5
* workers=4
* num_trees_annoy=300




In [None]:
#### Process Test

In [None]:
test.head()

In [None]:
test = name_pipeline.transform(test)