In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn import svm, linear_model
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_curve, auc,f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict
import unicodedata
import random
random.seed(37)

In [2]:
pd.set_option('display.max_colwidth', -1)

In [3]:
path = "/home/akash/projects/data_analysis/ing/data/"

In [4]:
companies = pd.read_csv(path+"G.csv", delimiter="|")
train = pd.read_csv(path+"STrain.csv", delimiter="|")

In [5]:
companies.head()

Unnamed: 0,company_id,name
0,634022,PRIMCOM SA
1,324497,The David Isaacs Fund
2,280848,Bramor Enterprises Limited
3,432662,NAVEXIM S.A.
4,524224,Magal Group SA


In [6]:
companies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450256 entries, 0 to 450255
Data columns (total 2 columns):
company_id    450256 non-null int64
name          450256 non-null object
dtypes: int64(1), object(1)
memory usage: 6.9+ MB


In [7]:
train.head()

Unnamed: 0,train_index,name,company_id
0,0,TRATTAMENTO Ltd RIFIUTI METROPOLITANI SPA SIGLABILE TRM SPA,177358
1,1,A IRL Fuund,568472
2,2,BMR-500 Kendall LLC 1 Mezz GmbH,195692
3,3,Solich GmbH KG,-1
4,4,Drzyzzga Funds Logi. sp. z oo,404178


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
train_index    100000 non-null int64
name           100000 non-null object
company_id     100000 non-null int64
dtypes: int64(2), object(1)
memory usage: 2.3+ MB


In [9]:
test = pd.read_csv(path+"STest.csv", delimiter=",")
test.head()

Unnamed: 0,test_index,name
0,0,THEking'S ROYAL HUSSARS OFFI. TRUST' TRUST
1,1,Southern Powe rcompany SICAV
2,2,BMO S&P/TSX Ladde. Share ETF Index
3,3,PaI
4,4,Clearview Two


In [10]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
test_index    100000 non-null int64
name          99999 non-null object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [11]:
print train.columns
print test.columns

Index([u'train_index', u'name', u'company_id'], dtype='object')
Index([u'test_index', u'name'], dtype='object')


In [12]:
print train.shape
print train["name"].nunique()

(100000, 3)
99365


In [13]:
print test.shape
print test["name"].nunique()

(100000, 2)
99323


In [14]:
print len(set(test["name"]) & set(train["name"]))

1075


#### Sample Data

In [15]:
companies = companies.sample(50000)

In [16]:
train_0 = train.loc[train["company_id"]==-1]
print train_0.shape
train_1 = train.loc[train["company_id"]>=0]

print train_1.shape

(30256, 3)
(69744, 3)


In [17]:
train = train_1.loc[train_1["company_id"].isin(companies["company_id"])]
print train.shape

(7828, 3)


In [18]:
companies = companies.reset_index(drop=True)
companies.head()

Unnamed: 0,company_id,name
0,397803,Conforma NV
1,1036,Morgan Stanley Investment Funds - Emerging Markets Fixed Income Opportunities Fund
2,545357,Eurolease Group EAD
3,413401,Merum Vastgoed Sittard B.V.
4,142020,Coloramo Federal Credit Union


In [19]:
train0_sample = train_0.sample(int((1/9.0)*len(train_0)))
print len(train0_sample)

3361


In [20]:
train = pd.concat([train,train0_sample])

In [21]:
train.shape

(11189, 3)

In [22]:
train = train.reset_index(drop=True)
train.head()

Unnamed: 0,train_index,name,company_id
0,9,RBPA Leeuw,526042
1,12,"CP X, spol. s r.o.",139980
2,14,Dansk Socialrådgiverforening,179426
3,31,"Central Milling Holdi ngs,",202295
4,36,GTA (LIMITED LLC,144245


#### Make map of company_ids to Annoy_index_ids

In [23]:
# annoy_ids are from 0 to len(company_ids) -1 
# annoy_ids are stored in list. List index are annoy_ids and the value is teh correspodnign company_id
# Company_ids are stored in a dict. The key is company id and the value is annoy_id
def build_company_annoy_maps(company_ids):
    annoy2company = []
    company2annoy = defaultdict(lambda:0)
    for c_id in company_ids:
        if c_id not in company2annoy:
            annoy2company.append(c_id)
            company2annoy[c_id] = len(annoy2company)-1
    return annoy2company, company2annoy  
        

In [24]:
annoy2company, company2annoy = build_company_annoy_maps(companies["company_id"].values)

In [25]:
print len(annoy2company)
print len(company2annoy)

50000
50000


In [26]:
import random
for i in range(5):
    annoy_id = random.randint(0,50000)
    assert annoy_id ==  company2annoy[annoy2company[annoy_id]]
    print annoy_id, annoy2company[annoy_id], company2annoy[annoy2company[annoy_id]]

34100 241706 34100
4580 443259 4580
30891 306121 30891
42096 521034 42096
41728 639198 41728


In [27]:
def remove_accents(df,**kw_args):
    old_col =  kw_args["old_col"]
    new_col =  kw_args["new_col"]
    def remove_accents_inner(input_str):
        nfkd_form = unicodedata.normalize('NFKD', unicode(input_str, 'utf8'))
        return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
    
    df[new_col] = df[old_col].apply(remove_accents_inner)
    return df

def make_accent_transformer(old_col, new_col):
    return FunctionTransformer(remove_accents, validate=False,
                                         kw_args={"old_col":old_col,"new_col":new_col})

In [28]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/akash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
import re
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]|@,;\-\\\+#~!$%^]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z ]')
STOPWORDS = set(stopwords.words('english'))
#-#+_!`~\\

In [30]:
s = "for from india and add+g-ddc.b.g,c\g-hh ak|ash#sin/gh dee$pt^a su%d"
s=re.sub(REPLACE_BY_SPACE_RE," ",s)
s= re.sub(BAD_SYMBOLS_RE,"",s)
s= ' '.join(word for word in s.split() if word not in STOPWORDS)
print s

india add g ddcbg c g hh ak ash sin gh dee pt su


In [31]:
def clean_name(df,**kw_args):
    old_col =  kw_args["old_col"]
    new_col =  kw_args["new_col"]
    
    def regex_clean(text):
        text = re.sub(REPLACE_BY_SPACE_RE, " ", text)
        text = re.sub(BAD_SYMBOLS_RE, "", text)
        #text= ' '.join(word for word in text.split() if word not in STOPWORDS)
        return text
    
    df[new_col] = df[old_col].str.lower().str.strip()
    df[new_col] = df[old_col].apply(regex_clean)
    
    return df

def make_clean_name_transformer(old_col, new_col):
    return FunctionTransformer(clean_name, validate=False, kw_args={"old_col":old_col,"new_col":new_col})

In [32]:
name_pipeline = make_pipeline(make_accent_transformer("name", "clean_name"),
                                    make_clean_name_transformer("clean_name", "clean_name"))
companies = name_pipeline.transform(companies)

In [33]:
companies.sample(10)

Unnamed: 0,company_id,name,clean_name
3569,83139,Hartley & Marks Publishers Inc.,hartley marks publishers inc
16207,311669,Calico West Hartford LLC,calico west hartford llc
34759,303768,Tresmontes Lucchetti S.A.,tresmontes lucchetti sa
22619,387575,"Shani Realty, LLC",shani realty llc
45940,264764,"BKS International World Wide Investments, Inc",bks international world wide investments inc
39452,519410,LABORATORIO POLO SRL,laboratorio polo srl
16392,117171,Evolution M 7,evolution m 7
33034,404318,Exklusiv Portfolio SICAV - Ertrag,exklusiv portfolio sicav ertrag
39503,198263,PALATINE MONETAIRE,palatine monetaire
42731,514685,St Job Rotterdam C.V.,st job rotterdam cv


In [34]:
print companies.shape
print companies["clean_name"].nunique()

(50000, 3)
49955


In [35]:
train.head()

Unnamed: 0,train_index,name,company_id
0,9,RBPA Leeuw,526042
1,12,"CP X, spol. s r.o.",139980
2,14,Dansk Socialrådgiverforening,179426
3,31,"Central Milling Holdi ngs,",202295
4,36,GTA (LIMITED LLC,144245


In [36]:
print train.shape
train.sample(10)

(11189, 3)


Unnamed: 0,train_index,name,company_id
8415,71832,db x-trac. Global - Euro Stoxx 50® ex Financials UCITS ETF (DR),-1
5472,70023,Fruktimporten Stock holm Capital AB,116782
3779,48227,HC mCL,568979
5240,67054,HK VODA BELAood Inter.,217936
4322,55481,Schoellerbank Euroalternativ,28329
1774,22182,KE Wiborg Holding Capital ApS,433285
7725,98793,Horizons EIEE,525424
9888,69241,SIgAV UEME LLC,-1
7381,94716,unctum Saliens BV,250026
7071,90756,442 Lorimer LsLC Street,503772


In [37]:
from collections import Counter
count_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1)
# X matrix where the row represents sentences and column is our one-hot vector for each token in our vocabulary
X = count_vectorizer.fit_transform(companies["clean_name"])

# Vocabulary
vocab = list(count_vectorizer.get_feature_names())

# Column-wise sum of the X matrix.
# It's some crazy numpy syntax that looks horribly unpythonic
# For details, see http://stackoverflow.com/questions/3337301/numpy-matrix-to-array
# and http://stackoverflow.com/questions/13567345/how-to-calculate-the-sum-of-all-columns-of-a-2d-numpy-array-efficiently
counts = X.sum(axis=0).A1

freq_distribution = Counter(dict(zip(vocab, counts)))

In [38]:
print (freq_distribution.most_common(100))

[(u'llc', 4888), (u'fund', 4321), (u'limited', 3877), (u'gmbh', 3253), (u'inc', 2859), (u'srl', 2348), (u'bv', 2311), (u'trust', 2259), (u'ltd', 1759), (u'sa', 1622), (u'co', 1385), (u'lp', 1247), (u'global', 1239), (u'company', 1168), (u'the', 1164), (u'holding', 1113), (u'de', 1082), (u'kg', 1082), (u'funds', 1074), (u'ab', 1050), (u'investment', 988), (u'sl', 855), (u'international', 831), (u'capital', 813), (u'spa', 747), (u'of', 741), (u'di', 731), (u'holdings', 728), (u'equity', 725), (u'ii', 696), (u'group', 681), (u'as', 678), (u'sicav', 664), (u'investments', 625), (u'management', 612), (u'aps', 599), (u'societa', 573), (u'bond', 554), (u'portfolio', 539), (u'partners', 537), (u'invest', 512), (u'oy', 490), (u'bank', 488), (u'and', 466), (u'corporation', 443), (u'master', 441), (u'sro', 429), (u'income', 426), (u'plc', 421), (u'mbh', 415), (u'settlement', 398), (u'nv', 391), (u'europe', 391), (u'ag', 390), (u'beheer', 389), (u'spoka', 387), (u'services', 381), (u'etf', 365), (

In [39]:
train = name_pipeline.transform(train)

In [40]:
train.sample(10)

Unnamed: 0,train_index,name,company_id,clean_name
1945,24251,Fack. GmbH + Co KG,417042,fack gmbh co kg
7099,91063,GUAPULO Funds TRUST,178505,guapulo funds trust
7,78,Limited Shevo,102540,limited shevo
2456,31140,HOLDINGSELSKABET AF 31. MARTS Holdings 2003 ApS,180908,holdingselskabet af 31 marts holdings 2003 aps
4557,58359,"IN, C. por SgICAV A.",179184,in c por sgicav a
5682,72686,MELF S.à r.l.,212941,melf sa rl
3281,41585,"HE 6095, LP",323483,he 6095 lp
1591,19831,Believa GmbH,458164,believa gmbh
3967,50757,HVP Ligting NV,398740,hvp ligting nv
7471,95807,BANCA VALDICHIANA CREDITO COOPE RATIVO TOSCO Funds - UMBRO - SOCIETA' COOPERATIVA,349816,banca valdichiana credito coope rativo tosco funds umbro societa cooperativa


In [41]:
tfidf_vectorizer = TfidfVectorizer( max_df=0.8, ngram_range=(2,3), analyzer="char",
                                       token_pattern='(\S+)', )

In [42]:
company_vectors = tfidf_vectorizer.fit_transform(companies["clean_name"])

In [43]:
type(company_vectors)

scipy.sparse.csr.csr_matrix

In [44]:
company_vectors.shape

(50000, 15215)

In [45]:
companies.shape

(50000, 3)

In [46]:
for i in range(5):
    print tfidf_vectorizer.inverse_transform(company_vectors[i])
    c_id = companies.iloc[i]["company_id"]
    print companies.iloc[i]["name"], c_id, company2annoy[c_id], annoy2company[company2annoy[c_id]]
    
    print "-"*20

[array([u'co', u'on', u'nf', u'fo', u'or', u'rm', u'ma', u'a ', u' n',
       u'nv', u'con', u'onf', u'nfo', u'for', u'orm', u'rma', u'ma ',
       u'a n', u' nv'], dtype='<U3')]
Conforma NV 397803 0 397803
--------------------
[array([u'co', u'or', u'ma', u'nv', u'mo', u'rg', u'ga', u'an', u'n ',
       u' s', u'st', u'ta', u'nl', u'le', u'ey', u'y ', u' i', u'in',
       u've', u'es', u'tm', u'me', u'en', u'nt', u't ', u' f', u'fu',
       u'un', u'nd', u'ds', u's ', u' e', u'em', u'er', u'gi', u'ng',
       u'g ', u' m', u'ar', u'rk', u'ke', u'et', u'ts', u'fi', u'ix',
       u'xe', u'ed', u'd ', u'nc', u'om', u'e ', u' o', u'op', u'pp',
       u'po', u'rt', u'tu', u'ni', u'it', u'ti', u'ie', u'mor', u'org',
       u'rga', u'gan', u'an ', u'n s', u' st', u'sta', u'tan', u'anl',
       u'nle', u'ley', u'ey ', u'y i', u' in', u'inv', u'nve', u'ves',
       u'est', u'stm', u'tme', u'men', u'ent', u'nt ', u't f', u' fu',
       u'fun', u'und', u'nds', u'ds ', u's e', u' em', u'eme', u'm

In [47]:
companies.tail()

Unnamed: 0,company_id,name,clean_name
49995,66919,"R.C. Bigelow, Inc.",rc bigelow inc
49996,487440,"AMP Capital Infrastructure Debt Fund III (USD), LP",amp capital infrastructure debt fund iii usd lp
49997,316479,"Real Torino, Inc.",real torino inc
49998,80413,"GS Legacy Crossing, LLC",gs legacy crossing llc
49999,399901,"""NES 2005 S.R.L.""",nes 2005 srl


In [48]:
annoy2company[49999]

399901

In [49]:
type(company_vectors[0].todense())
print np.squeeze(np.asarray(company_vectors[0].todense())).shape

(15215,)


In [50]:
from annoy import AnnoyIndex

index_size = len(tfidf_vectorizer.vocabulary_)
table = AnnoyIndex(index_size)

for i in range(company_vectors.shape[0]):
    if i%10000==0:
        print "indexed %s items"%i
    table.add_item(i,np.squeeze(np.asarray(company_vectors[i].todense())))

indexed 0 items
indexed 10000 items
indexed 20000 items
indexed 30000 items
indexed 40000 items


In [51]:
table.build(100)

True

In [52]:
table.save("annoy_index.ann")

True

In [53]:
table = AnnoyIndex(index_size)
table.load("annoy_index.ann")

True

#### Reteiveal

In [54]:
train = train.merge(companies, how="left", on="company_id", suffixes=('','_truth'))

In [55]:
train_vectors = tfidf_vectorizer.transform(train["clean_name"])

In [56]:
train_vectors_dense = np.squeeze(np.asarray(train_vectors.todense()))

In [57]:
train_vectors_dense.shape

(11189, 15215)

In [58]:
def get_nearest_neighbours(vectors, num_neighbours=5, search_nodes=-1, include_distances=True):
    neighbours = np.empty(shape=(len(vectors),num_neighbours), dtype=np.int32)
    distances = np.empty(shape=(len(vectors),num_neighbours))
#     print len(vectors)
#     print neighbours.shape
#     print distances.shape
    for idx,v in enumerate(vectors):
        annoy_ids, annoy_distances = table.get_nns_by_vector(v, n=num_neighbours, search_k=search_nodes, include_distances=include_distances)
        neighbours[idx,:] = annoy_ids
        distances[idx,:] = annoy_distances
    return neighbours, distances

In [59]:
neighbours, distances = get_nearest_neighbours(train_vectors_dense)

In [60]:
neighbours

array([[44764, 19848, 42571, 32042, 23728],
       [33988, 31936, 39184, 33960, 17123],
       [  583, 27741, 12374, 45411, 39815],
       ...,
       [21071, 10158, 13313, 10123, 13075],
       [26232, 18286, 44991,  7696,  7870],
       [  972, 18530, 29467, 31200,  8931]], dtype=int32)

In [61]:
neigbours =np.vectorize(lambda x: annoy2company[x])(neighbours)

In [62]:
rank_1 = neigbours[:,0]
rank_1

array([526042, 115370, 179426, ..., 641016,  77628, 255937])

In [63]:
train["rank1_id"] = rank_1

In [64]:
train= train.merge(companies[['company_id', 'clean_name']], left_on="rank1_id", 
                   right_on="company_id", suffixes=("","_rank1"))

In [65]:
train

Unnamed: 0,train_index,name,company_id,clean_name,name_truth,clean_name_truth,rank1_id,company_id_rank1,clean_name_rank1
0,9,RBPA Leeuw,526042,rbpa leeuw,RBPA Leeuw,rbpa leeuw,526042,526042,rbpa leeuw
1,12,"CP X, spol. s r.o.",139980,cp x spol s ro,"CTP Property X, spol. s r.o.",ctp property x spol s ro,115370,115370,ortex spol s ro
2,99957,"ORTEX , spol. s ro",115370,ortex spol s ro,"ORTEX , spol. s r.o.",ortex spol s ro,115370,115370,ortex spol s ro
3,14,Dansk Socialrådgiverforening,179426,dansk socialradgiverforening,Dansk Socialrådgiverforening,dansk socialradgiverforening,179426,179426,dansk socialradgiverforening
4,31,"Central Milling Holdi ngs,",202295,central milling holdi ngs,"Central Milling Holdings, Inc.",central milling holdings inc,202295,202295,central milling holdings inc
5,12088,"Central Milling Holdings, Inc.",202295,central milling holdings inc,"Central Milling Holdings, Inc.",central milling holdings inc,202295,202295,central milling holdings inc
6,34591,"Central Milling Holdi., Inc Funds.",202295,central milling holdi inc funds,"Central Milling Holdings, Inc.",central milling holdings inc,202295,202295,central milling holdings inc
7,36,GTA (LIMITED LLC,144245,gta limited llc,GTA (RETAIL) LIMITED,gta retail limited,144245,144245,gta retail limited
8,53,Bamboo Holdings Ltd,130929,bamboo holdings ltd,Bamboo Investment Holdings Ltd,bamboo investment holdings ltd,130929,130929,bamboo investment holdings ltd
9,71,"HPB, LC",342136,hpb lc,"HPB Investments, LLC",hpb investments llc,631015,631015,pb 11


In [66]:
train.loc[train["company_id"]==train["rank1_id"]].shape

(6053, 9)

In [67]:
train_sub = train.loc[train["company_id"]!=-1]

In [68]:
print np.sum(train_sub["company_id"] == train_sub["rank1_id"])/float(len(train_sub))

0.7732498722534492
