In [1]:
!pip install -q ftfy 

from ftfy import fix_text
import re, sys, os
import pandas as pd
import numpy as np
import time
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix



In [2]:
path = '/Users/berkekavak/alpas/'
data = pd.read_csv(path + 'ds_challenge_alpas.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,entity_1,entity_2,tag
0,3137667,preciform A.B,Preciform AB,1
1,5515816,degener staplertechnik vertriebs-gmbh,Irshim,0
2,215797,Alltel South CaroliNA Inc,alltel south carolina INC.,1
3,1004621,cse Corporation,Cse Corp,1
4,1698689,Gruppo D Motors Srl,gruppo d motors Sociedad de Resposabilidad Lim...,1


In [3]:
df_train = data.sample(100000, random_state=1).reset_index(drop=True)

In [4]:
df_train

Unnamed: 0.1,Unnamed: 0,entity_1,entity_2,tag
0,7752920,scherer automotive incorporated.,elektroshchit A.O,0
1,5148830,technical supplies international llc,c&t reinforcing steel,0
2,725115,Campania Motori SRL,campania motori,1
3,3358104,rockford manufacturing group,Rockford Manufacturing Group Inc,1
4,7443139,organik kimya netherlands,pro-line shipping )),0
...,...,...,...,...
99995,5825486,hazle auto parts,ferlog nsorzio stabile,0
99996,5281981,schlueter a.i.r Co.nditioning and refrigeration,Cherubini France,0
99997,7831741,shanghai rongtai health,ineos oxide LTD,0
99998,6374489,international shipping lines,k c discounts,0


## 3-gram Analyzer for the TF-IDF Vectorizer

In [5]:
# Text preprocessing for cleaning the text from unnecessary character, punctuation and encoding-decoding issues 

def ngrams(string, n=3):
    string = str(string)
    
    # Fixes text for any possible decoding issues
    string = fix_text(string)
    
    # Removes non ascii chars
    string = string.lower()
    string = string.encode("ascii", errors="ignore").decode()
    
    # Cleaning unrelevant characters
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.title() 
    string = re.sub(' +',' ',string).strip()
    string = ' '+ string +' '
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

## TF-IDF vectorization to obtain document-term matrix for the entity_1 column

In [6]:
entity_1 = df_train['entity_1'].unique()
entity_1 = entity_1.astype('U')

# Vectorizes the first entities, used ngrams function defined above for the analyzer
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
# Used fit_transform obtain document-term matrix
tfidf = vectorizer.fit_transform(entity_1)

## Nearest Neighbors algorithm to fit TF-IDF matrix

In [7]:
# Create the kNN-model and learn from TF-IDF matrix
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)
unique_org = df_train['entity_2'].values.astype('U').tolist()

# We will use the Eucledian distance (which is default) of two TF-IDF matrices.
def getNearestN(query):
    # Returns a number document-term matrix
    queryTFIDF_ = vectorizer.transform(query) 
    distances, _ = nbrs.kneighbors(queryTFIDF_)
    return distances, _

start_time = time.time()
distances, _ = getNearestN(unique_org)
t = time.time() - start_time
print("Completed in:", t)

Completed in: 142.12506818771362


In [8]:
final_match = df_train[['entity_1']]
final_match.insert(1, 'entity_2', unique_org, True)
final_match.insert(2, 'Distance', distances, True)
final_match['Distance'] = final_match['Distance'].apply(lambda col: round(col,2))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_match['Distance'] = final_match['Distance'].apply(lambda col: round(col,2))


In [9]:
final_match[['entity_1','entity_2']].equals(df_train[['entity_1','entity_2']])

True

## Applied sklearn's MinMaxScaler to scale the distances between the range of 0 and 1

In [10]:
#First scaled the distance by using min max scaler
x = final_match['Distance'].values.reshape(-1,1)
min_max_scaler = MinMaxScaler()
scaled_conf = pd.Series(min_max_scaler.fit_transform(x).reshape(-1))
final_match.insert(3, "Scaled_Distance", scaled_conf, True)
final_match['Scaled_Distance'] = final_match['Scaled_Distance'].apply(lambda col: round(col,2))

## Since the distances and similarity probabilities are inversely proportional, I obtained the probabilities by: 1 - scaled_distance

In [11]:
proba = 1 - final_match['Scaled_Distance']
final_match.insert(4, "Probability", proba, True)

In [12]:
tag = df_train['tag']
final_match.insert(5, 'Tag', tag, True)

In [13]:
predicted = []
for i in range(df_train.shape[0]):
    if final_match['Probability'].iloc[i] >= 0.55:
        predicted.append(1)
    else:
        predicted.append(0)

In [14]:
final_match.insert(6, 'Predicted', predicted, True)

In [15]:
final_match

Unnamed: 0,entity_1,entity_2,Distance,Scaled_Distance,Probability,Tag,Predicted
0,scherer automotive incorporated.,elektroshchit A.O,0.81,0.62,0.38,0,0
1,technical supplies international llc,c&t reinforcing steel,0.79,0.61,0.39,0,0
2,Campania Motori SRL,campania motori,0.45,0.35,0.65,1,1
3,rockford manufacturing group,Rockford Manufacturing Group Inc,0.25,0.19,0.81,1,1
4,organik kimya netherlands,pro-line shipping )),0.72,0.55,0.45,0,0
...,...,...,...,...,...,...,...
99995,hazle auto parts,ferlog nsorzio stabile,1.05,0.81,0.19,0,0
99996,schlueter a.i.r Co.nditioning and refrigeration,Cherubini France,1.05,0.81,0.19,0,0
99997,shanghai rongtai health,ineos oxide LTD,0.95,0.73,0.27,0,0
99998,international shipping lines,k c discounts,0.92,0.71,0.29,0,0


## Sub-dataframe to see the output probabilities clearly

In [16]:
final_match[['entity_1','entity_2','Probability']]

Unnamed: 0,entity_1,entity_2,Probability
0,scherer automotive incorporated.,elektroshchit A.O,0.38
1,technical supplies international llc,c&t reinforcing steel,0.39
2,Campania Motori SRL,campania motori,0.65
3,rockford manufacturing group,Rockford Manufacturing Group Inc,0.81
4,organik kimya netherlands,pro-line shipping )),0.45
...,...,...,...
99995,hazle auto parts,ferlog nsorzio stabile,0.19
99996,schlueter a.i.r Co.nditioning and refrigeration,Cherubini France,0.19
99997,shanghai rongtai health,ineos oxide LTD,0.27
99998,international shipping lines,k c discounts,0.29


## Evaluation

I obtained a classification report and a confusion matrix to obtain the F-1 score of the model, which is a good metric for the model.

In [17]:
print(classification_report(final_match['Tag'], final_match['Predicted']))

              precision    recall  f1-score   support

           0       0.91      0.85      0.88     58919
           1       0.80      0.88      0.84     41081

    accuracy                           0.86    100000
   macro avg       0.86      0.86      0.86    100000
weighted avg       0.87      0.86      0.86    100000



In [18]:
print(confusion_matrix(final_match['Tag'], final_match['Predicted']))

[[49850  9069]
 [ 4807 36274]]


## Suggestions for Production

Unlike this work which I trained on 100K samples, the corpus could be trained on a larger sample with a greater GPU capacity. Model could be further developed also by hyper-parameter tuning after a longer training. Model that is trained with a large corpus could be saved and new entities could be scored.

In [2]:
import pandas as pd
data = pd.read_csv('ds_challenge_alpas.csv')
data

Unnamed: 0.1,Unnamed: 0,entity_1,entity_2,tag
0,3137667,preciform A.B,Preciform AB,1
1,5515816,degener staplertechnik vertriebs-gmbh,Irshim,0
2,215797,Alltel South CaroliNA Inc,alltel south carolina INC.,1
3,1004621,cse Corporation,Cse Corp,1
4,1698689,Gruppo D Motors Srl,gruppo d motors Sociedad de Resposabilidad Lim...,1
...,...,...,...,...
7042841,5488853,danbury fair hyundai,alfredo's foreign cars,0
7042842,3885538,T B I T Tecnologia & Sistemas SA,t b i t tecnologia +,1
7042843,3803061,stereographics Corp),Stereographics Corp,1
7042844,3594810,Shanghai Mingdou Chemical CoLTD,shanghai mingdou chemical,1


In [4]:
data = data.sample(5000).reset_index(drop=True)
data.to_csv('ds_challgenge_5000.csv', index=False)