# Notebook 6 - process_predictor_function breakdown
This notebook contains a step-by-step breakdown of the process_predictor_function. Imports for the function are present in this notebook, but have been moved to the bottom for ease of understanding the process.

In [None]:
#pip installations - necessary to get notebook to run
#update dask
!pip install --upgrade pip
!pip install dask==2.4.8
!pip install fsspec
!pip install --upgrade s3fs
!pip install numpy
!pip install pymystem3
!pip install spacy
!pip install joblib
!pip install pymorphy2==0.8
!pip install dask_ml

In [None]:
# IMPORTS

# dataframe
import dask.dataframe as dd
import pandas as pd

# DESCRIPTION_GOOD preprocessing
import nltk
nltk.download("stopwords")
#--------#
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation


# machine learning/analysis
import dask_ml.cluster as dask_ml_model # sklearn's skmeans took up too much memory to run.

# measuring euclidian distance
from scipy.spatial.distance import euclidean, pdist

# S3 bucket interaction
import tempfile
import boto3
import joblib

# Disable warning message related to SettingWithCopyWarning
# displays when running final function otherwise
pd.options.mode.chained_assignment = None     # default = 'warn'

In [None]:
# define stemmer and Russian stopwords for data preprocessing
mystem = Mystem()
russian_stopwords = stopwords.words("russian")
# https://stackoverflow.com/questions/5511708/adding-words-to-nltk-stoplist
# add trade-specific stopwords to list
# NOTE: every entry must be followed by a comma; adjacent string literals are
# silently concatenated by Python (previously '000' '1.27' became '0001.27',
# so neither '000' nor '1.27' was actually filtered).
newStopWords = ['г', '№', '10', '1', '20', '30', 'кг', '5', 'см',
                '100', '80', '2', 'х', 'l', 'м', '00', '000',
                '1.27', '2011.10631', '4', '12', '3', 'фр', 'количество',
                'становиться', 'мм', 'вид', 'упаковка', 'получать',
                'прочий', 'использование', 'масса', 'размер', 'черный',
                '6', '8', '7', '50', '40', '25', 'коробка', 'поддон',
                'вдоль', '250', '65', '85', '15', '35', '45',
                '55', '60', '70', '75', 'м3', '13', '0', '14',
                '16', '18', 'm2', 'п', 'р', 'т', 'тип', 'являться',
                'cm', 'm', '01', '02', '03', '04', '05',
                '06', '07', '08', '09', '24', '27']
russian_stopwords.extend(newStopWords)

#define function for preprocessing text - to be used later in notebook
#function will remove Russian stop words and any punctuation not removed in cleaning_trade_data_desc_kmeans.ipynb
def preprocess_text(text):
    """Lemmatize ``text`` and return the surviving lemmas joined by single spaces.

    The text is lower-cased and lemmatized with the module-level ``mystem``.
    A lemma is discarded when it is in ``russian_stopwords``, is exactly a
    single space, or (after stripping whitespace) is found in
    ``string.punctuation``.
    """
    kept = []
    for lemma in mystem.lemmatize(text.lower()):
        if lemma in russian_stopwords:
            continue
        if lemma == " ":
            continue
        # substring test against the punctuation string; an empty stripped
        # lemma (pure whitespace such as "\n") also matches and is dropped
        if lemma.strip() in punctuation:
            continue
        kept.append(lemma)
    return " ".join(kept)

# similarity function for euclidian measure at end of main function
def similarity_func(u, v):
    """Map the Euclidean distance between ``u`` and ``v`` to a similarity score.

    Returns ``1 / (1 + distance)``: 1.0 for identical vectors, approaching 0
    as the vectors move apart.
    """
    distance = euclidean(u, v)
    return 1 / (1 + distance)

In [4]:
# load model and vectorizer into the notebook from the S3 bucket
s3 = boto3.resource('s3')
bucket = s3.Bucket('labs20-arms-bucket')


def load_joblib_from_s3(bucket, key):
    """Download the object ``key`` from ``bucket`` and deserialize it with joblib.

    The object is streamed into an anonymous temporary file, which is closed
    (and therefore removed) as soon as the object has been loaded.

    NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    code; only use this with buckets/keys whose contents are trusted.
    """
    with tempfile.TemporaryFile() as fp:
        bucket.download_fileobj(Fileobj=fp, Key=key)
        fp.seek(0)  # rewind so joblib reads from the start of the download
        return joblib.load(fp)


# load vectorizer from S3 bucket
vectorizer = load_joblib_from_s3(bucket, "vectorizerf.pkl")

# load model from S3 bucket
model = load_joblib_from_s3(bucket, "modelf.pkl")

# load the cluster dataset from the S3 bucket, then discard its first column
# (described in the original notebook as an error column created on import)
clusters = pd.read_csv('s3://labs20-arms-bucket/data/armsclustersf.csv')
error_column = clusters.columns[0]
clusters = clusters.drop(columns=[error_column])

# list of known arms exporters (INNs kept as strings so leading zeros survive)
# Duplicate entries have been removed; membership checks are unaffected.
# NOTE(review): '17718852163' has 11 digits, unlike every other entry, and
# looks like '7718852163' with a stray leading '1' -- confirm and remove if so.
inn_arms_exp_total = ['7718852163',  '7740000090',  '7731084175',  '6161021690',
                      '3807002509',  '6672315362',  '7802375335',  '7813132895',
                      '7731280660',  '7303026762',  '5040007594',  '2501002394',
                      '7807343496',  '7731559044',  '5042126251',  '7731595540',
                      '7733018650',  '7722016820',  '7705654132',  '7714336520',
                      '7801074335',  '6229031754',  '7830002462',  '6825000757',
                      '5043000212',  '7802375889',  '5010031470',  '1660249187',
                      '7720015691',  '6154573235',  '5038087144',  '7713006304',
                      '7805326230',  '5023002050',  '4007017378',  '7714013456',
                      '17718852163', '7811406004',  '7702077840',  '7839395419',
                      '7702244226',  '7704721192',  '7731644035',  '7712040285',
                      '7811144648',  '4345047310',  '7720066255',  '6607000556',
                      '1832090230',  '1835011597',  '3305004083',  '4340000830',
                      '5074051432',  '1841015504',  '7105008338',  '7106002829',
                      '7704274402',  '5942400228',  '7105514574',  '5012039795',
                      '7714733528',  '3904065550',  '7805231691',  '7704859803',
                      '0273008320',  '2902059091',  '7805034277',  '7727692011',
                      '7733759899',  '6154028021',  '7328032711',  '2635002815',
                      '5040097816',  '5027033274',  '5250018433',  '5200000046',
                      '7743813961',  '7718016666',  '5047118550']

In [78]:
### TEST ###
# read df_trade_desc_processed_testIF2 for the function test;
# CONSIGNOR_INN is forced to str
df = pd.read_csv(
    's3://labs20-arms-bucket/data/df_trade_desc_processed_testIF2.csv',
    dtype={'CONSIGNOR_INN': 'str'},
)

In [79]:
# keep only the first 75,000 rows for the walkthrough (less CPU required)
df = df.iloc[:75000]

# Beginning of Breakdown

In [84]:
# function variable defaults
# column holding the consignor (company) name
name_column = 'CONSIGNOR_NAME'
# column holding the consignor ID (INN)
id_column = 'CONSIGNOR_INN'
# column holding the free-text goods description that gets preprocessed
text_column = 'DESCRIPTION_GOOD'
# ID values treated as invalid when building the name -> ID dictionary
invalid_id_terms = ['None', '00', 'ИНН/КПП НЕ О', '0']
# IDs with fewer rows (trades) than this are dropped before processing
min_trades=40
# minimum pdist_score an ID needs to be kept in the final dataframe
profile_similarity_threshold = .75
# cluster share columns used when comparing against the known-exporter profile
cluster_columns = ['clust0', 'clust1', 'clust6']

In [85]:
# keep only the name, ID and description columns
kept_columns = [name_column, id_column, text_column]
df = df[kept_columns]
df.head()

Unnamed: 0,CONSIGNOR_NAME,CONSIGNOR_INN,DESCRIPTION_GOOD
0,ООО МАГНАТ,3808198484,ЛЕСОМАТЕРИАЛЫРАСПИЛЕННЫЕ ВДОЛЬ Х/П ЛИСТВЕННИЦА...
1,ООО ТК ВЕСТА,2465310231,ПИЛОМАТЕРИАЛЫ Х/П ЕЛЬ СИБИРСКАЯ PICEA OBOVATA ...
2,ООО ТЕХНОНИКОЛЬ - СТРОИТЕЛЬНЫЕ СИСТЕМЫ,7702521529,ТЕПЛОИЗОЛЯЦИОННЫЕ ПЛИТЫ ПОРИСТЫЕ ИЗ ЭКСТРУЗИОН...
3,ЗАО ЭНЕРГОСТРОЙМОНТАЖ,7813112708,КЛАПАНЫ ОБРАТНЫЕ ПОВОРОТНЫЕ ОДНОДИСКОВЫЕ ИЗГОТ...
4,ООО КУПИШУЗ,7705935687,РУБАШКА МУЖСКАЯ ШЕРСТЯНАЯ ТРИКОТАЖНАЯ НЕ КЛАСС...


In [86]:
df.shape

(75000, 3)

In [87]:
df['CONSIGNOR_INN'].nunique()

9691

In [88]:
# remove rows whose ID appears in the list of known arms exporters
is_known_exporter = df[id_column].isin(inn_arms_exp_total)
df = df[~is_known_exporter]
df.shape

(74303, 3)

In [89]:
df['CONSIGNOR_INN'].nunique()

9662

In [90]:
# clean INNs
# take the name and ID columns as the basis for a name -> ID dictionary
dict_df = df.loc[:, [name_column, id_column]]
dict_df.head()

Unnamed: 0,CONSIGNOR_NAME,CONSIGNOR_INN
0,ООО МАГНАТ,3808198484
1,ООО ТК ВЕСТА,2465310231
2,ООО ТЕХНОНИКОЛЬ - СТРОИТЕЛЬНЫЕ СИСТЕМЫ,7702521529
3,ЗАО ЭНЕРГОСТРОЙМОНТАЖ,7813112708
4,ООО КУПИШУЗ,7705935687


In [91]:
dict_df.shape

(74303, 2)

### All invalid terms had already been removed from this sample
The shape is therefore unchanged below; in practice, this step limits the size of the dictionary.

In [92]:
# remove rows of dict_df whose ID is one of the invalid_id_terms
has_valid_id = ~dict_df[id_column].isin(invalid_id_terms)
dict_df = dict_df[has_valid_id]

dict_df.shape

(74303, 2)

In [93]:
# drop nulls, sort by name, then keep only the first row for each name
# (keep='first' retains one row per name; it does not drop every duplicate)
dict_df = (
    dict_df
    .dropna()
    .sort_values(name_column)
    .drop_duplicates(subset=name_column, keep='first')
)

dict_df.shape

(10992, 2)

In [94]:
# one 2-item list per row of dict_df: [name, ID]
new_list = [list(pair) for pair in dict_df.itertuples(index=False, name=None)]
new_list[:2]

[['000ТОРГОВЫЙ ДОМ СТРАЖ-НЕВА', '7804354122'],
 ['025374 27 05 ИВАНОВ НИКОЛАЙ АЛЕКСЕЕВИЧ', '391400675301']]

In [95]:
# build a name -> ID dictionary from the list of [name, ID] pairs:
# the first item of each pair becomes the key, the second the value
new_dict = dict(new_list)
next(iter(new_dict.items()))

('000ТОРГОВЫЙ ДОМ СТРАЖ-НЕВА', '7804354122')

In [96]:
# overwrite the ID column with the ID looked up from each row's name
mapped_ids = df[name_column].map(new_dict)
df[id_column] = mapped_ids
df.nunique()

CONSIGNOR_NAME      10992
CONSIGNOR_INN        9662
DESCRIPTION_GOOD    58572
dtype: int64

In [97]:
df['CONSIGNOR_INN'].nunique()

9662

In [98]:
# drop rows containing any null value
df = df.dropna()
df.shape

(74303, 3)

In [99]:
# keep only rows whose ID occurs at least min_trades times in the dataset;
# limits size before processing by weeding out IDs with only a few trades
trades_per_id = df.groupby(id_column)[id_column].transform('size')
df = df[trades_per_id >= min_trades]
df.shape

(39142, 3)

In [103]:
# make sure the min_trades threshold is enforced:
# the 20 smallest per-ID trade counts should all be >= min_trades
df[id_column].value_counts().tail(20)

2460251293      37
6149019455      37
3849026861      37
2710000425      37
381700387428    36
3816005739      36
2311222673      36
3808156861      36
0917029272      36
6658320970      36
7736622821      36
2704011651      35
7116000066      35
6345002063      35
9909136710      35
3508001289      35
3906331977      35
2465310231      35
6227009062      35
7733646084      35
Name: CONSIGNOR_INN, dtype: int64

In [38]:
# apply the preprocessing function to every value of the text column.
# Non-string values are converted with str(); the previous x.astype(str) only
# exists on numpy objects and raised AttributeError for plain Python values.
processed_text_list = [
    preprocess_text(value if isinstance(value, str) else str(value))
    for value in df[text_column]
]

In [39]:
# wrap the preprocessed text in a single-column dataframe,
# to be concatenated onto the original dataframe
df1 = pd.DataFrame(processed_text_list, columns=['PREPROCESSED_TEXT'])
df1.head()

Unnamed: 0,PREPROCESSED_TEXT
0,теплоизоляционный плита пористый экструзионный...
1,рубашка мужской шерстяной трикотажный класс лю...
2,пиловочник неокоренный хвойный порода бревно ч...
3,вентилятор осевой охлаждение техника гражда пр...
4,верхний одежда трикотажный хлопчатобумажный кл...


In [40]:
# compared to nonpreprocessed text
df['DESCRIPTION_GOOD'].head()

2     ТЕПЛОИЗОЛЯЦИОННЫЕ ПЛИТЫ ПОРИСТЫЕ ИЗ ЭКСТРУЗИОН...
4     РУБАШКА МУЖСКАЯ ШЕРСТЯНАЯ ТРИКОТАЖНАЯ НЕ КЛАСС...
9     ПИЛОВОЧНИК НЕОКОРЕННЫЙ ХВОЙНЫХ ПОРОД БРЕВНА БЕ...
10    ВЕНТИЛЯТОР ОСЕВОЙ ДЛЯ ОХЛАЖДЕНИЯ ТЕХНИКИ ГРАЖД...
12    ВЕРХНЯЯ ОДЕЖДА ТРИКОТАЖНАЯ ИЗ ХЛОПЧАТОБУМАЖНОЙ...
Name: DESCRIPTION_GOOD, dtype: object

In [41]:
# give both frames a fresh 0..n-1 index (and an 'index' column holding it)
# so the column-wise concat below lines rows up by position
df1 = df1.reset_index()
df = df.reset_index().assign(index=lambda frame: frame.index)

# attach the preprocessed text to the original dataframe
df_merge = pd.concat([df, df1], axis=1, join='inner')
df_merge.head()

Unnamed: 0,index,CONSIGNOR_NAME,CONSIGNOR_INN,DESCRIPTION_GOOD,index.1,PREPROCESSED_TEXT
0,0,ООО ТЕХНОНИКОЛЬ - СТРОИТЕЛЬНЫЕ СИСТЕМЫ,7702521529,ТЕПЛОИЗОЛЯЦИОННЫЕ ПЛИТЫ ПОРИСТЫЕ ИЗ ЭКСТРУЗИОН...,0,теплоизоляционный плита пористый экструзионный...
1,1,ООО КУПИШУЗ,7705935687,РУБАШКА МУЖСКАЯ ШЕРСТЯНАЯ ТРИКОТАЖНАЯ НЕ КЛАСС...,1,рубашка мужской шерстяной трикотажный класс лю...
2,2,АО ГРУППА ИЛИМ,7840346335,ПИЛОВОЧНИК НЕОКОРЕННЫЙ ХВОЙНЫХ ПОРОД БРЕВНА БЕ...,2,пиловочник неокоренный хвойный порода бревно ч...
3,3,ООО САМСУНГ ЭЛЕКТРОНИКС РУС КОМПАНИ,7703608910,ВЕНТИЛЯТОР ОСЕВОЙ ДЛЯ ОХЛАЖДЕНИЯ ТЕХНИКИ ГРАЖД...,3,вентилятор осевой охлаждение техника гражда пр...
4,4,ООО КУПИШУЗ,7705935687,ВЕРХНЯЯ ОДЕЖДА ТРИКОТАЖНАЯ ИЗ ХЛОПЧАТОБУМАЖНОЙ...,4,верхний одежда трикотажный хлопчатобумажный кл...


In [42]:
# the raw text column and the helper 'index' columns are no longer needed
# now that PREPROCESSED_TEXT is present
df_merge = df_merge.drop(columns=[text_column, 'index'])
df_merge.head()

Unnamed: 0,CONSIGNOR_NAME,CONSIGNOR_INN,PREPROCESSED_TEXT
0,ООО ТЕХНОНИКОЛЬ - СТРОИТЕЛЬНЫЕ СИСТЕМЫ,7702521529,теплоизоляционный плита пористый экструзионный...
1,ООО КУПИШУЗ,7705935687,рубашка мужской шерстяной трикотажный класс лю...
2,АО ГРУППА ИЛИМ,7840346335,пиловочник неокоренный хвойный порода бревно ч...
3,ООО САМСУНГ ЭЛЕКТРОНИКС РУС КОМПАНИ,7703608910,вентилятор осевой охлаждение техника гражда пр...
4,ООО КУПИШУЗ,7705935687,верхний одежда трикотажный хлопчатобумажный кл...


In [43]:
# the preprocessed text column is what gets fed to the TFIDF vectorizer
text = df_merge.loc[:, 'PREPROCESSED_TEXT']
text.head(2)

0    теплоизоляционный плита пористый экструзионный...
1    рубашка мужской шерстяной трикотажный класс лю...
Name: PREPROCESSED_TEXT, dtype: object

In [44]:
# transform text with the vectorizer.
# The values are cast to unicode strings first; per the original note,
# the transform otherwise runs into an np.nan error.
text_as_unicode = text.values.astype('U')
sparse = vectorizer.transform(text_as_unicode)

In [45]:
sparse

<38827x301 sparse matrix of type '<class 'numpy.float64'>'
	with 255486 stored elements in Compressed Sparse Row format>

In [46]:
# Get feature names to use as dataframe column headers.
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever the loaded vectorizer provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# toarray() yields a plain ndarray (todense() returns a numpy matrix)
dtm = pd.DataFrame(sparse.toarray(), columns=feature_names)
dtm.head()

Unnamed: 0,00,10,11,27,848686,88104см,90,946288,946388,abies,...,черновой,швейный,шина,шип,шлифовать,шт,электрический,элемент,этиловый,этиловый спирт
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.227187,0.0,...,0.233924,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# give both frames a fresh 0..n-1 index plus an 'index' column holding it;
# concat(axis=1) aligns rows on index labels, so both frames must share them
dtm = dtm.reset_index().assign(index=lambda frame: frame.index)
df_merge = df_merge.reset_index().assign(index=lambda frame: frame.index)

In [48]:
# place the vectorized word-feature matrix alongside the dataset columns
frames_to_join = [df_merge, dtm]
df_merge_vector = pd.concat(frames_to_join, axis='columns', join='inner')
df_merge_vector.head()

Unnamed: 0,index,CONSIGNOR_NAME,CONSIGNOR_INN,PREPROCESSED_TEXT,index.1,00,10,11,27,848686,...,черновой,швейный,шина,шип,шлифовать,шт,электрический,элемент,этиловый,этиловый спирт
0,0,ООО ТЕХНОНИКОЛЬ - СТРОИТЕЛЬНЫЕ СИСТЕМЫ,7702521529,теплоизоляционный плита пористый экструзионный...,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,ООО КУПИШУЗ,7705935687,рубашка мужской шерстяной трикотажный класс лю...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,АО ГРУППА ИЛИМ,7840346335,пиловочник неокоренный хвойный порода бревно ч...,2,0.0,0.0,0.0,0.0,0.0,...,0.233924,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,ООО САМСУНГ ЭЛЕКТРОНИКС РУС КОМПАНИ,7703608910,вентилятор осевой охлаждение техника гражда пр...,3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,ООО КУПИШУЗ,7705935687,верхний одежда трикотажный хлопчатобумажный кл...,4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# drop the helper 'index' columns carried over from both frames
df_merge_vector = df_merge_vector.drop('index', axis='columns')
df_merge_vector.head()

Unnamed: 0,CONSIGNOR_NAME,CONSIGNOR_INN,PREPROCESSED_TEXT,00,10,11,27,848686,88104см,90,...,черновой,швейный,шина,шип,шлифовать,шт,электрический,элемент,этиловый,этиловый спирт
0,ООО ТЕХНОНИКОЛЬ - СТРОИТЕЛЬНЫЕ СИСТЕМЫ,7702521529,теплоизоляционный плита пористый экструзионный...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ООО КУПИШУЗ,7705935687,рубашка мужской шерстяной трикотажный класс лю...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,АО ГРУППА ИЛИМ,7840346335,пиловочник неокоренный хвойный порода бревно ч...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.233924,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ООО САМСУНГ ЭЛЕКТРОНИКС РУС КОМПАНИ,7703608910,вентилятор осевой охлаждение техника гражда пр...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ООО КУПИШУЗ,7705935687,верхний одежда трикотажный хлопчатобумажный кл...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# feature matrix for the KMeans model: every column except the name, ID
# and preprocessed-text columns (i.e. the vectorized word features)
non_feature_columns = [name_column, id_column, 'PREPROCESSED_TEXT']
X = df_merge_vector.drop(non_feature_columns, axis='columns')
X.head()

Unnamed: 0,00,10,11,27,848686,88104см,90,946288,946388,abies,...,черновой,швейный,шина,шип,шлифовать,шт,электрический,элемент,этиловый,этиловый спирт
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.227187,0.0,...,0.233924,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# the KMeans model is fed a plain array rather than a dataframe
X_array = X.to_numpy()
X_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [53]:
# predict a cluster label for each row of the vectorized word array
# (the loaded model is only used for prediction here; it is not re-fitted)
labels = model.predict(X_array)

In [54]:
# glue the predicted labels back onto the data as a 'cluster' column
df_merge_vector = df_merge_vector.assign(cluster=labels)
# check last column
df_merge_vector.head()

Unnamed: 0,CONSIGNOR_NAME,CONSIGNOR_INN,PREPROCESSED_TEXT,00,10,11,27,848686,88104см,90,...,швейный,шина,шип,шлифовать,шт,электрический,элемент,этиловый,этиловый спирт,cluster
0,ООО ТЕХНОНИКОЛЬ - СТРОИТЕЛЬНЫЕ СИСТЕМЫ,7702521529,теплоизоляционный плита пористый экструзионный...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,ООО КУПИШУЗ,7705935687,рубашка мужской шерстяной трикотажный класс лю...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2,АО ГРУППА ИЛИМ,7840346335,пиловочник неокоренный хвойный порода бревно ч...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,ООО САМСУНГ ЭЛЕКТРОНИКС РУС КОМПАНИ,7703608910,вентилятор осевой охлаждение техника гражда пр...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,ООО КУПИШУЗ,7705935687,верхний одежда трикотажный хлопчатобумажный кл...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


In [55]:
# extract columns for final analysis.
# .copy() makes Y an independent dataframe so the column assignments in the
# following cells do not write into a slice of df_merge_vector.
Y = df_merge_vector[[id_column, 'cluster']].copy()
Y.head()

Unnamed: 0,CONSIGNOR_INN,cluster
0,7702521529,1
1,7705935687,4
2,7840346335,0
3,7703608910,0
4,7705935687,4


In [56]:
# copy the cluster label into one column per cluster (clust0, clust1, ...)
for cluster_id in range(model.n_clusters):
    Y['clust' + str(cluster_id)] = Y['cluster']

Y.head()

Unnamed: 0,CONSIGNOR_INN,cluster,clust0,clust1,clust2,clust3,clust4,clust5,clust6,clust7,clust8,clust9
0,7702521529,1,1,1,1,1,1,1,1,1,1,1
1,7705935687,4,4,4,4,4,4,4,4,4,4,4
2,7840346335,0,0,0,0,0,0,0,0,0,0,0
3,7703608910,0,0,0,0,0,0,0,0,0,0,0
4,7705935687,4,4,4,4,4,4,4,4,4,4,4


In [57]:
# turn each clustN column into a 1/0 flag: 1 where the label equals N
for cluster_id in range(model.n_clusters):
    flag_column = 'clust' + str(cluster_id)
    Y[flag_column] = Y[flag_column].eq(cluster_id) * 1

Y.head()

Unnamed: 0,CONSIGNOR_INN,cluster,clust0,clust1,clust2,clust3,clust4,clust5,clust6,clust7,clust8,clust9
0,7702521529,1,0,1,0,0,0,0,0,0,0,0
1,7705935687,4,0,0,0,0,1,0,0,0,0,0
2,7840346335,0,1,0,0,0,0,0,0,0,0,0
3,7703608910,0,1,0,0,0,0,0,0,0,0,0
4,7705935687,4,0,0,0,0,1,0,0,0,0,0


In [58]:
# the 'cluster' label column is no longer needed now that each cluster
# has its own 1/0 flag column
Y = Y.drop('cluster', axis='columns')
Y.head()

Unnamed: 0,CONSIGNOR_INN,clust0,clust1,clust2,clust3,clust4,clust5,clust6,clust7,clust8,clust9
0,7702521529,0,1,0,0,0,0,0,0,0,0
1,7705935687,0,0,0,0,1,0,0,0,0,0
2,7840346335,1,0,0,0,0,0,0,0,0,0
3,7703608910,1,0,0,0,0,0,0,0,0,0
4,7705935687,0,0,0,0,1,0,0,0,0,0


In [59]:
# names of every column except the ID column, used to select the
# cluster columns in the groupby of the next step
column_names = Y.columns.drop(id_column).tolist()
column_names

['clust0',
 'clust1',
 'clust2',
 'clust3',
 'clust4',
 'clust5',
 'clust6',
 'clust7',
 'clust8',
 'clust9']

In [60]:
# total the cluster flags per ID: one row per ID, one trade count per cluster
Y = Y.groupby(id_column)[column_names].sum().reset_index()
Y.head()

Unnamed: 0,CONSIGNOR_INN,clust0,clust1,clust2,clust3,clust4,clust5,clust6,clust7,clust8,clust9
0,4629280000,2,1,0,0,1,0,1,0,0,38
1,9536290000,0,0,0,0,0,0,2,0,0,39
2,20342000000,3,1,0,0,0,0,3,0,0,77
3,266033300,25,0,16,0,0,0,10,0,0,0
4,266048970,6,0,84,0,0,0,1,0,0,0


In [61]:
# add the final tally for known arms exporters as the LAST row of Y
# (later cells rely on it being at the bottom).
# DataFrame.append was removed in pandas 2.0, so the tally row is turned into
# a one-row frame and concatenated; columns it lacks (the ID) become NaN.
control_row = clusters.iloc[0, 1:].to_frame().T.infer_objects()
Y = pd.concat([Y, control_row], sort=False).reset_index(drop=True)
Y.tail()

Unnamed: 0,CONSIGNOR_INN,clust0,clust1,clust2,clust3,clust4,clust5,clust6,clust7,clust8,clust9
325,8905039538.0,54,0,22,0,0,0,0,0,69,0
326,9705083098.0,0,0,82,0,0,0,0,0,0,0
327,9909012867.0,40,6,0,0,0,0,6,7,0,0
328,9909071164.0,46,2,3,0,0,0,4,3,0,0
329,,55151,9961,81,42,13,23,8723,1356,348,0


In [62]:
# express every cluster column as a fraction of that row's total
# (the ID column is left untouched)
row_totals = Y[column_names].sum(axis='columns')
Y[column_names] = Y[column_names].div(row_totals, axis='index')
Y.tail()

Unnamed: 0,CONSIGNOR_INN,clust0,clust1,clust2,clust3,clust4,clust5,clust6,clust7,clust8,clust9
325,8905039538.0,0.372414,0.0,0.151724,0.0,0.0,0.0,0.0,0.0,0.475862,0.0
326,9705083098.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
327,9909012867.0,0.677966,0.101695,0.0,0.0,0.0,0.0,0.101695,0.118644,0.0,0.0
328,9909071164.0,0.793103,0.034483,0.051724,0.0,0.0,0.0,0.068966,0.051724,0.0,0.0
329,,0.728566,0.131589,0.00107,0.000555,0.000172,0.000304,0.115234,0.017913,0.004597,0.0


In [64]:
# known arms exporters are found in clust0, clust1 and clust6;
# the other clusters are left out of the final prediction because they
# skew similarity scores upward
cluster_columns

['clust0', 'clust1', 'clust6']

In [65]:
# put the ID column in front of the cluster columns.
# Builds a new list instead of inserting in place, so re-running this cell
# does not add the ID column a second time.
cluster_columns = [id_column] + [col for col in cluster_columns if col != id_column]
cluster_columns

['CONSIGNOR_INN', 'clust0', 'clust1', 'clust6']

In [66]:
# keep only the ID column and the selected cluster columns
Y = Y.loc[:, cluster_columns]
Y.head()

Unnamed: 0,CONSIGNOR_INN,clust0,clust1,clust6
0,4629280000,0.046512,0.023256,0.023256
1,9536290000,0.0,0.0,0.04878
2,20342000000,0.035714,0.011905,0.035714
3,266033300,0.490196,0.0,0.196078
4,266048970,0.065934,0.0,0.010989


In [67]:
# similarity scores - one per row of Y, computed with pdist and the
# Euclidean-based similarity_func (how alike two sets of numbers are)
# https://stackoverflow.com/questions/35758612/most-efficient-way-to-construct-similarity-matrix
# every row is compared against the bottom row of the dataframe, which holds
# the totals for known arms exporters; column 0 (the ID) is excluded
pscores = [
    pdist([Y.iloc[-1, 1:], Y.iloc[row, 1:]], similarity_func)[0]
    for row in range(len(Y))
]

In [68]:
# attach the scores as a 'pdist_score' column;
# the bottom (known arms exporters) row scores 1.0 because it is compared
# with itself
Y = Y.assign(pdist_score=pscores)
Y.tail()

Unnamed: 0,CONSIGNOR_INN,clust0,clust1,clust6,pdist_score
325,8905039538.0,0.372414,0.0,0.0,0.715929
326,9705083098.0,0.0,0.0,0.0,0.571668
327,9909012867.0,0.677966,0.101695,0.101695,0.94312
328,9909071164.0,0.793103,0.034483,0.068966,0.888541
329,,0.728566,0.131589,0.115234,1.0


In [69]:
# remove the control row (known arms exporters totals), i.e. the last row
control_label = Y.index[-1]
Y = Y.drop(index=control_label)
Y.tail()

Unnamed: 0,CONSIGNOR_INN,clust0,clust1,clust6,pdist_score
324,7840346335,0.829082,0.056122,0.012755,0.860456
325,8905039538,0.372414,0.0,0.0,0.715929
326,9705083098,0.0,0.0,0.0,0.571668
327,9909012867,0.677966,0.101695,0.101695,0.94312
328,9909071164,0.793103,0.034483,0.068966,0.888541


In [70]:
# apply profile_similarity_threshold:
# IDs scoring at or above it stay in the final dataframe, the rest are dropped
meets_threshold = Y['pdist_score'] >= profile_similarity_threshold
Y = Y[meets_threshold]
Y.head()

Unnamed: 0,CONSIGNOR_INN,clust0,clust1,clust6,pdist_score
3,266033300,0.490196,0.0,0.196078,0.778799
5,268004714,0.661017,0.338983,0.0,0.802126
6,268008010,0.833333,0.0,0.166667,0.85042
7,32400023421,0.410256,0.282051,0.076923,0.738466
8,32500432033,0.453744,0.321586,0.07489,0.748205


In [74]:
# ten highest similarity scores, lowest of them first
Y.sort_values('pdist_score').tail(10)

Unnamed: 0,CONSIGNOR_INN,clust0,clust1,clust6,pdist_score
130,5032136476,0.707692,0.046154,0.184615,0.899264
152,5260900010,0.788321,0.167883,0.029197,0.900202
260,7703788638,0.745763,0.169492,0.016949,0.903559
56,2634079452,0.75,0.045455,0.159091,0.909914
287,7712040126,0.771208,0.056555,0.136247,0.918421
177,614500797306,0.772727,0.090909,0.060606,0.92492
150,5258130010,0.711111,0.066667,0.155556,0.927306
259,7703647595,0.763158,0.078947,0.105263,0.940049
327,9909012867,0.677966,0.101695,0.101695,0.94312
140,5075018950,0.732877,0.143836,0.10274,0.9823


In [71]:
Y.shape

(257, 5)

In [72]:
Y.nunique()

CONSIGNOR_INN    257
clust0           178
clust1           102
clust6            92
pdist_score      196
dtype: int64