In [24]:
from glob import glob
from os import path
import pandas as pd
from functools import reduce
from sklearn.neighbors import NearestNeighbors

# Import data

List all files to explore

In [25]:
glob(path.join('DB/institutes','*.csv'))

['DB/institutes\\institutes_academiainstitute.csv',
 'DB/institutes\\institutes_basicservicesupplierinstitute.csv',
 'DB/institutes\\institutes_financialinstitute.csv',
 'DB/institutes\\institutes_institute.csv',
 'DB/institutes\\institutes_investorinstitute.csv',
 'DB/institutes\\institutes_technicalassistanceinstitute.csv']

Ingest all files in the selected directory

In [26]:
file_names = glob(path.join('DB/institutes','*.csv'))
dfs = [pd.read_csv(fn) for fn in file_names]
institutes_df = pd.concat(dfs)

#print(institutes_df.info())

In [27]:
#print(list(institutes_df.columns.values))

We will export the DataFrame to have a look using Excel. 

In [28]:
#institutes_df.to_csv('institutes_df.csv')

According to this dataset, some fielda are not relevant to find the nearest neighbors (matches).
This fields are:
- id: this field is an identification number.
- website_url: this field is a webpage.
- date_of_establishment: this field is a date of establishment **might not be relevant. Or is it?**
- address_id: this field looks like a running number. **Is it relevant?**
- mission: this field is an open text. We could apply NLP to identify topics. **Maybe in a version 2 of the tool.**
- vision: this field is an open text. We could apply NLP to identify topics. **Maybe in a version 2 of the tool.**
- avatar: this field is an image.

# Build DataFrame with relevant features

In [29]:
institutes_df_short = institutes_df.drop(['website_url','date_of_establishment','address_id','mission','vision','avatar'], axis=1)

Replace all blancs (white spaces) with an empty string [ ]

In [30]:
institutes_df_short = institutes_df_short.replace(r'^\s*$', np.nan, regex=True)
#print(institutes_df_short.head())

Replace all empty strings {} with NaN

Let's check in Excel...

In [31]:
#institutes_df_short.to_csv('institutes_df_short.csv')

In [32]:
#print(institutes_df_short)

# Split DataFrame with features of type "object"

Let's separate the columns containing dtype == object to prepare the list to a standard form. This step will not generate yet an object recognized as a string.
Now, clean the strings from inconsistant characters such as brackets, quotatios, curly brackets, etc.

In [33]:
institutes_df_short = institutes_df_short.loc[:,institutes_df_short.dtypes==object].replace(np.nan, "[]",regex=True)
institutes_df_short = institutes_df_short.loc[:,institutes_df_short.dtypes==object].apply(lambda s:s.str.replace('"', "",regex=True))
institutes_df_short = institutes_df_short.loc[:,institutes_df_short.dtypes==object].apply(lambda s:s.str.replace('{', "[",regex=True))
institutes_df_short = institutes_df_short.loc[:,institutes_df_short.dtypes==object].apply(lambda s:s.str.replace('}', "]",regex=True))
institutes_df_short = institutes_df_short.loc[:,institutes_df_short.dtypes==object].apply(lambda s:s.str.replace(', ', '","',regex=True))
institutes_df_short = institutes_df_short.loc[:,institutes_df_short.dtypes==object].apply(lambda s:s.str.replace(',', '","',regex=True))
institutes_df_short = institutes_df_short.loc[:,institutes_df_short.dtypes==object].apply(lambda s:s.str.replace('[', '"',regex=True))
institutes_df_short = institutes_df_short.loc[:,institutes_df_short.dtypes==object].apply(lambda s:s.str.replace(']', '"',regex=True))
#institutes_df_short.head(20)

After this first step, our lists are finally recognized as such by Pandas. Still, we can not use the standard functions, because they are not made for list applications.

In [34]:
dfs = []
for index, column in enumerate(institutes_df_short):
    col_names = np.arange(0, len(pd.DataFrame(institutes_df_short.iloc[:,index].apply(lambda x: x[0:].split(',')).apply(pd.Series)).columns)).astype(str)
    df = institutes_df_short[column].apply(lambda x: x[0:].split(',')).apply(pd.Series).replace('""', np.nan,regex=True).reset_index(drop=True)
    df.columns = ["{}{}".format(column, i) for i in col_names]
    dfs.append(df)
#dfs

In [35]:
# to merge the array of DataFrames into a single Dataframe

institutes_df_short_o = reduce(lambda x,y: pd.merge(x,y, left_index=True, right_index=True), dfs)
#institutes_df_short_o

# Split DataFrame with features not of type "object"

Now, let's separate the columns containing dtype != object to include to the dataframe with the lists created in the step before.

In [36]:
institutes_df_short_no = institutes_df.loc[:,institutes_df.dtypes!=object]
#institutes_df_short_no.info()

# One hot encoding 

To devide the categorical data into dummy variables we need to convert only the variables of type 'object'

In [37]:
# Make a list of column names for variables of the type object
object_variables = list(institutes_df_short_o.columns)
#print(object_variables)

In [38]:
Cat_X = pd.get_dummies(institutes_df_short_o, columns=object_variables)

Let's check in Excel...

In [39]:
#Cat_X.to_csv('Cat_X.csv')

# Mergeing categorical and non-categorical DataFrames

In [40]:
not_object_variables = list(institutes_df_short_no.columns)
#print(not_object_variables)

In [41]:
# Saving the column id in a DataFrame
id_X = institutes_df_short_no['id']

In [42]:
# Redefining the DataFrame with non categorical values removing the column id
nonCat_X = institutes_df_short_no.drop(['id', 'mixmarket_profile', 'current_financial_needs','object_id', 'address_id', 'content_type_id'], axis=1)
#nonCat_X.to_csv('nonCat_X.csv')

In [43]:
X = reduce(lambda x,y: pd.merge(x,y, left_index=True, right_index=True), [Cat_X,nonCat_X]).replace(np.nan, 0)
#X.to_csv('X.csv')
#X

# Nearest neighbors KNN

In [44]:
# Finding the K nearest neighbors, where number of neighbors is selected in n_neighbors

nbrs = NearestNeighbors(n_neighbors=5, algorithm='auto').fit(X)
distances, indices = nbrs.kneighbors(X)

In [45]:
#indices.shape

In [47]:
np.savetxt("indices of closest neighbors.csv", indices, delimiter=",")
print(indices) # indices of closest neighbors

[[  5   1   2   0  12]
 [  5   1   2   0  12]
 [  5   1   2   0  12]
 [  3  73  15  96  88]
 [  4  28  22  10   1]
 [  5   1   2   0  12]
 [  8   7   6  11  40]
 [  8   7   6  11  40]
 [  8   7   6  11  40]
 [  9  52  39  81  44]
 [ 10  28  22  16   4]
 [  8   7   6  11  40]
 [ 12  13  14  17  32]
 [ 12  13  14  17  32]
 [ 12  13  14  17  32]
 [ 15  73  96   3  88]
 [ 16  10  22  28  13]
 [ 12  13  14  17  32]
 [ 23  18  20  19  60]
 [ 23  18  20  19  60]
 [ 23  18  20  19  60]
 [ 21  78  71  82  87]
 [ 22  10  28  16   4]
 [ 23  18  20  19  60]
 [ 29  25  26  24  36]
 [ 29  25  26  24  36]
 [ 29  25  26  24  36]
 [ 27  29  25  26  24]
 [ 28  10  22   4  16]
 [ 29  25  26  24  36]
 [ 32  35  31  30  34]
 [ 32  35  31  30  34]
 [ 32  35  31  30  34]
 [ 33  69  89 107  68]
 [ 32  35  31  30  34]
 [ 32  35  31  30  34]
 [ 36  37  38  40   7]
 [ 36  37  38  40   7]
 [ 36  37  38  40   7]
 [ 39  52  44   9  84]
 [ 36  37  38  40   7]
 [ 45  42  41  43  31]
 [ 45  42  41  43  31]
 [ 45  42  