In [1]:
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, Normalizer

%matplotlib inline

In [2]:
# Where the input dataset lives: a file name inside the sibling "data"
# directory one level above the notebook's working directory.
input_filename = 'cleaned_ds.csv'
data_path = os.path.join(os.pardir, 'data')

In [3]:
# Load the whole input CSV into a DataFrame.
# NOTE(review): the truncated warning printed under this cell looks like a
# pandas warning raised while parsing (possibly a mixed-dtype column) —
# confirm, and consider passing explicit dtypes if so.
data = pd.read_csv(
    filepath_or_buffer=os.path.join(data_path, input_filename),
)

  interactivity=interactivity, compiler=compiler, result=result)


# Prepare the data
We can operate only on numeric features, so all the columns that contain non-numeric data types
must be excluded.

In [4]:
# Give the 'Unnamed: 0' column a meaningful name.
# Reassign instead of using inplace=True so the cell produces a new frame
# rather than mutating the loaded one behind the reader's back.
# NOTE(review): 'Unnamed: 0' is presumably the index written out with the
# CSV; the values are not guaranteed to match row positions.
data = data.rename(columns={'Unnamed: 0': 'id'})
data.head()

Unnamed: 0,id,organization_name,cb_rank_(company),number_of_founders,number_of_employees,number_of_funding_rounds,funding_status,last_funding_date,last_funding_amount_currency_(in_usd),last_funding_type,...,industries_group_14,industries_group_15,industries_group_16,n_city,n_country,vc_1,vc_2,vc_3,vc_4,vc_5
0,0,Jones Realty & Construction,1044008.0,0.0,0.0,0.0,0.0,1700-01-01,0.0,0.0,...,0.0,0.0,0.0,0,1.0,0,0,0,0,0
1,1,Kumatronik,1044013.0,0.0,0.0,0.0,0.0,1700-01-01,0.0,0.0,...,0.0,0.0,0.0,0,5.0,0,0,0,0,0
2,2,DOC Developments,1044017.0,0.0,8.0,0.0,0.0,1700-01-01,0.0,0.0,...,0.0,0.0,0.0,0,1.0,0,0,0,0,0
3,3,Peaksmart,1044021.0,0.0,0.0,0.0,0.0,1700-01-01,0.0,0.0,...,0.0,0.0,0.0,9,1.0,0,0,0,0,0
4,4,Reeky Studios,1044022.0,0.0,0.0,0.0,0.0,1700-01-01,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0,0,0,0,0


In [5]:
# Keep only the numeric columns and drop the 'id' column, which is an
# identifier rather than a feature.
numeric_features = (
    data
    .select_dtypes(include=np.number)
    .drop(columns=['id'])
)
numeric_features.head()

Unnamed: 0,cb_rank_(company),number_of_founders,number_of_employees,number_of_funding_rounds,funding_status,last_funding_amount_currency_(in_usd),last_funding_type,total_funding_amount_currency_(in_usd),estimated_revenue_range,operating_status,...,industries_group_14,industries_group_15,industries_group_16,n_city,n_country,vc_1,vc_2,vc_3,vc_4,vc_5
0,1044008.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,...,0.0,0.0,0.0,0,1.0,0,0,0,0,0
1,1044013.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,...,0.0,0.0,0.0,0,5.0,0,0,0,0,0
2,1044017.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,1.0,1,...,0.0,0.0,0.0,0,1.0,0,0,0,0,0
3,1044021.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,9,1.0,0,0,0,0,0
4,1044022.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,...,0.0,0.0,0.0,0,0.0,0,0,0,0,0


# Scale features
There are features that have very large values, like cb_rank, and features
that are usually very small, like number_of_founders. If they're not transformed to the same scale, then features with larger values
will become more important in the algorithm, which is unwanted. Otherwise we could just have deleted the small-value features in the first place.

There are many ways to scale the features; we're going to use a standard scaler, which subtracts the mean from every value of a given feature and divides the result by the feature's standard deviation ($f_i'=\frac{f_i-\bar{f}}{s}$, where $\bar{f}$ is the feature's mean and $s$ its standard deviation).

Other options (to try later):
1. Min-max scaler: it might be problematic with some features as it may result in relatively high values in columns like "number_of_employees" and relatively small values in columns like "cb_rank", so effectively it would be the reverse of the original problem.
2. Logarithmic transformation: worth trying later.

In [6]:
# Standardise every numeric column: fit the scaler on the features, then
# transform those same features with the fitted statistics.
scaled_features = StandardScaler().fit(numeric_features).transform(numeric_features)

# Sanity check: overall shape and the first scaled row.
print("Shape of the scaled features: ", scaled_features.shape)
print("First row of the scaled features: \n", scaled_features[0, :])

Shape of the scaled features:  (411467, 63)
First row of the scaled features: 
 [ 1.95614362 -0.65693171 -1.00710608 -0.52517028 -0.43883217 -0.03135911
 -0.5091063  -0.04617597 -0.65103946  0.14217953 -0.38333046 -0.37467808
  0.15703845 -0.01086806 -0.04657104 -0.49450212 -0.05886358 -0.28028349
 -0.5394926  -0.55436955 -0.37966568 -1.06947429 -0.23811637  0.28693072
 -0.64205063 -0.48300336  0.5868306  -0.20654907 -0.13495446 -0.08753467
 -0.05770729 -0.03679939 -0.02438801 -0.01574661 -0.0065416  -0.00490124
 -0.00372471 -0.00338577 -0.00245447 -0.00282897 -0.64205063 -0.48300336
  0.5868306  -0.20654907 -0.13495446 -0.08753467 -0.05770729 -0.03679939
 -0.02438801 -0.01574661 -0.0065416  -0.00490124 -0.00372471 -0.00338577
 -0.00245447 -0.00282897 -0.44007635 -0.56382626 -0.27844231 -0.21463957
 -0.18952615 -0.17029062 -0.1565148 ]


# Normalize feature vectors
To be able to analyse similarities between vectors using cosine similarity, we have to convert them to unit vectors (vectors of length 1).
Similarity is given by the equation: $\cos(\theta)=\frac{A\cdot B}{\|A\|\|B\|}$, so if the angle between two vectors is zero, meaning that they're perfectly aligned, the cosine of that angle will be 1. When vectors are orthogonal their similarity is 0. When they point in opposite directions (180 degrees) their similarity is -1, etc. Here's an interactive demo showing the cosine value of a given angle: https://www.mathsisfun.com/algebra/trig-interactive-unit-circle.html

Firstly we'll divide all the vectors by their $L_2$ norms ($\frac{A}{\|A\|}$). This way, when we later calculate a dot product between two vectors, the result will be the cosine of the angle between them: $\cos(\theta)=A\cdot B$ (this is the equation from the previous paragraph without the denominator, as the vectors were already divided by their norms).  
A video explaining dot product: https://www.youtube.com/watch?v=LyGKycYT2v0&list=PLZHQObOWTQDPD3MizzM2xVFitgF8hE_ab&index=9

In [7]:
# Rescale every row of the standardised matrix with sklearn's Normalizer
# (the notebook text describes this as dividing by the L2 norm).
normalized_features = Normalizer().fit(scaled_features).transform(scaled_features)

# Sanity check: overall shape and the first normalised row.
print("Shape of the normalized features: ", normalized_features.shape)
print("First row of the normalized features: \n", normalized_features[0, :])

Shape of the normalized features:  (411467, 63)
First row of the normalized features: 
 [ 0.56549324 -0.18990959 -0.29114001 -0.15181924 -0.12686013 -0.00906547
 -0.14717537 -0.01334881 -0.18820623  0.04110208 -0.11081537 -0.10831409
  0.04539758 -0.0031418  -0.01346302 -0.14295351 -0.01701662 -0.08102596
 -0.15595962 -0.16026033 -0.10975593 -0.30916977 -0.06883605  0.08294758
 -0.18560768 -0.13962939  0.16964436 -0.05971039 -0.03901341 -0.02530502
 -0.01668235 -0.01063818 -0.00705023 -0.00455212 -0.00189108 -0.00141688
 -0.00107676 -0.00097878 -0.00070955 -0.00081781 -0.18560768 -0.13962939
  0.16964436 -0.05971039 -0.03901341 -0.02530502 -0.01668235 -0.01063818
 -0.00705023 -0.00455212 -0.00189108 -0.00141688 -0.00107676 -0.00097878
 -0.00070955 -0.00081781 -0.1272198  -0.16299413 -0.0804937  -0.06204924
 -0.05478931 -0.04922859 -0.0452462 ]


# Calculating similarities
A single dot product compares only 2 vectors. We're going to use its extended version (`np.inner`), which allows comparing a vector with a whole matrix of vectors at once.

In [8]:
query = 'Kumatronik'

# Locate the query by its row POSITION in `data`, not by the value of the
# 'id' column. The feature matrices are positional (row i of
# normalized_features corresponds to data.iloc[i]), while 'id' values do not
# always equal the row position (e.g. the row at position 22374 has id 23271),
# so indexing the matrix with an 'id' value can select the wrong company.
matching_positions = np.flatnonzero(
    (data['organization_name'] == query).to_numpy()
)
if matching_positions.size == 0:
    # Same exception type the original lookup raised, with a useful message.
    raise IndexError(f"No company named {query!r} found in the dataset")

# If several companies share the name, the first occurrence is used.
query_index = matching_positions[0]
print(f"We're looking for companies similar to {query} which is in our DB at index {query_index}")

We're looking for companies similar to Kumatronik which is in our DB at index 1


In [9]:
# Feature vector of the query company: one full row of the normalised matrix.
query_vector = normalized_features[query_index, :]

In [10]:
# Inner product of the query vector with every row of normalized_features,
# giving one similarity score per row. With unit-length rows (see the
# normalisation step earlier in the notebook) each score is the cosine of the
# angle between that row and the query vector.
similarities = np.inner(query_vector, normalized_features)

In [11]:
# Row positions ordered from most to least similar: argsort yields ascending
# order, so the result is reversed to put the highest similarity first.
sorted_indices = similarities.argsort()[::-1]

In [12]:
# Display the ten best matches (all columns). The query company itself is
# expected in first place, since a vector is most similar to itself.
data.iloc[sorted_indices[:10], :]

Unnamed: 0,id,organization_name,cb_rank_(company),number_of_founders,number_of_employees,number_of_funding_rounds,funding_status,last_funding_date,last_funding_amount_currency_(in_usd),last_funding_type,...,industries_group_14,industries_group_15,industries_group_16,n_city,n_country,vc_1,vc_2,vc_3,vc_4,vc_5
1,1,Kumatronik,1044013.0,0.0,0.0,0.0,0.0,1700-01-01,0.0,0.0,...,0.0,0.0,0.0,0,5.0,0,0,0,0,0
148,148,Tritec GmbH,1045066.0,0.0,0.0,0.0,0.0,1700-01-01,0.0,0.0,...,0.0,0.0,0.0,0,5.0,0,0,0,0,0
22374,23271,Best-Computer Support,1034554.0,0.0,0.0,0.0,0.0,1700-01-01,0.0,0.0,...,0.0,0.0,0.0,0,5.0,0,0,0,0,0
182748,184542,Connectlounge,1047557.0,0.0,0.0,0.0,0.0,1700-01-01,0.0,0.0,...,0.0,0.0,0.0,0,5.0,0,0,0,0,0
112988,113906,Netzwerk Beratung Informationssysteme Duisburg,1061969.0,0.0,0.0,0.0,0.0,1700-01-01,0.0,0.0,...,0.0,0.0,0.0,0,5.0,0,0,0,0,0
34422,35322,Explainity,1019575.0,0.0,0.0,0.0,0.0,1700-01-01,0.0,0.0,...,0.0,0.0,0.0,0,5.0,0,0,0,0,0
37112,38012,SNP Transformations Deutschland,1030655.0,0.0,0.0,0.0,0.0,1700-01-01,0.0,0.0,...,0.0,0.0,0.0,0,5.0,0,0,0,0,0
113051,113969,Hermann Stetter,1062797.0,0.0,0.0,0.0,0.0,1700-01-01,0.0,0.0,...,0.0,0.0,0.0,0,5.0,0,0,0,0,0
13442,13442,BASF - Global Industrial Coatings Business,1042075.0,0.0,0.0,0.0,0.0,1700-01-01,0.0,0.0,...,0.0,0.0,0.0,0,5.0,0,0,0,0,0
113142,114060,Sipnetworks.de,1063844.0,0.0,0.0,0.0,0.0,1700-01-01,0.0,0.0,...,0.0,0.0,0.0,0,5.0,0,0,0,0,0


# Explainability
To find out which features of two vectors are closest to each other, we can simply calculate the difference between them.
Here's where the zero feature values may affect the results. Maybe we should only show features that are not zeros, but it's risky.

In [13]:
# Pick one company at random to compare against the query.
# A seeded generator is used so that Restart & Run All reproduces the same
# comparison (and therefore the same feature ranking printed below); change
# the seed to explore a different company.
rng = np.random.default_rng(42)

query_vector = normalized_features[query_index]
some_other_vector = normalized_features[rng.choice(sorted_indices)]

In [14]:
# Per-feature absolute gap between the two vectors; a smaller value means the
# two companies are closer on that feature.
differences = abs(some_other_vector - query_vector)

In [15]:
# Feature positions ordered from the smallest gap to the largest.
sorted_differences_indices = differences.argsort()

In [16]:
# Five features with the smallest gaps (closest first) ...
most_similar = sorted_differences_indices[0:5]
# ... and five with the largest gaps (most different first).
least_similar = sorted_differences_indices[::-1][:5]

In [17]:
# Two-column report: the closest features on the left, the most different
# ones on the right. The left column is padded to 30 characters.
print('Top most and least similar features')
print('Similar:'.ljust(30), 'Different:')
for close_idx, far_idx in zip(most_similar, least_similar):
    close_name = numeric_features.columns[close_idx]
    far_name = numeric_features.columns[far_idx]
    print(close_name.ljust(30), far_name)

Top most and least similar features
Similar:                       Different:
industries_15                  industries_4
industries_group_15            industries_group_4
industries_group_16            builtwith_-_active_tech_count
industries_16                  cb_rank_(company)
industries_group_14            industries_group_3
