# **Libraries we need**

In [None]:
from google.colab import files #to upload the dataset on colab
import io

import pandas as pd #to manage the dataset
import numpy as np #just for the encoding part (np.int32)

from sklearn.preprocessing import OneHotEncoder #to transform the dataset (in such a way that it is more digestible for the classifier)
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier #the classifier
from sklearn.model_selection import train_test_split, GridSearchCV #to finish the classifier
from sklearn.metrics import accuracy_score, classification_report #to test the classifier

# **Uploading the dataset**

In [None]:
# Name the workbook is expected to have once it has been uploaded to Colab.
df_name = 'candidates_co_founders.xlsx'

# Opens Colab's upload widget; the result is indexed by file name and holds the
# raw bytes of each uploaded file.
uploaded = files.upload()

# The upload may be stored under a different name than expected (for instance
# when the cell is re-run and Colab appends a " (1)" suffix -- TODO confirm),
# so fall back to the only uploaded file instead of failing on the fixed key.
uploaded_name = df_name
if uploaded_name not in uploaded:
    if len(uploaded) != 1:
        raise KeyError(
            f"Expected an upload named {df_name!r}, got: {sorted(uploaded)}"
        )
    uploaded_name = next(iter(uploaded))

# Parse the in-memory bytes as an Excel workbook.
df = pd.read_excel(io.BytesIO(uploaded[uploaded_name]))

Saving candidates_co_founders.xlsx to candidates_co_founders.xlsx


In [None]:
df.head()  # preview the first five rows (the default) to eyeball the data

Unnamed: 0,temperament,skills,self_sufficiency,work_life_balance,lvl_collab_exp,emotional_buoyancy,emotional_lvl,confidence,personality_you_like,reppresentative_value,class
0,introverted,business,yes,3,3,4,3,4,visionary,confidence,like
1,introverted,programming,no,5,1,2,2,3,motivator,control,dislike
2,stable,sales,no,4,4,3,3,2,inspirational,tranquility,dislike
3,outgoing,research,no,5,5,4,3,4,artist,efficiency,dislike
4,stable,programming,yes,3,4,3,2,3,consolidator,indipendence,like


# **Preparing the dataset**

In [None]:
# Separate the target column ('class') from the feature columns.

y = df['class']
X = df.drop(columns=['class'])

In [None]:
# Find the categorical features.

# Integer dtype requested later for the one-hot encoded columns.
transf_dtype = np.int32

# Build the boolean mask from X itself: the mask and the Series it filters must
# share the same index. Using df.dtypes here would also carry the 'class' label
# column, which X no longer has.
feature_dtypes = X.dtypes
categorical_features = feature_dtypes.loc[feature_dtypes == 'object'].index.values

print(categorical_features)

['temperament' 'skills' 'self_sufficiency' 'personality_you_like'
 'reppresentative_value']


In [None]:
# Define the encoder.

# Options shared by both spellings of the "dense output" keyword below.
onehot_options = dict(handle_unknown='ignore', dtype=transf_dtype)

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name, so try the current keyword first and fall back for older releases.
try:
    categorical_transformer = OneHotEncoder(sparse_output=False, **onehot_options)
except TypeError:
    categorical_transformer = OneHotEncoder(sparse=False, **onehot_options)

# One-hot encode the categorical columns; every other column is passed through
# unchanged and appended after the encoded ones.
transformer = ColumnTransformer(transformers=[('cat', categorical_transformer, categorical_features)],
                                remainder='passthrough')

In [None]:
# Fit the column transformer on the feature frame and encode it in one call.

X_p = transformer.fit_transform(X)
print(y.shape)  # shape of the label Series

(30,)


In [None]:
# Wrap the encoded array in a DataFrame so the result can be inspected.

df_p = pd.DataFrame(data=X_p)
df_p.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,3,3,4,3,4
1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,5,1,2,2,3
2,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,4,4,3,3,2
3,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,5,5,4,3,4
4,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,3,4,3,2,3


In [None]:
# Split the dataset: hold out one randomly chosen row (used later as the
# "new profile") and keep the remaining rows for training/testing.

# np.random.randint excludes the upper bound, so passing the number of rows
# yields a valid position in [0, n_rows - 1]. (Using n_rows + 1 could return
# n_rows itself, which is out of bounds for X_p.)
index = np.random.randint(0, X_p.shape[0])  # this will be useful later
rand_instance = X_p[index]

# Remove the held-out row from the features ...
X2 = np.delete(X_p, index, axis=0)
# ... and the label at the same *position*, so X2 and y2 stay aligned even if
# y does not carry a default 0..n-1 index.
y2 = y.drop(index=y.index[index])


In [None]:
# Split features and labels with a fixed seed so the partition is repeatable.
split_parts = train_test_split(X2, y2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print(X_train.shape)

(21, 29)


# **Parameter tuning**

In [None]:
# Grid of hyper-parameters to be tested: k from 1 to 9 and three distance metrics.
# The metric must be spelled 'euclidean'; 'euclidian' is not a metric name
# scikit-learn recognises, so those candidates could never be selected.
tune_param_knn = [{'n_neighbors': list(range(1,10)),
                   'metric': ['euclidean', 'manhattan', 'chebyshev']}]

# Search for the best parameters according to 5-fold cross-validated accuracy.
clf = GridSearchCV(KNeighborsClassifier(), tune_param_knn,
                       cv = 5,
                       scoring = 'accuracy',
                       return_train_score = False,
                       n_jobs=2)

clf.fit(X_train, y_train)
print("K nearest Neighbors")
print("\nBest Parameters:", clf.best_params_) #which are the best param?

# Predict the held-out test split with the fitted search object.
predictions = clf.predict(X_test)
print("\n")
print(classification_report(y_test, predictions)) #some additional measurements of classification skills

K nearest Neighbors

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 5}


              precision    recall  f1-score   support

     dislike       0.50      1.00      0.67         2
        like       1.00      0.67      0.80         6

    accuracy                           0.75         8
   macro avg       0.75      0.83      0.73         8
weighted avg       0.88      0.75      0.77         8



# **Training and testing the classifier**

In [None]:
# k-NN is used because it is a simple, yet not an obvious, choice of classifier.
# Rebuild it with the hyper-parameters selected by the grid search.
best_params = clf.best_params_
knn = KNeighborsClassifier(n_neighbors=best_params['n_neighbors'],
                           metric=best_params['metric'])
knn.fit(X_train, y_train)

# Predictions on both splits.
knn_train_predictions = knn.predict(X_train)
knn_test_predictions = knn.predict(X_test)

# Accuracy as a percentage (accuracy is symmetric in its two arguments, so the
# documented (y_true, y_pred) order gives the same value).
knn_train_accuracy_score = 100 * accuracy_score(y_train, knn_train_predictions)
knn_test_accuracy_score = 100 * accuracy_score(y_test, knn_test_predictions)

print("KNN Train accuracy:", round(knn_train_accuracy_score, 2), "%")
print("KNN Test accuracy:", round(knn_test_accuracy_score, 2), "%")

KNN Train accuracy: 90.48 %
KNN Test accuracy: 75.0 %


# **Updating the dataset**

In [None]:
# The held-out profile becomes the new profile to be proposed; the classifier
# expects a 2-D input, so turn it into a single row with one column per feature.
new_instance = rand_instance.reshape(1, -1)
predicted_label = knn.predict(new_instance)
print(predicted_label)

['like']


In [None]:
# Suppose the feedback on the proposed profile is negative: the new entry is
# added back to the dataset so that the next prediction can be more precise.
X3 = np.vstack([X2, new_instance])
print(X3.shape)

(30, 29)


In [None]:
# Work on a copy so y2 is left untouched, then append the label that reflects
# the negative feedback under the new index label -1.
y3 = y2.copy(deep=True)
y3.loc[-1] = 'dislike'

In [None]:
# Sanity check: print the shape of the label Series after appending the new
# label (a one-element tuple holding its length).
print((len(y3),))

(30,)
