# Project - Predict the wellbeing of shanghainese communities

# MACHINE LEARNING

In [11]:
import pandas as pd
import numpy as np

In [204]:
df_predict = pd.read_pickle("df_prediction.pkl")

In [205]:
df_predict.head()

Unnamed: 0,nb_transportation_scaled,nb_shopping_scaled,nb_restaurant_scaled,nb_scenicSpot_scaled,nb_stadiumAndGym_scaled,nb_mobike_scaled,green_space_scaled,happiness_equalCoff,happiness_clean,happiness_smell,happiness_noise,happiness_perso,happiness_other
0,-0.757151,-0.598362,-0.721806,-0.405934,-0.747588,-0.808407,0.265348,1.666667,2.428571,1.285714,1.285714,1.6,1.444444
1,-0.838055,-0.682104,-0.801698,-0.404075,-0.792245,-0.808407,0.265348,1.333333,2.095238,0.952381,0.952381,1.266667,1.111111
2,-0.905242,-0.69263,-0.810939,-0.427699,-0.807475,-0.808395,0.265348,1.666667,2.428571,1.285714,1.285714,1.6,1.444444
3,-0.911484,-0.694507,-0.783498,-0.381704,-0.775861,-0.808336,0.265348,1.333333,1.714286,1.142857,1.142857,1.3,1.222222
4,-0.905943,-0.6884,-0.805164,-0.421503,-0.800932,-0.80837,0.265348,1.0,1.571429,1.0,0.428571,1.1,0.666667


In [206]:
df_predict['happiness_clean'].describe()

count    129.000000
mean       2.062544
std        0.314851
min        1.380952
25%        1.836735
50%        2.035714
75%        2.285714
max        3.000000
Name: happiness_clean, dtype: float64

# 1. Classification

## a. Happiness when clean, smell, noise have the same coefficient

In [207]:
X = df_predict[['nb_transportation_scaled', 'nb_shopping_scaled', 'nb_restaurant_scaled', 'nb_scenicSpot_scaled', 'nb_stadiumAndGym_scaled', 'nb_mobike_scaled', 'green_space_scaled']] #features
Y_coeff = df_predict['happiness_equalCoff'] #target

In [208]:
Y_coeff.describe()

count    129.000000
mean       1.389260
std        0.183787
min        1.000000
25%        1.333333
50%        1.333333
75%        1.500000
max        1.666667
Name: happiness_equalCoff, dtype: float64

In [209]:
Z_coeff = pd.DataFrame.copy(Y_coeff)

In [210]:
Z_coeff = Y_coeff.apply(lambda x:"Very unhappy" if x < 1.165 else 
        "Unhappy" if (x >= 1.165 and x < 1.33) else
        "Happy" if (x >= 1.33 and x < 1.495) else
        "Very happy")

In [211]:
df_predict.insert(8, "class_samecoeff", Z_coeff)

In [212]:
df_predict.groupby('class_samecoeff').size()

class_samecoeff
Happy           52
Unhappy         21
Very happy      45
Very unhappy    11
dtype: int64

## b. Happiness when CLEAN has more importance than noise, smell

In [213]:
Y_clean = df_predict['happiness_clean'] #target

In [214]:
Y_clean.describe()

count    129.000000
mean       2.062544
std        0.314851
min        1.380952
25%        1.836735
50%        2.035714
75%        2.285714
max        3.000000
Name: happiness_clean, dtype: float64

In [215]:
Z_clean = pd.DataFrame.copy(Y_clean)
Z_clean = Y_clean.apply(lambda x:"Very unhappy" if x < 1.785714 else 
        "Unhappy" if (x >= 1.1785714 and x < 2.190476) else
        "Happy" if (x >= 2.190476 and x < 2.595238) else
        "Very happy")

In [216]:
df_predict.insert(10, "class_clean", Z_clean)

In [217]:
df_predict.groupby('class_clean').size()

class_clean
Happy           40
Unhappy         55
Very happy       5
Very unhappy    29
dtype: int64

## c. Happiness when SMELL has more importance than clean, noise

In [218]:
Y_smell = df_predict['happiness_smell'] #target

In [219]:
Y_smell.describe()

count    129.000000
mean       1.083657
std        0.171013
min        0.428571
25%        1.000000
50%        1.142857
75%        1.214286
max        1.285714
Name: happiness_smell, dtype: float64

In [220]:
print((Y_smell.max()-Y_smell.min())/4)
print(Y_smell.min() + ((Y_smell.max()-Y_smell.min())/4))
print(Y_smell.min() + ((Y_smell.max()-Y_smell.min())/4)*2)
print(Y_smell.min() + ((Y_smell.max()-Y_smell.min())/4)*3)

0.2142857142857143
0.6428571428571428
0.8571428571428572
1.0714285714285714


In [221]:
Z_smell = pd.DataFrame.copy(Y_smell)
Z_smell = Y_smell.apply(lambda x:"Very unhappy" if x < 0.6428571428571428 else 
        "Unhappy" if (x >= 0.6428571428571428 and x < 0.8571428571428572) else
        "Happy" if (x >= 0.8571428571428572 and x < 1.0714285714285714) else
        "Very happy")

In [222]:
df_predict.insert(12, "class_smell", Z_smell)

In [223]:
df_predict.groupby('class_smell').size()

class_smell
Happy           33
Unhappy         14
Very happy      80
Very unhappy     2
dtype: int64

## d. Happiness when NOISE has more importance than clean, smell

In [224]:
Y_noise = df_predict['happiness_noise'] #target

In [225]:
Y_noise.describe()

count    129.000000
mean       1.021580
std        0.234248
min        0.428571
25%        0.904762
50%        1.102041
75%        1.196429
max        1.285714
Name: happiness_noise, dtype: float64

In [226]:
print((Y_noise.max()-Y_noise.min())/4)
print(Y_noise.min() + ((Y_noise.max()-Y_noise.min())/4))
print(Y_noise.min() + ((Y_noise.max()-Y_noise.min())/4)*2)
print(Y_noise.min() + ((Y_noise.max()-Y_noise.min())/4)*3)

0.2142857142857143
0.6428571428571428
0.8571428571428572
1.0714285714285714


In [228]:
Z_noise = pd.DataFrame.copy(Y_noise)
Z_noise = Y_noise.apply(lambda x:"Very unhappy" if x < 0.6428571428571428 else 
        "Unhappy" if (x >= 0.6428571428571428 and x < 0.8571428571428572) else
        "Happy" if (x >= 0.8571428571428572 and x < 1.0714285714285714) else
        "Very happy")

In [229]:
df_predict.insert(14, "class_noise", Z_noise)

In [230]:
df_predict.groupby('class_noise').size()

class_noise
Happy           30
Unhappy         13
Very happy      73
Very unhappy    13
dtype: int64

## e. Happiness - personnal opinion

In [231]:
Y_perso = df_predict['happiness_perso'] #target

In [232]:
Y_perso.describe()

count    129.000000
mean       1.346643
std        0.177887
min        0.800000
25%        1.250000
50%        1.325000
75%        1.450000
max        1.700000
Name: happiness_perso, dtype: float64

In [233]:
print((Y_perso.max()-Y_perso.min())/4)
print(Y_perso.min() + ((Y_perso.max()-Y_perso.min())/4))
print(Y_perso.min() + ((Y_perso.max()-Y_perso.min())/4)*2)
print(Y_perso.min() + ((Y_perso.max()-Y_perso.min())/4)*3)

0.22499999999999998
1.025
1.25
1.475


In [235]:
Z_perso = pd.DataFrame.copy(Y_perso)
Z_perso = Y_perso.apply(lambda x:"Very unhappy" if x < 1.025 else 
        "Unhappy" if (x >= 1.025 and x < 1.25) else
        "Happy" if (x >= 1.25 and x < 1.475) else
        "Very happy")

In [236]:
df_predict.insert(16, "class_perso", Z_perso)

In [237]:
df_predict.groupby('class_perso').size()

class_perso
Happy           66
Unhappy         25
Very happy      32
Very unhappy     6
dtype: int64

## f. Happiness - arbitary choice

In [238]:
Y_other = df_predict['happiness_other'] #target

In [239]:
print((Y_other.max()-Y_other.min())/4)
print(Y_other.min() + ((Y_other.max()-Y_other.min())/4))
print(Y_other.min() + ((Y_other.max()-Y_other.min())/4)*2)
print(Y_other.min() + ((Y_other.max()-Y_other.min())/4)*3)

0.19444444444444445
0.861111111111111
1.0555555555555556
1.25


In [240]:
Z_other = pd.DataFrame.copy(Y_other)
Z_other = Y_other.apply(lambda x:"Very unhappy" if x < 0.861111111111111 else 
        "Unhappy" if (x >= 0.861111111111111 and x < 1.0555555555555556) else
        "Happy" if (x >= 1.0555555555555556 and x < 1.25) else
        "Very happy")

In [241]:
df_predict.insert(18, "class_other", Z_other)

In [242]:
df_predict.groupby('class_other').size()

class_other
Happy           53
Unhappy         16
Very happy      49
Very unhappy    11
dtype: int64

In [243]:
df_predict.head()

Unnamed: 0,nb_transportation_scaled,nb_shopping_scaled,nb_restaurant_scaled,nb_scenicSpot_scaled,nb_stadiumAndGym_scaled,nb_mobike_scaled,green_space_scaled,happiness_equalCoff,class_samecoeff,happiness_clean,class_clean,happiness_smell,class_smell,happiness_noise,class_noise,happiness_perso,class_perso,happiness_other,class_other
0,-0.757151,-0.598362,-0.721806,-0.405934,-0.747588,-0.808407,0.265348,1.666667,Very happy,2.428571,Happy,1.285714,Very happy,1.285714,Very happy,1.6,Very happy,1.444444,Very happy
1,-0.838055,-0.682104,-0.801698,-0.404075,-0.792245,-0.808407,0.265348,1.333333,Happy,2.095238,Unhappy,0.952381,Happy,0.952381,Happy,1.266667,Happy,1.111111,Happy
2,-0.905242,-0.69263,-0.810939,-0.427699,-0.807475,-0.808395,0.265348,1.666667,Very happy,2.428571,Happy,1.285714,Very happy,1.285714,Very happy,1.6,Very happy,1.444444,Very happy
3,-0.911484,-0.694507,-0.783498,-0.381704,-0.775861,-0.808336,0.265348,1.333333,Happy,1.714286,Very unhappy,1.142857,Very happy,1.142857,Very happy,1.3,Happy,1.222222,Happy
4,-0.905943,-0.6884,-0.805164,-0.421503,-0.800932,-0.80837,0.265348,1.0,Very unhappy,1.571429,Very unhappy,1.0,Happy,0.428571,Very unhappy,1.1,Unhappy,0.666667,Very unhappy


In [245]:
df_predict.to_pickle("df_prediction_classification.pkl")

# 2. Split data into training and test sets

In [343]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

## a. Target: Happiness_samecoeff

In [244]:
from sklearn.model_selection import train_test_split
X_train, X_test, Z_coeff_train, Z_coeff_test = train_test_split(X, Z_coeff, test_size=0.3)

### i. Decision Tree

In [283]:
param_grid = {'max_depth': [1,2,4,6,8,10,20,40,100]}
search = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv= ShuffleSplit(n_splits=5))
search.fit(X_train, Z_coeff_train)
print("Best parameters" + str(search.best_params_))

Best parameters{'max_depth': 4}


In [284]:
clf = tree.DecisionTreeClassifier(max_depth=search.best_params_['max_depth'])
clf = clf.fit(X_train, Z_coeff_train)
prediction = clf.predict(X_test)
np.mean(prediction == Z_coeff_test)

0.5128205128205128

In [285]:
print('Classification report: ')
print(classification_report(Z_coeff_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.53      0.89      0.67        18
     Unhappy       0.00      0.00      0.00         5
  Very happy       0.57      0.31      0.40        13
Very unhappy       0.00      0.00      0.00         3

    accuracy                           0.51        39
   macro avg       0.28      0.30      0.27        39
weighted avg       0.44      0.51      0.44        39



In [312]:
scores = cross_val_score(clf, X, Z_coeff, cv=ShuffleSplit(n_splits=5))
np.mean(scores)

0.3846153846153846

### ii. Random Forest

In [313]:
param_grid = {'max_depth': [12,14, 16], 'n_estimators' : [12,14,16]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X, Z_coeff)
print("Best parameters " + str(search.best_params_))

Best parameters {'max_depth': 12, 'n_estimators': 12}


In [332]:
rfc = RandomForestClassifier(max_depth=3, n_estimators = 3)
rfc = rfc.fit(X_train, Z_coeff_train)
prediction = rfc.predict(X_test)
np.mean(prediction == Z_coeff_test)

0.5384615384615384

In [315]:
print('Classification report: ')
print(classification_report(Z_coeff_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.53      0.89      0.67        18
     Unhappy       0.00      0.00      0.00         5
  Very happy       0.57      0.31      0.40        13
Very unhappy       0.00      0.00      0.00         3

    accuracy                           0.51        39
   macro avg       0.28      0.30      0.27        39
weighted avg       0.44      0.51      0.44        39



In [333]:
scores = cross_val_score(rfc, X, Z_coeff, cv=ShuffleSplit(n_splits=5))
np.mean(scores)

0.36923076923076925

### iii. KNN

In [336]:
param_grid = {'n_neighbors': [12,25,30]}
search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X, Z_coeff)
print("Best parameters " + str(search.best_params_))

Best parameters {'n_neighbors': 12}


In [337]:
knn = KNeighborsClassifier(n_neighbors=12)
knn = knn.fit(X_train, Z_coeff_train)
prediction = knn.predict(X_test)
np.mean(prediction == Z_coeff_test)

0.41025641025641024

In [339]:
from sklearn import neighbors, datasets
n_neighbors = 30
knn = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
scores = cross_val_score(knn, X, Z_coeff, cv=5)
np.mean(scores)

0.3356043956043956

### iv. Linear SVC

In [330]:
from sklearn.svm import LinearSVC
param_grid = {'C': [900,950,1000]}
search = GridSearchCV(LinearSVC(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X_train, Z_coeff_train)
print("Best parameters " + str(search.best_params_))



Best parameters {'C': 1000}




In [345]:
svc = LinearSVC(C=1000)
svc = svc.fit(X_train, Z_coeff_train)
prediction = svc.predict(X_test)
np.mean(prediction == Z_coeff_test)



0.48717948717948717

## b. Target: Happiness_clean

### i. Decision Tree

In [429]:
from sklearn.model_selection import train_test_split
X_train, X_test, Z_clean_train, Z_clean_test = train_test_split(X, Z_clean, test_size=0.3)

In [430]:
param_grid = {'max_depth': [1,2,4,6,8,10,20,40,100]}
search = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv= ShuffleSplit(n_splits=5))
search.fit(X_train, Z_clean_train)
print("Best parameters" + str(search.best_params_))

Best parameters{'max_depth': 4}


In [431]:
clf = tree.DecisionTreeClassifier(max_depth=search.best_params_['max_depth'])
clf = clf.fit(X_train, Z_clean_train)
prediction = clf.predict(X_test)
np.mean(prediction == Z_clean_test)

0.46153846153846156

In [432]:
print('Classification report: ')
print(classification_report(Z_clean_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.33      0.38      0.35         8
     Unhappy       0.56      0.78      0.65        18
  Very happy       0.00      0.00      0.00         1
Very unhappy       0.20      0.08      0.12        12

    accuracy                           0.46        39
   macro avg       0.27      0.31      0.28        39
weighted avg       0.39      0.46      0.41        39



  'precision', 'predicted', average, warn_for)


In [433]:
scores = cross_val_score(clf, X, Z_clean, cv=ShuffleSplit(n_splits=5))
np.mean(scores)

0.49230769230769234

### ii. Random Forest

In [374]:
param_grid = {'max_depth': [12,14, 16], 'n_estimators' : [12,14,16]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X, Z_clean)
print("Best parameters " + str(search.best_params_))

Best parameters {'max_depth': 12, 'n_estimators': 12}


In [375]:
rfc = RandomForestClassifier(max_depth=12, n_estimators = 12)
rfc = rfc.fit(X_train, Z_clean_train)
prediction = rfc.predict(X_test)
np.mean(prediction == Z_clean_test)

0.6153846153846154

In [376]:
print('Classification report: ')
print(classification_report(Z_clean_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.50      0.50      0.50        12
     Unhappy       0.67      0.78      0.72        18
  Very happy       0.00      0.00      0.00         1
Very unhappy       0.67      0.50      0.57         8

    accuracy                           0.62        39
   macro avg       0.46      0.44      0.45        39
weighted avg       0.60      0.62      0.60        39



  'precision', 'predicted', average, warn_for)


In [377]:
scores = cross_val_score(rfc, X, Z_clean, cv=ShuffleSplit(n_splits=5))
np.mean(scores)

0.6461538461538462

### iii. KNN

In [378]:
param_grid = {'n_neighbors': [12,25,30]}
search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X, Z_clean)
print("Best parameters " + str(search.best_params_))

Best parameters {'n_neighbors': 12}


In [379]:
knn = KNeighborsClassifier(n_neighbors=12)
knn = knn.fit(X_train, Z_clean_train)
prediction = knn.predict(X_test)
np.mean(prediction == Z_clean_test)

0.5384615384615384

In [381]:
print('Classification report: ')
print(classification_report(Z_clean_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.43      0.25      0.32        12
     Unhappy       0.67      0.78      0.72        18
  Very happy       0.00      0.00      0.00         1
Very unhappy       0.36      0.50      0.42         8

    accuracy                           0.54        39
   macro avg       0.36      0.38      0.36        39
weighted avg       0.51      0.54      0.51        39



  'precision', 'predicted', average, warn_for)


In [380]:
from sklearn import neighbors, datasets
n_neighbors = 30
knn = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
scores = cross_val_score(knn, X, Z_clean, cv=5)
np.mean(scores)

0.4338461538461538

### iv. Linear SVC

In [386]:
from sklearn.svm import LinearSVC
param_grid = {'C': [900,950,1000]}
search = GridSearchCV(LinearSVC(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X_train, Z_clean_train)
print("Best parameters " + str(search.best_params_))



Best parameters {'C': 950}




In [387]:
svc = LinearSVC(C=950)
svc = svc.fit(X_train, Z_clean_train)
prediction = svc.predict(X_test)
np.mean(prediction == Z_clean_test)



0.5384615384615384

## c. Target: Happiness_smell

In [388]:
from sklearn.model_selection import train_test_split
X_train, X_test, Z_smell_train, Z_smell_test = train_test_split(X, Z_smell, test_size=0.3)

### i. Decision tree

In [436]:
param_grid = {'max_depth': [1,2,4,6,8,10,20,40,100]}
search = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv= ShuffleSplit(n_splits=5))
search.fit(X_train, Z_smell_train)
print("Best parameters" + str(search.best_params_))

Best parameters{'max_depth': 1}


In [437]:
clf = tree.DecisionTreeClassifier(max_depth=search.best_params_['max_depth'])
clf = clf.fit(X_train, Z_smell_train)
prediction = clf.predict(X_test)
np.mean(prediction == Z_smell_test)

0.5897435897435898

In [438]:
print('Classification report: ')
print(classification_report(Z_smell_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.00      0.00      0.00        10
     Unhappy       0.00      0.00      0.00         5
  Very happy       0.59      1.00      0.74        23
Very unhappy       0.00      0.00      0.00         1

    accuracy                           0.59        39
   macro avg       0.15      0.25      0.19        39
weighted avg       0.35      0.59      0.44        39



  'precision', 'predicted', average, warn_for)


In [440]:
scores = cross_val_score(clf, X, Z_smell, cv=ShuffleSplit(n_splits=5))
np.mean(scores)

0.6307692307692307

### ii. Random forest

In [395]:
param_grid = {'max_depth': [10,12,20], 'n_estimators' : [12,14,16]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X, Z_smell)
print("Best parameters " + str(search.best_params_))

Best parameters {'max_depth': 12, 'n_estimators': 14}


In [396]:
rfc = RandomForestClassifier(max_depth=12, n_estimators = 14)
rfc = rfc.fit(X_train, Z_smell_train)
prediction = rfc.predict(X_test)
np.mean(prediction == Z_smell_test)

0.4358974358974359

In [397]:
print('Classification report: ')
print(classification_report(Z_smell_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.23      0.30      0.26        10
     Unhappy       0.33      0.20      0.25         5
  Very happy       0.57      0.57      0.57        23
Very unhappy       0.00      0.00      0.00         1

    accuracy                           0.44        39
   macro avg       0.28      0.27      0.27        39
weighted avg       0.44      0.44      0.43        39



  'precision', 'predicted', average, warn_for)


In [398]:
scores = cross_val_score(rfc, X, Z_smell, cv=ShuffleSplit(n_splits=5))
np.mean(scores)

0.476923076923077

### iii. KNN

In [401]:
param_grid = {'n_neighbors': [20,25,30]}
search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X, Z_smell)
print("Best parameters " + str(search.best_params_))

Best parameters {'n_neighbors': 25}


In [402]:
knn = KNeighborsClassifier(n_neighbors=25)
knn = knn.fit(X_train, Z_smell_train)
prediction = knn.predict(X_test)
np.mean(prediction == Z_smell_test)

0.5897435897435898

In [403]:
print('Classification report: ')
print(classification_report(Z_smell_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.00      0.00      0.00        10
     Unhappy       0.00      0.00      0.00         5
  Very happy       0.59      1.00      0.74        23
Very unhappy       0.00      0.00      0.00         1

    accuracy                           0.59        39
   macro avg       0.15      0.25      0.19        39
weighted avg       0.35      0.59      0.44        39



  'precision', 'predicted', average, warn_for)


In [404]:
from sklearn import neighbors, datasets
n_neighbors = 30
knn = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
scores = cross_val_score(knn, X, Z_smell, cv=5)
np.mean(scores)



0.6214472934472934

### iv. Linear SVC

In [411]:
from sklearn.svm import LinearSVC
param_grid = {'C': [10,40,50]}
search = GridSearchCV(LinearSVC(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X_train, Z_smell_train)
print("Best parameters " + str(search.best_params_))



Best parameters {'C': 10}




In [413]:
svc = LinearSVC(C=10)
svc = svc.fit(X_train, Z_smell_train)
prediction = svc.predict(X_test)
np.mean(prediction == Z_smell_test)



0.5641025641025641

## d. Target: Happiness_noise

In [407]:
from sklearn.model_selection import train_test_split
X_train, X_test, Z_noise_train, Z_noise_test = train_test_split(X, Z_noise, test_size=0.3)

### i. Decision Tree

In [449]:
param_grid = {'max_depth': [1,2,4,6,8,10,20,40,100]}
search = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv= ShuffleSplit(n_splits=5))
search.fit(X_train, Z_noise_train)
print("Best parameters" + str(search.best_params_))

Best parameters{'max_depth': 1}


In [450]:
clf = tree.DecisionTreeClassifier(max_depth=search.best_params_['max_depth'])
clf = clf.fit(X_train, Z_noise_train)
prediction = clf.predict(X_test)
np.mean(prediction == Z_noise_test)

0.2564102564102564

In [451]:
print('Classification report: ')
print(classification_report(Z_smell_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.00      0.00      0.00        10
     Unhappy       0.29      0.40      0.33         5
  Very happy       0.00      0.00      0.00        23
Very unhappy       0.03      1.00      0.06         1

    accuracy                           0.08        39
   macro avg       0.08      0.35      0.10        39
weighted avg       0.04      0.08      0.04        39



  'precision', 'predicted', average, warn_for)


In [452]:
scores = cross_val_score(clf, X, Z_noise, cv=ShuffleSplit(n_splits=5))
np.mean(scores)

0.49230769230769234

### ii. Random Forest

In [422]:
param_grid = {'max_depth': [10,12,20], 'n_estimators' : [12,14,16]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X, Z_noise)
print("Best parameters " + str(search.best_params_))

Best parameters {'max_depth': 20, 'n_estimators': 16}


In [423]:
rfc = RandomForestClassifier(max_depth=20, n_estimators = 16)
rfc = rfc.fit(X_train, Z_noise_train)
prediction = rfc.predict(X_test)
np.mean(prediction == Z_noise_test)

0.4358974358974359

In [425]:
print('Classification report: ')
print(classification_report(Z_smell_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.50      0.20      0.29        10
     Unhappy       0.10      0.40      0.16         5
  Very happy       0.00      0.00      0.00        23
Very unhappy       0.00      0.00      0.00         1

    accuracy                           0.10        39
   macro avg       0.15      0.15      0.11        39
weighted avg       0.14      0.10      0.09        39



  'precision', 'predicted', average, warn_for)


In [424]:
scores = cross_val_score(rfc, X, Z_noise, cv=ShuffleSplit(n_splits=5))
np.mean(scores)

0.4153846153846154

### iii. KNN

In [418]:
param_grid = {'n_neighbors': [20,25,30]}
search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X, Z_noise)
print("Best parameters " + str(search.best_params_))

Best parameters {'n_neighbors': 25}


In [419]:
knn = KNeighborsClassifier(n_neighbors=25)
knn = knn.fit(X_train, Z_noise_train)
prediction = knn.predict(X_test)
np.mean(prediction == Z_noise_test)

0.48717948717948717

In [420]:
print('Classification report: ')
print(classification_report(Z_smell_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.00      0.00      0.00        10
     Unhappy       0.13      0.60      0.21         5
  Very happy       0.00      0.00      0.00        23
Very unhappy       0.06      1.00      0.12         1

    accuracy                           0.10        39
   macro avg       0.05      0.40      0.08        39
weighted avg       0.02      0.10      0.03        39



  'precision', 'predicted', average, warn_for)


In [421]:
from sklearn import neighbors, datasets
n_neighbors = 25
knn = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
scores = cross_val_score(knn, X, Z_noise, cv=5)
np.mean(scores)

0.4874074074074074

### iv. Linear SVC

In [410]:
from sklearn.svm import LinearSVC
param_grid = {'C': [10,20,40]}
search = GridSearchCV(LinearSVC(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X_train, Z_noise_train)
print("Best parameters " + str(search.best_params_))



Best parameters {'C': 10}




In [414]:
svc = LinearSVC(C=10)
svc = svc.fit(X_train, Z_noise_train)
prediction = svc.predict(X_test)
np.mean(prediction == Z_noise_test)



0.48717948717948717

In [415]:
print('Classification report: ')
print(classification_report(Z_smell_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.00      0.00      0.00        10
     Unhappy       0.09      0.20      0.13         5
  Very happy       0.00      0.00      0.00        23
Very unhappy       0.04      1.00      0.07         1

    accuracy                           0.05        39
   macro avg       0.03      0.30      0.05        39
weighted avg       0.01      0.05      0.02        39



  'precision', 'predicted', average, warn_for)


## e. Target: Happiness_perso

In [417]:
from sklearn.model_selection import train_test_split
X_train, X_test, Z_perso_train, Z_perso_test = train_test_split(X, Z_perso, test_size=0.3)

### i. Decision Tree

In [464]:
param_grid = {'max_depth': [1,2,4,6,8,10,20,40,100]}
search = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv= ShuffleSplit(n_splits=5))
search.fit(X_train, Z_perso_train)
print("Best parameters" + str(search.best_params_))

Best parameters{'max_depth': 1}


In [465]:
clf = tree.DecisionTreeClassifier(max_depth=search.best_params_['max_depth'])
clf = clf.fit(X_train, Z_perso_train)
prediction = clf.predict(X_test)
np.mean(prediction == Z_perso_test)

0.48717948717948717

In [466]:
print('Classification report: ')
print(classification_report(Z_perso_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.49      1.00      0.66        19
     Unhappy       0.00      0.00      0.00         8
  Very happy       0.00      0.00      0.00        10
Very unhappy       0.00      0.00      0.00         2

    accuracy                           0.49        39
   macro avg       0.12      0.25      0.16        39
weighted avg       0.24      0.49      0.32        39



  'precision', 'predicted', average, warn_for)


In [467]:
scores = cross_val_score(clf, X, Z_perso, cv=ShuffleSplit(n_splits=5))
np.mean(scores)

0.4153846153846154

### ii. Random Forest

In [460]:
param_grid = {'max_depth': [10,12,20], 'n_estimators' : [12,14,16]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X, Z_perso)
print("Best parameters " + str(search.best_params_))

Best parameters {'max_depth': 20, 'n_estimators': 14}


In [461]:
rfc = RandomForestClassifier(max_depth=20, n_estimators = 14)
rfc = rfc.fit(X_train, Z_perso_train)
prediction = rfc.predict(X_test)
np.mean(prediction == Z_perso_test)

0.41025641025641024

In [462]:
print('Classification report: ')
print(classification_report(Z_perso_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.47      0.74      0.57        19
     Unhappy       0.00      0.00      0.00         8
  Very happy       0.14      0.10      0.12        10
Very unhappy       0.50      0.50      0.50         2

    accuracy                           0.41        39
   macro avg       0.28      0.33      0.30        39
weighted avg       0.29      0.41      0.33        39



  'precision', 'predicted', average, warn_for)


In [463]:
scores = cross_val_score(rfc, X, Z_perso, cv=ShuffleSplit(n_splits=5))
np.mean(scores)

0.4

### iii. KNN

In [456]:
param_grid = {'n_neighbors': [20,25,30]}
search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X, Z_perso)
print("Best parameters " + str(search.best_params_))

Best parameters {'n_neighbors': 30}


In [457]:
knn = KNeighborsClassifier(n_neighbors=30)
knn = knn.fit(X_train, Z_perso_train)
prediction = knn.predict(X_test)
np.mean(prediction == Z_perso_test)

0.48717948717948717

In [458]:
print('Classification report: ')
print(classification_report(Z_perso_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.49      1.00      0.66        19
     Unhappy       0.00      0.00      0.00         8
  Very happy       0.00      0.00      0.00        10
Very unhappy       0.00      0.00      0.00         2

    accuracy                           0.49        39
   macro avg       0.12      0.25      0.16        39
weighted avg       0.24      0.49      0.32        39



  'precision', 'predicted', average, warn_for)


In [459]:
from sklearn import neighbors, datasets
n_neighbors = 30
knn = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
scores = cross_val_score(knn, X, Z_perso, cv=5)
np.mean(scores)

0.472

### iv. Linear SVC

In [453]:
from sklearn.svm import LinearSVC
param_grid = {'C': [10,20,40]}
search = GridSearchCV(LinearSVC(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(X_train, Z_perso_train)
print("Best parameters " + str(search.best_params_))



Best parameters {'C': 10}




In [454]:
svc = LinearSVC(C=10)
svc = svc.fit(X_train, Z_perso_train)
prediction = svc.predict(X_test)
np.mean(prediction == Z_perso_test)



0.38461538461538464

In [455]:
print('Classification report: ')
print(classification_report(Z_perso_test, prediction))

Classification report: 
              precision    recall  f1-score   support

       Happy       0.48      0.74      0.58        19
     Unhappy       0.00      0.00      0.00         8
  Very happy       0.20      0.10      0.13        10
Very unhappy       0.00      0.00      0.00         2

    accuracy                           0.38        39
   macro avg       0.17      0.21      0.18        39
weighted avg       0.29      0.38      0.32        39

