# Project - Predict the wellbeing of shanghainese communities

# MACHINE LEARNING

In [11]:
import pandas as pd
import numpy as np

In [204]:
df_predict = pd.read_pickle("df_prediction.pkl")

In [205]:
df_predict.head()

Unnamed: 0,nb_transportation_scaled,nb_shopping_scaled,nb_restaurant_scaled,nb_scenicSpot_scaled,nb_stadiumAndGym_scaled,nb_mobike_scaled,green_space_scaled,happiness_equalCoff,happiness_clean,happiness_smell,happiness_noise,happiness_perso,happiness_other
0,-0.757151,-0.598362,-0.721806,-0.405934,-0.747588,-0.808407,0.265348,1.666667,2.428571,1.285714,1.285714,1.6,1.444444
1,-0.838055,-0.682104,-0.801698,-0.404075,-0.792245,-0.808407,0.265348,1.333333,2.095238,0.952381,0.952381,1.266667,1.111111
2,-0.905242,-0.69263,-0.810939,-0.427699,-0.807475,-0.808395,0.265348,1.666667,2.428571,1.285714,1.285714,1.6,1.444444
3,-0.911484,-0.694507,-0.783498,-0.381704,-0.775861,-0.808336,0.265348,1.333333,1.714286,1.142857,1.142857,1.3,1.222222
4,-0.905943,-0.6884,-0.805164,-0.421503,-0.800932,-0.80837,0.265348,1.0,1.571429,1.0,0.428571,1.1,0.666667


In [206]:
df_predict['happiness_clean'].describe()

count    129.000000
mean       2.062544
std        0.314851
min        1.380952
25%        1.836735
50%        2.035714
75%        2.285714
max        3.000000
Name: happiness_clean, dtype: float64

# 1. Classification

## a. Happiness when clean, smell, noise have the same coefficient

In [207]:
X = df_predict[['nb_transportation_scaled', 'nb_shopping_scaled', 'nb_restaurant_scaled', 'nb_scenicSpot_scaled', 'nb_stadiumAndGym_scaled', 'nb_mobike_scaled', 'green_space_scaled']] #features
Y_coeff = df_predict['happiness_equalCoff'] #target

In [208]:
Y_coeff.describe()

count    129.000000
mean       1.389260
std        0.183787
min        1.000000
25%        1.333333
50%        1.333333
75%        1.500000
max        1.666667
Name: happiness_equalCoff, dtype: float64

In [209]:
Z_coeff = pd.DataFrame.copy(Y_coeff)

In [210]:
Z_coeff = Y_coeff.apply(lambda x:"Very unhappy" if x < 1.165 else 
        "Unhappy" if (x >= 1.165 and x < 1.33) else
        "Happy" if (x >= 1.33 and x < 1.495) else
        "Very happy")

In [211]:
df_predict.insert(8, "class_samecoeff", Z_coeff)

In [212]:
df_predict.groupby('class_samecoeff').size()

class_samecoeff
Happy           52
Unhappy         21
Very happy      45
Very unhappy    11
dtype: int64

## b. Happiness when CLEAN has more importance than noise, smell

In [213]:
Y_clean = df_predict['happiness_clean'] #target

In [214]:
Y_clean.describe()

count    129.000000
mean       2.062544
std        0.314851
min        1.380952
25%        1.836735
50%        2.035714
75%        2.285714
max        3.000000
Name: happiness_clean, dtype: float64

In [215]:
Z_clean = pd.DataFrame.copy(Y_clean)
Z_clean = Y_clean.apply(lambda x:"Very unhappy" if x < 1.785714 else 
        "Unhappy" if (x >= 1.1785714 and x < 2.190476) else
        "Happy" if (x >= 2.190476 and x < 2.595238) else
        "Very happy")

In [216]:
df_predict.insert(10, "class_clean", Z_clean)

In [217]:
df_predict.groupby('class_clean').size()

class_clean
Happy           40
Unhappy         55
Very happy       5
Very unhappy    29
dtype: int64

## c. Happiness when SMELL has more importance than clean, noise

In [218]:
Y_smell = df_predict['happiness_smell'] #target

In [219]:
Y_smell.describe()

count    129.000000
mean       1.083657
std        0.171013
min        0.428571
25%        1.000000
50%        1.142857
75%        1.214286
max        1.285714
Name: happiness_smell, dtype: float64

In [220]:
print((Y_smell.max()-Y_smell.min())/4)
print(Y_smell.min() + ((Y_smell.max()-Y_smell.min())/4))
print(Y_smell.min() + ((Y_smell.max()-Y_smell.min())/4)*2)
print(Y_smell.min() + ((Y_smell.max()-Y_smell.min())/4)*3)

0.2142857142857143
0.6428571428571428
0.8571428571428572
1.0714285714285714


In [221]:
Z_smell = pd.DataFrame.copy(Y_smell)
Z_smell = Y_smell.apply(lambda x:"Very unhappy" if x < 0.6428571428571428 else 
        "Unhappy" if (x >= 0.6428571428571428 and x < 0.8571428571428572) else
        "Happy" if (x >= 0.8571428571428572 and x < 1.0714285714285714) else
        "Very happy")

In [222]:
df_predict.insert(12, "class_smell", Z_smell)

In [223]:
df_predict.groupby('class_smell').size()

class_smell
Happy           33
Unhappy         14
Very happy      80
Very unhappy     2
dtype: int64

## d. Happiness when NOISE has more importance than clean, smell

In [224]:
Y_noise = df_predict['happiness_noise'] #target

In [225]:
Y_noise.describe()

count    129.000000
mean       1.021580
std        0.234248
min        0.428571
25%        0.904762
50%        1.102041
75%        1.196429
max        1.285714
Name: happiness_noise, dtype: float64

In [226]:
print((Y_noise.max()-Y_noise.min())/4)
print(Y_noise.min() + ((Y_noise.max()-Y_noise.min())/4))
print(Y_noise.min() + ((Y_noise.max()-Y_noise.min())/4)*2)
print(Y_noise.min() + ((Y_noise.max()-Y_noise.min())/4)*3)

0.2142857142857143
0.6428571428571428
0.8571428571428572
1.0714285714285714


In [228]:
Z_noise = pd.DataFrame.copy(Y_noise)
Z_noise = Y_noise.apply(lambda x:"Very unhappy" if x < 0.6428571428571428 else 
        "Unhappy" if (x >= 0.6428571428571428 and x < 0.8571428571428572) else
        "Happy" if (x >= 0.8571428571428572 and x < 1.0714285714285714) else
        "Very happy")

In [229]:
df_predict.insert(14, "class_noise", Z_noise)

In [230]:
df_predict.groupby('class_noise').size()

class_noise
Happy           30
Unhappy         13
Very happy      73
Very unhappy    13
dtype: int64

## e. Happiness - personnal opinion

In [231]:
Y_perso = df_predict['happiness_perso'] #target

In [232]:
Y_perso.describe()

count    129.000000
mean       1.346643
std        0.177887
min        0.800000
25%        1.250000
50%        1.325000
75%        1.450000
max        1.700000
Name: happiness_perso, dtype: float64

In [233]:
print((Y_perso.max()-Y_perso.min())/4)
print(Y_perso.min() + ((Y_perso.max()-Y_perso.min())/4))
print(Y_perso.min() + ((Y_perso.max()-Y_perso.min())/4)*2)
print(Y_perso.min() + ((Y_perso.max()-Y_perso.min())/4)*3)

0.22499999999999998
1.025
1.25
1.475


In [235]:
Z_perso = pd.DataFrame.copy(Y_perso)
Z_perso = Y_perso.apply(lambda x:"Very unhappy" if x < 1.025 else 
        "Unhappy" if (x >= 1.025 and x < 1.25) else
        "Happy" if (x >= 1.25 and x < 1.475) else
        "Very happy")

In [236]:
df_predict.insert(16, "class_perso", Z_perso)

In [237]:
df_predict.groupby('class_perso').size()

class_perso
Happy           66
Unhappy         25
Very happy      32
Very unhappy     6
dtype: int64

## f. Happiness - arbitary choice

In [238]:
Y_other = df_predict['happiness_other'] #target

In [239]:
print((Y_other.max()-Y_other.min())/4)
print(Y_other.min() + ((Y_other.max()-Y_other.min())/4))
print(Y_other.min() + ((Y_other.max()-Y_other.min())/4)*2)
print(Y_other.min() + ((Y_other.max()-Y_other.min())/4)*3)

0.19444444444444445
0.861111111111111
1.0555555555555556
1.25


In [240]:
Z_other = pd.DataFrame.copy(Y_other)
Z_other = Y_other.apply(lambda x:"Very unhappy" if x < 0.861111111111111 else 
        "Unhappy" if (x >= 0.861111111111111 and x < 1.0555555555555556) else
        "Happy" if (x >= 1.0555555555555556 and x < 1.25) else
        "Very happy")

In [241]:
df_predict.insert(18, "class_other", Z_other)

In [242]:
df_predict.groupby('class_other').size()

class_other
Happy           53
Unhappy         16
Very happy      49
Very unhappy    11
dtype: int64

In [243]:
df_predict.head()

Unnamed: 0,nb_transportation_scaled,nb_shopping_scaled,nb_restaurant_scaled,nb_scenicSpot_scaled,nb_stadiumAndGym_scaled,nb_mobike_scaled,green_space_scaled,happiness_equalCoff,class_samecoeff,happiness_clean,class_clean,happiness_smell,class_smell,happiness_noise,class_noise,happiness_perso,class_perso,happiness_other,class_other
0,-0.757151,-0.598362,-0.721806,-0.405934,-0.747588,-0.808407,0.265348,1.666667,Very happy,2.428571,Happy,1.285714,Very happy,1.285714,Very happy,1.6,Very happy,1.444444,Very happy
1,-0.838055,-0.682104,-0.801698,-0.404075,-0.792245,-0.808407,0.265348,1.333333,Happy,2.095238,Unhappy,0.952381,Happy,0.952381,Happy,1.266667,Happy,1.111111,Happy
2,-0.905242,-0.69263,-0.810939,-0.427699,-0.807475,-0.808395,0.265348,1.666667,Very happy,2.428571,Happy,1.285714,Very happy,1.285714,Very happy,1.6,Very happy,1.444444,Very happy
3,-0.911484,-0.694507,-0.783498,-0.381704,-0.775861,-0.808336,0.265348,1.333333,Happy,1.714286,Very unhappy,1.142857,Very happy,1.142857,Very happy,1.3,Happy,1.222222,Happy
4,-0.905943,-0.6884,-0.805164,-0.421503,-0.800932,-0.80837,0.265348,1.0,Very unhappy,1.571429,Very unhappy,1.0,Happy,0.428571,Very unhappy,1.1,Unhappy,0.666667,Very unhappy


# 2. Split data into training and test sets

In [90]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

## a. Target: Happiness_samecoeff

In [244]:
from sklearn.model_selection import train_test_split
X_train, X_test, Z_coeff_train, Z_coeff_test = train_test_split(X, Z_coeff, test_size=0.3)

### i. Decision Tree

In [102]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
param_grid = {'max_depth': [1,2,4,6,8,10,20,40,100]}
search = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv= ShuffleSplit(n_splits=5))
search.fit(X,Z_coeff)

GridSearchCV(cv=ShuffleSplit(n_splits=5, random_state=None, test_size=None, train_size=None),
             error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [1, 2, 4, 

In [104]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=search.best_params_['max_depth'])
clf = clf.fit(X_train, Z_coeff_train)
prediction = clf.predict(X_test)
np.mean(prediction == Z_coeff_test)

0.3333333333333333

In [94]:
models = []

In [95]:
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVM', SVC(gamma='auto')))

In [96]:
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    cv_results = cross_val_score(model, X_train, Z_coeff_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))



KNN: 0.360051 (0.113328)
SVM: 0.467677 (0.026552)


In [98]:
model = SVC(gamma='auto')
model.fit(X_train, Z_coeff_train)
predictions = model.predict(X_test)

In [100]:
print('Accuracy score: ', accuracy_score(Z_coeff_test, predictions))
print('Confusion Matrix: ')
print(confusion_matrix(Z_coeff_test, predictions))
print('Classification report: ')
print(classification_report(Z_coeff_test, predictions))

Accuracy score:  0.46153846153846156
Confusion Matrix: 
[[ 0  0  1  9]
 [ 1  0  0  0]
 [ 0  0  1 10]
 [ 0  0  0 17]]
Classification report: 
              precision    recall  f1-score   support

       Happy       0.00      0.00      0.00        10
     Unhappy       0.00      0.00      0.00         1
  Very happy       0.50      0.09      0.15        11
Very unhappy       0.47      1.00      0.64        17

    accuracy                           0.46        39
   macro avg       0.24      0.27      0.20        39
weighted avg       0.35      0.46      0.32        39



  'precision', 'predicted', average, warn_for)


In [None]:
#### NOISE
param_grid = {'max_depth': [1, 3, 5, 7], 'n_estimators' : [1, 3, 5, 7]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data, y_noise)
# Print the best parameters
print("Best parameters for noise: " + str(search.best_params_))

#### SMELL
param_grid = {'max_depth': [1, 3, 5, 7], 'n_estimators' : [1, 3, 5, 7]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data, y_smell)
print("Best parameters for smell: " + str(search.best_params_))

#### CLEAN
param_grid = {'max_depth': [1, 3, 5, 7], 'n_estimators' : [1, 3, 5, 7]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data, y_clean)
print("Best parameters for clean: " + str(search.best_params_))

In [None]:
#### NOISE
rfc = RandomForestClassifier(max_depth=1, n_estimators=1)
# use cross validation
scores = cross_val_score(rfc, data, y_noise, cv=ShuffleSplit(n_splits=5))
# print score
print("Score of noise prediction: " + str(np.mean(scores)))

#### SMELL
rfc = RandomForestClassifier(max_depth=1, n_estimators=1)
scores = cross_val_score(rfc, data, y_smell, cv=ShuffleSplit(n_splits=5))
print("Score of smell prediction: " + str(np.mean(scores)))

#### CLEAN
rfc = RandomForestClassifier(max_depth=1, n_estimators=7)
scores = cross_val_score(rfc, data, y_clean, cv=ShuffleSplit(n_splits=5))
print("Score of clean prediction: " + str(np.mean(scores)))

### i. KNN 

In [None]:
from sklearn import neighbors, datasets
n_neighbors = 2000
clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
scores = cross_val_score(clf, X, Z, cv=5)
clf.fit(X, Z)
np.mean(scores)

## b. Target: Happiness_clean

In [110]:
from sklearn.model_selection import train_test_split
X_train, X_test, Z_clean_train, Z_clean_test = train_test_split(X, Z_clean, test_size=0.3)

In [111]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
param_grid = {'max_depth': [1,2,4,6,8,10,20,40,100]}
search = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv= ShuffleSplit(n_splits=5))
search.fit(X,Z_clean)

GridSearchCV(cv=ShuffleSplit(n_splits=5, random_state=None, test_size=None, train_size=None),
             error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [1, 2, 4, 

In [112]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=search.best_params_['max_depth'])
clf = clf.fit(X_train, Z_clean_train)
prediction = clf.predict(X_test)
np.mean(prediction == Z_clean_test)

0.20512820512820512

## c. Target: Happiness_smell

In [107]:
from sklearn.model_selection import train_test_split
X_train, X_test, Z_smell_train, Z_smell_test = train_test_split(X, Z_smell, test_size=0.3)

In [108]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
param_grid = {'max_depth': [1,2,4,6,8,10,20,40,100]}
search = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv= ShuffleSplit(n_splits=5))
search.fit(X,Z_smell)

GridSearchCV(cv=ShuffleSplit(n_splits=5, random_state=None, test_size=None, train_size=None),
             error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [1, 2, 4, 

In [109]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=search.best_params_['max_depth'])
clf = clf.fit(X_train, Z_smell_train)
prediction = clf.predict(X_test)
np.mean(prediction == Z_smell_test)

0.6923076923076923

In [113]:
import matplotlib.pyplot as plt
plt.figure(figsize=(15,10))
tree.plot_tree(clf)

[Text(581.25, 577.5, 'X[5] <= -0.747\ngini = 0.745\nsamples = 90\nvalue = [25, 26, 21, 18]'),
 Text(290.625, 192.5, 'gini = 0.702\nsamples = 35\nvalue = [4, 6, 12, 13]'),
 Text(871.875, 192.5, 'gini = 0.687\nsamples = 55\nvalue = [21, 20, 9, 5]')]

## d. Target: Happiness_noise

## e. Target: Happiness_perso

## f. Target: Happiness_other