In [20]:
import pickle
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

### Get data from pickle files

In [2]:
with open('df_neighborhood.pkl', 'rb') as handle:
    df_neigh = pickle.load(handle)
with open('df_target.pkl', 'rb') as handle:
    df_target = pickle.load(handle)
df_neigh.head()

### Scale the data 

In [3]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_neigh['nb_taxi_scale'] = scaler.fit_transform(df_neigh[['nb_taxi_km']])
df_neigh['nb_transp_scale'] = scaler.fit_transform(df_neigh[['nb_transp_km']])
df_neigh['nb_rest_scale'] = scaler.fit_transform(df_neigh[['nb_rest_km']])
df_neigh['nb_shop_scale'] = scaler.fit_transform(df_neigh[['nb_shop_km']])
df_neigh['nb_entertainement_scale'] = scaler.fit_transform(df_neigh[['nb_entertainement_km']])

### Get target values

In [4]:
noise = df_neigh['noise_cat']
y_noise = pd.DataFrame.copy(noise)

clean = df_neigh['clean_cat']
y_clean = pd.DataFrame.copy(clean)

smell = df_neigh['smell_cat']
y_smell = pd.DataFrame.copy(smell)

### Separate DF data and ML data

In [17]:
data = df_neigh[['nb_taxi_scale','nb_transp_scale','nb_rest_scale','nb_shop_scale','nb_entertainement_scale']]
data_no_taxi = df_neigh[['nb_transp_scale','nb_rest_scale','nb_shop_scale','nb_entertainement_scale']]
data_no_rest = df_neigh[['nb_taxi_scale','nb_transp_scale','nb_shop_scale','nb_entertainement_scale']]

### 1.1) For noise / smell / clean predictions using Random Forest --- All data

##### Find the best parameters for the Model

In [10]:
#### NOISE
param_grid = {'max_depth': [1, 3, 5, 7], 'n_estimators' : [1, 3, 5, 7]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data, y_noise)
# Print the best parameters
print("Best parameters for noise: " + str(search.best_params_))

#### SMELL
param_grid = {'max_depth': [1, 3, 5, 7], 'n_estimators' : [1, 3, 5, 7]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data, y_smell)
print("Best parameters for smell: " + str(search.best_params_))

#### CLEAN
param_grid = {'max_depth': [1, 3, 5, 7], 'n_estimators' : [1, 3, 5, 7]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data, y_clean)
print("Best parameters for clean: " + str(search.best_params_))

Best parameters for noise: {'max_depth': 1, 'n_estimators': 1}
Best parameters for smell: {'max_depth': 1, 'n_estimators': 1}
Best parameters for clean: {'max_depth': 1, 'n_estimators': 7}


##### Train and Test with the real Model using Cross Validation

In [11]:
#### NOISE
rfc = RandomForestClassifier(max_depth=1, n_estimators=1)
# use cross validation
scores = cross_val_score(rfc, data, y_noise, cv=ShuffleSplit(n_splits=5))
# print score
print("Score of noise prediction: " + str(np.mean(scores)))

#### SMELL
rfc = RandomForestClassifier(max_depth=1, n_estimators=1)
scores = cross_val_score(rfc, data, y_smell, cv=ShuffleSplit(n_splits=5))
print("Score of smell prediction: " + str(np.mean(scores)))

#### CLEAN
rfc = RandomForestClassifier(max_depth=1, n_estimators=7)
scores = cross_val_score(rfc, data, y_clean, cv=ShuffleSplit(n_splits=5))
print("Score of clean prediction: " + str(np.mean(scores)))

Score of noise prediction: 0.8307692307692308
Score of smell prediction: 0.9692307692307693
Score of clean prediction: 0.6153846153846153


##### Explanation on the results

### 1.2) For noise / smell / clean predictions using Random Forest --- Without taxi data

##### Find the best parameters for the Model

In [15]:
#### NOISE
param_grid = {'max_depth': [1, 3, 5, 7], 'n_estimators' : [1, 3, 5, 7]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data_no_taxi, y_noise)
# Print the best parameters
print("Best parameters for noise: " + str(search.best_params_))

#### SMELL
param_grid = {'max_depth': [1, 3, 5, 7], 'n_estimators' : [1, 3, 5, 7]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data_no_taxi, y_smell)
print("Best parameters for smell: " + str(search.best_params_))

#### CLEAN
param_grid = {'max_depth': [1, 3, 5, 7], 'n_estimators' : [1, 3, 5, 7]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data_no_taxi, y_clean)
print("Best parameters for clean: " + str(search.best_params_))

Best parameters for noise: {'max_depth': 5, 'n_estimators': 7}
Best parameters for smell: {'max_depth': 1, 'n_estimators': 1}
Best parameters for clean: {'max_depth': 1, 'n_estimators': 3}


##### Train and Test with the real Model using Cross Validation

In [16]:
#### NOISE
rfc = RandomForestClassifier(max_depth=5, n_estimators=7)
# use cross validation
scores = cross_val_score(rfc, data_no_taxi, y_noise, cv=ShuffleSplit(n_splits=5))
# print score
print("Score of noise prediction: " + str(np.mean(scores)))

#### SMELL
rfc = RandomForestClassifier(max_depth=1, n_estimators=1)
scores = cross_val_score(rfc, data_no_taxi, y_smell, cv=ShuffleSplit(n_splits=5))
print("Score of smell prediction: " + str(np.mean(scores)))

#### CLEAN
rfc = RandomForestClassifier(max_depth=1, n_estimators=3)
scores = cross_val_score(rfc, data_no_taxi, y_clean, cv=ShuffleSplit(n_splits=5))
print("Score of clean prediction: " + str(np.mean(scores)))

Score of noise prediction: 0.7692307692307692
Score of smell prediction: 0.9846153846153847
Score of clean prediction: 0.6


##### Explanation on the results

### 1.3) For noise / smell / clean predictions using Random Forest --- Without restaurant data

##### Find the best parameters for the Model

In [18]:
#### NOISE
param_grid = {'max_depth': [1, 3, 5, 7], 'n_estimators' : [1, 3, 5, 7]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data_no_rest, y_noise)
# Print the best parameters
print("Best parameters for noise: " + str(search.best_params_))

#### SMELL
param_grid = {'max_depth': [1, 3, 5, 7], 'n_estimators' : [1, 3, 5, 7]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data_no_rest, y_smell)
print("Best parameters for smell: " + str(search.best_params_))

#### CLEAN
param_grid = {'max_depth': [1, 3, 5, 7], 'n_estimators' : [1, 3, 5, 7]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data_no_rest, y_clean)
print("Best parameters for clean: " + str(search.best_params_))

Best parameters for noise: {'max_depth': 7, 'n_estimators': 5}
Best parameters for smell: {'max_depth': 1, 'n_estimators': 1}
Best parameters for clean: {'max_depth': 1, 'n_estimators': 3}


##### Train and Test with the real Model using Cross Validation

In [19]:
#### NOISE
rfc = RandomForestClassifier(max_depth=7, n_estimators=5)
# use cross validation
scores = cross_val_score(rfc, data_no_rest, y_noise, cv=ShuffleSplit(n_splits=5))
# print score
print("Score of noise prediction: " + str(np.mean(scores)))

#### SMELL
rfc = RandomForestClassifier(max_depth=1, n_estimators=1)
scores = cross_val_score(rfc, data_no_rest, y_smell, cv=ShuffleSplit(n_splits=5))
print("Score of smell prediction: " + str(np.mean(scores)))

#### CLEAN
rfc = RandomForestClassifier(max_depth=1, n_estimators=3)
scores = cross_val_score(rfc, data_no_rest, y_clean, cv=ShuffleSplit(n_splits=5))
print("Score of clean prediction: " + str(np.mean(scores)))

Score of noise prediction: 0.6461538461538462
Score of smell prediction: 0.9692307692307693
Score of clean prediction: 0.5230769230769231


##### Explanation on the results

### 2.1) For noise / smell / clean predictions using Support Vector Machine (SVM) --- All data

##### Find the best parameters for the Model

In [None]:
#### NOISE
param_grid = {}
search = GridSearchCV(SVC(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data, y_noise)
# Print the best estimated parameters
print("Best parameters for noise: " + str(search.best_estimator_))

#### SMELL
search = GridSearchCV(SVC(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data, y_smell)
print("Best parameters for smell: " + str(search.best_estimator_))

#### CLEAN
search = GridSearchCV(SVC(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data, y_clean)
print("Best parameters for clean: " + str(search.best_estimator_))

##### Train and Test with the real Model using Cross Validation

In [31]:
#### NOISE
svc = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
# use cross validation
scores = cross_val_score(svc, data, y_noise, cv=ShuffleSplit(n_splits=5))
# print score
print("Score of noise prediction: " + str(np.mean(scores)))

#### SMELL
svc = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
scores = cross_val_score(svc, data, y_smell, cv=ShuffleSplit(n_splits=5))
print("Score of smell prediction: " + str(np.mean(scores)))

#### CLEAN
svc = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
scores = cross_val_score(svc, data, y_clean, cv=ShuffleSplit(n_splits=5))
print("Score of clean prediction: " + str(np.mean(scores)))

Score of noise prediction: 0.7384615384615385
Score of smell prediction: 0.9538461538461538
Score of clean prediction: 0.5692307692307692


##### Explanation on the results

### 2.2) For noise / smell / clean predictions using Support Vector Machine (SVM) --- No taxi data

##### Find the best parameters for the Model

In [None]:
#### NOISE
param_grid = {}
search = GridSearchCV(SVC(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data_no_taxi, y_noise)
# Print the best estimated parameters
print("Best parameters for noise: " + str(search.best_estimator_))

#### SMELL
search = GridSearchCV(SVC(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data_no_taxi, y_smell)
print("Best parameters for smell: " + str(search.best_estimator_))

#### CLEAN
search = GridSearchCV(SVC(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data_no_taxi, y_clean)
print("Best parameters for clean: " + str(search.best_estimator_))

##### Train and Test with the real Model using Cross Validation

In [33]:
#### NOISE
svc = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
# use cross validation
scores = cross_val_score(svc, data_no_taxi, y_noise, cv=ShuffleSplit(n_splits=5))
# print score
print("Score of noise prediction: " + str(np.mean(scores)))

#### SMELL
svc = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
scores = cross_val_score(svc, data_no_taxi, y_smell, cv=ShuffleSplit(n_splits=5))
print("Score of smell prediction: " + str(np.mean(scores)))

#### CLEAN
svc = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
scores = cross_val_score(svc, data_no_taxi, y_clean, cv=ShuffleSplit(n_splits=5))
print("Score of clean prediction: " + str(np.mean(scores)))

Score of noise prediction: 0.8
Score of smell prediction: 0.9230769230769231
Score of clean prediction: 0.7076923076923077


##### Explanation on the results

### 2.3) For noise / smell / clean predictions using Support Vector Machine (SVM) --- No restaurant data

##### Find the best parameters for the Model

In [None]:
#### NOISE
param_grid = {}
search = GridSearchCV(SVC(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data_no_rest, y_noise)
# Print the best estimated parameters
print("Best parameters for noise: " + str(search.best_estimator_))

#### SMELL
search = GridSearchCV(SVC(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data_no_rest, y_smell)
print("Best parameters for smell: " + str(search.best_estimator_))

#### CLEAN
search = GridSearchCV(SVC(), param_grid, cv=ShuffleSplit(n_splits=5))
search.fit(data_no_rest, y_clean)
print("Best parameters for clean: " + str(search.best_estimator_))

##### Train and Test with the real Model using Cross Validation

In [35]:
#### NOISE
svc = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
# use cross validation
scores = cross_val_score(svc, data_no_rest, y_noise, cv=ShuffleSplit(n_splits=5))
# print score
print("Score of noise prediction: " + str(np.mean(scores)))

#### SMELL
svc = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
scores = cross_val_score(svc, data_no_rest, y_smell, cv=ShuffleSplit(n_splits=5))
print("Score of smell prediction: " + str(np.mean(scores)))

#### CLEAN
svc = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
scores = cross_val_score(svc, data_no_rest, y_clean, cv=ShuffleSplit(n_splits=5))
print("Score of clean prediction: " + str(np.mean(scores)))

Score of noise prediction: 0.6923076923076923
Score of smell prediction: 0.9538461538461538
Score of clean prediction: 0.5692307692307693


##### Explanation on the results

## Datasets

## Model

## Jupyter Notebook

## DISCUSSION

##### Explanation of the choices

##### Understanding of the prediction

##### Recommandation for Shanghai city