# Project - Predict the wellbeing of Shanghainese communities

# Pre-modeling: prepare the dataset 

In [1]:
import pandas as pd
from matplotlib import pyplot
from shapely.geometry import Point, Polygon
from pandas.plotting import scatter_matrix
import numpy as np
import geopandas as gpd

## 1. Load target and clean target

In [3]:
# Load the raw survey extract. Its first record is used as the column header.
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
raw_target = pd.read_pickle("./dataset/TARGET-communities-extract-wellbeing.pkl")
header = raw_target[0]
# Build the frame from every record, then discard the leading header record
# (the remaining rows keep their original labels, starting at 1).
df_target = pd.DataFrame(data=raw_target, columns=header)
df_target = df_target.drop(index=df_target.index[0])
df_target.head()

Unnamed: 0,Unnamed: 1,clean,smell,noise,town,district,longitude,latitude
1,万寿社区居委会,3,0.0,0.0,盈浦街道,青浦区,121.104287016932,31.1558823799679
2,万泰花园第一居委会,3,1.0,0.0,七宝镇,闵行区,121.351529089795,31.144326843509
3,万科阳光苑居委会,2,0.0,0.0,吴泾镇,闵行区,121.468520387505,31.0422853979528
4,万豪居委会,3,0.0,0.0,长征镇,普陀区,121.367237458941,31.2407251034807
5,万里名轩社区居委会,2,,0.0,万里街道,普陀区,121.403601417271,31.269155049366


Scales: cleanliness (1-4, 4 is best), smell (0-1, 1 is worst), noise (0-1, 1 is worst)

In [4]:
# Number of rows in the target table at this point.
len(df_target)

467

In [5]:
# Keep only the rows whose longitude and latitude are both non-empty strings.
has_coordinates = (df_target['longitude'] != '') & (df_target['latitude'] != '')
# .copy() yields an independent frame: later cells assign new values to columns of
# df_target, which on a boolean-filtered slice triggers SettingWithCopyWarning and
# leaves it ambiguous whether the writes take effect.
df_target = df_target[has_coordinates].copy()
df_target.head()

Unnamed: 0,Unnamed: 1,clean,smell,noise,town,district,longitude,latitude
1,万寿社区居委会,3,0.0,0.0,盈浦街道,青浦区,121.104287016932,31.1558823799679
2,万泰花园第一居委会,3,1.0,0.0,七宝镇,闵行区,121.351529089795,31.144326843509
3,万科阳光苑居委会,2,0.0,0.0,吴泾镇,闵行区,121.468520387505,31.0422853979528
4,万豪居委会,3,0.0,0.0,长征镇,普陀区,121.367237458941,31.2407251034807
5,万里名轩社区居委会,2,,0.0,万里街道,普陀区,121.403601417271,31.269155049366


In [6]:
# Number of rows left after the coordinate filter.
len(df_target)

461

### Clean, smell, noise, longitude and latitude are stored as objects: convert them to numeric types

In [7]:
# Column dtypes before the numeric conversion.
df_target.dtypes

             object
clean        object
smell        object
noise        object
town         object
district     object
longitude    object
latitude     object
dtype: object

In [8]:
# Convert the three survey scores and both coordinates from object to numeric dtypes.
for numeric_col in ('clean', 'smell', 'noise', 'longitude', 'latitude'):
    df_target[numeric_col] = pd.to_numeric(df_target[numeric_col])

In [9]:
# Column dtypes after the numeric conversion.
df_target.dtypes

              object
clean          int64
smell        float64
noise        float64
town          object
district      object
longitude    float64
latitude     float64
dtype: object

In [10]:
# Flip both binary flags so that 1 becomes the favourable value: an original 0 maps
# to 1 and everything else maps to 0.
# A missing value (NaN) is not equal to 0, so it also ends up as 0, exactly like a
# reported nuisance.
# NOTE(review): confirm that scoring missing answers as the worst case is intended.
for flag_col in ('smell', 'noise'):
    df_target[flag_col] = df_target[flag_col].eq(0).astype('int64')
df_target.head()

Unnamed: 0,Unnamed: 1,clean,smell,noise,town,district,longitude,latitude
1,万寿社区居委会,3,1,1,盈浦街道,青浦区,121.104287,31.155882
2,万泰花园第一居委会,3,0,1,七宝镇,闵行区,121.351529,31.144327
3,万科阳光苑居委会,2,1,1,吴泾镇,闵行区,121.46852,31.042285
4,万豪居委会,3,1,1,长征镇,普陀区,121.367237,31.240725
5,万里名轩社区居委会,2,0,1,万里街道,普陀区,121.403601,31.269155


Scales after inversion: cleanliness (1-4, 4 is best), smell (0-1, 0 is worst), noise (0-1, 0 is worst)

### Geometry point for each localization

In [11]:
# One shapely Point per row, built as (x=longitude, y=latitude).
geometry = [Point(lon, lat) for lon, lat in zip(df_target['longitude'], df_target['latitude'])]

In [12]:
# Store the list of points as a new 'geometry' column, in row order.
df_target['geometry'] = geometry

In [13]:
# Preview the table with the new geometry column.
df_target.head()

Unnamed: 0,Unnamed: 1,clean,smell,noise,town,district,longitude,latitude,geometry
1,万寿社区居委会,3,1,1,盈浦街道,青浦区,121.104287,31.155882,POINT (121.104287016932 31.1558823799679)
2,万泰花园第一居委会,3,0,1,七宝镇,闵行区,121.351529,31.144327,POINT (121.351529089795 31.144326843509)
3,万科阳光苑居委会,2,1,1,吴泾镇,闵行区,121.46852,31.042285,POINT (121.468520387505 31.0422853979528)
4,万豪居委会,3,1,1,长征镇,普陀区,121.367237,31.240725,POINT (121.367237458941 31.2407251034807)
5,万里名轩社区居委会,2,0,1,万里街道,普陀区,121.403601,31.269155,POINT (121.403601417271 31.269155049366)


## Target happiness: combination of clean, smell and noise

### 1. Same coefficient

In [15]:
# Unweighted mean of the three scores (clean + noise + smell) / 3.
df_target['happiness_equalCoff'] = (
    df_target['clean'].add(df_target['noise']).add(df_target['smell']).div(3)
)

### 2. Highlight clean

In [19]:
# Weighted mean with weights clean=5, noise=1, smell=1 (sum of weights: 7).
df_target['happiness_clean'] = (
    df_target['clean'].mul(5).add(df_target['noise']).add(df_target['smell']).div(7)
)

### 3. Highlight smell

In [20]:
# Weighted mean with weights clean=1, noise=1, smell=5 (sum of weights: 7).
df_target['happiness_smell'] = (
    df_target['clean'].add(df_target['noise']).add(df_target['smell'].mul(5)).div(7)
)

### 4. Highlight noise

In [21]:
# Weighted mean with weights clean=1, noise=5, smell=1 (sum of weights: 7).
df_target['happiness_noise'] = (
    df_target['clean'].add(df_target['noise'].mul(5)).add(df_target['smell']).div(7)
)

### 5. Personal opinion

In [22]:
# Weighted mean with weights clean=3, noise=2, smell=5 (sum of weights: 10).
df_target['happiness_perso'] = (
    df_target['clean'].mul(3)
    .add(df_target['noise'].mul(2))
    .add(df_target['smell'].mul(5))
    .div(10)
)

### 6. Other

In [23]:
# Weighted mean with weights clean=2, noise=5, smell=2 (sum of weights: 9).
df_target['happiness_other'] = (
    df_target['clean'].mul(2)
    .add(df_target['noise'].mul(5))
    .add(df_target['smell'].mul(2))
    .div(9)
)

In [184]:
# Number of rows for each value of the cleanliness score.
df_target.groupby('clean').size()

clean
1     15
2    194
3    232
4     20
dtype: int64

In [185]:
# Inspect the rows whose cleanliness score is 1.
df_target[df_target['clean'] == 1]

Unnamed: 0,Unnamed: 1,clean,smell,noise,town,district,longitude,latitude,geometry,happiness_equalCoff,happiness_clean,happiness_smell,happiness_noise,happiness_perso,happiness_other
7,三乐里居委会,1,0,1,江宁路街道,静安区,121.446436,31.240792,POINT (121.446436165896 31.2407917918077),0.666667,0.857143,0.285714,0.857143,0.5,0.777778
44,中联村居委会,1,0,1,定海路街道,杨浦区,121.548013,31.272667,POINT (121.548012832826 31.2726669298349),0.666667,0.857143,0.285714,0.857143,0.5,0.777778
63,佳木斯路三一五弄居委会,1,1,1,五角场镇,杨浦区,121.534569,31.297853,POINT (121.534568863062 31.297852523775),1.0,1.0,1.0,1.0,1.0,1.0
64,佳木斯路居委会,1,0,1,五角场镇,杨浦区,121.538516,31.303131,POINT (121.538516260017 31.3031306277762),0.666667,0.857143,0.285714,0.857143,0.5,0.777778
80,凌兆新村第九居委会,1,1,1,东明路街道,浦东新区,121.492404,31.147694,POINT (121.492403679486 31.1476944160644),1.0,1.0,1.0,1.0,1.0,1.0
166,娄塘社区居委会,1,1,0,徐行镇,嘉定区,121.211118,31.430224,POINT (121.211117730099 31.4302241638141),0.666667,0.857143,0.857143,0.285714,0.8,0.444444
195,府谷居委会,1,0,0,小东门街道,黄浦区,121.495211,31.216259,POINT (121.495211308954 31.2162589914593),0.333333,0.714286,0.142857,0.142857,0.3,0.222222
215,新南社区居委会,1,1,0,西渡街道,奉贤区,121.441535,30.99342,POINT (121.441535245655 30.9934202309589),0.666667,0.857143,0.857143,0.285714,0.8,0.444444
218,新宝社区居委会,1,1,1,嘉定工业区,嘉定区,121.19183,31.404177,POINT (121.191829968378 31.4041773341167),1.0,1.0,1.0,1.0,1.0,1.0
244,春申居委会,1,1,0,新桥镇,松江区,121.35072,31.075353,POINT (121.350720170299 31.0753533652656),0.666667,0.857143,0.857143,0.285714,0.8,0.444444


In [24]:
# Preview the table with the happiness columns.
df_target.head()

Unnamed: 0,Unnamed: 1,clean,smell,noise,town,district,longitude,latitude,geometry,happiness_equalCoff,happiness_clean,happiness_smell,happiness_noise,happiness_perso,happiness_other
1,万寿社区居委会,3,1,1,盈浦街道,青浦区,121.104287,31.155882,POINT (121.104287016932 31.1558823799679),1.666667,2.428571,1.285714,1.285714,1.6,1.444444
2,万泰花园第一居委会,3,0,1,七宝镇,闵行区,121.351529,31.144327,POINT (121.351529089795 31.144326843509),1.333333,2.285714,0.571429,1.142857,1.1,1.222222
3,万科阳光苑居委会,2,1,1,吴泾镇,闵行区,121.46852,31.042285,POINT (121.468520387505 31.0422853979528),1.333333,1.714286,1.142857,1.142857,1.3,1.222222
4,万豪居委会,3,1,1,长征镇,普陀区,121.367237,31.240725,POINT (121.367237458941 31.2407251034807),1.666667,2.428571,1.285714,1.285714,1.6,1.444444
5,万里名轩社区居委会,2,0,1,万里街道,普陀区,121.403601,31.269155,POINT (121.403601417271 31.269155049366),1.0,1.571429,0.428571,1.0,0.8,1.0


In [30]:
# Summary statistics of the numeric columns.
df_target.describe()

Unnamed: 0,clean,smell,noise,longitude,latitude,happiness_equalCoff,happiness_clean,happiness_smell,happiness_noise,happiness_perso,happiness_other
count,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0
mean,2.557484,0.819957,0.785249,121.443524,31.213527,1.387563,2.056089,1.063217,1.043384,1.334273,1.186792
std,0.631986,0.384641,0.411095,0.113162,0.129333,0.288591,0.461371,0.298898,0.316192,0.288016,0.28733
min,1.0,0.0,0.0,121.089976,30.736401,0.333333,0.714286,0.142857,0.142857,0.3,0.222222
25%,2.0,1.0,1.0,121.388639,31.163585,1.333333,1.714286,1.0,1.0,1.1,1.0
50%,3.0,1.0,1.0,121.448259,31.228257,1.333333,2.285714,1.142857,1.142857,1.3,1.222222
75%,3.0,1.0,1.0,121.504818,31.281153,1.666667,2.428571,1.285714,1.285714,1.6,1.444444
max,4.0,1.0,1.0,121.892089,31.629071,2.0,3.142857,1.428571,1.428571,1.9,1.666667


## Check which district contains each target point

In [26]:
# Load the district table (one polygon per row, with its surface).
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
df_district = pd.read_pickle("./dataset/df_district.pkl")
df_district.head()

Unnamed: 0,NAME,TYPE,geometry,surface
0,??,2,"POLYGON Z ((121.42168 30.68471 0.00000, 121.42...",0.080387
1,???,2,"POLYGON Z ((121.41199 30.69374 0.00000, 121.41...",0.374191
2,???,2,"POLYGON Z ((121.40163 30.70717 0.00000, 121.40...",0.08239
3,????,2,"POLYGON Z ((121.33687 30.73806 0.00000, 121.33...",19.301126
4,????,2,"POLYGON Z ((121.25143 30.79177 0.00000, 121.25...",55.066247


In [59]:
# Start the per-district table from a copy of the district geometry and surface,
# so the columns added below do not touch df_district.
df_model = pd.DataFrame.copy(df_district[['geometry', 'surface']])

In [60]:
# Number of target points found inside each district (integer counter).
df_model['nb_target'] = 0
# Per-district mean scores. They are created as floats because the aggregation
# writes fractional means into them; starting from integer zeros would force an
# implicit int -> float upcast on assignment (a FutureWarning in recent pandas).
for score_col in ('happiness_equalCoff', 'happiness_clean', 'happiness_smell',
                  'happiness_noise', 'happiness_perso', 'happiness_other'):
    df_model[score_col] = 0.0

In [61]:
# For every district polygon, count the target points lying within it and store the
# mean of each happiness score over those points.
score_cols = ['happiness_equalCoff', 'happiness_clean', 'happiness_smell',
              'happiness_noise', 'happiness_perso', 'happiness_other']

# The means are fractional, so make sure the destination columns are floats before
# writing into them (no effect if they already are).
df_model[score_cols] = df_model[score_cols].astype(float)

# Loop-invariant: (point, score_1, ..., score_6) for every target row, built once.
target_records = list(zip(df_target['geometry'], *(df_target[col] for col in score_cols)))

for index, polygon in df_district['geometry'].items():
    totals = [0.0] * len(score_cols)
    nb_points = 0
    for point, *scores in target_records:
        if point.within(polygon):
            nb_points += 1
            for pos, score in enumerate(scores):
                totals[pos] += score
    # Assign the count instead of incrementing the stored value: re-running this cell
    # then yields the same result rather than doubling the counts and halving the means.
    df_model.loc[index, 'nb_target'] = nb_points
    for col, total in zip(score_cols, totals):
        # Districts without any target point get 0.0 (they are filtered out later on).
        df_model.loc[index, col] = total / nb_points if nb_points else 0.0

In [62]:
# Preview the first ten districts with their counts and mean scores.
df_model.head(10)

Unnamed: 0,geometry,surface,nb_target,happiness_equalCoff,happiness_clean,happiness_smell,happiness_noise,happiness_perso,happiness_other
0,"POLYGON Z ((121.42168 30.68471 0.00000, 121.42...",0.080387,0,0.0,0.0,0.0,0.0,0.0,0.0
1,"POLYGON Z ((121.41199 30.69374 0.00000, 121.41...",0.374191,0,0.0,0.0,0.0,0.0,0.0,0.0
2,"POLYGON Z ((121.40163 30.70717 0.00000, 121.40...",0.08239,0,0.0,0.0,0.0,0.0,0.0,0.0
3,"POLYGON Z ((121.33687 30.73806 0.00000, 121.33...",19.301126,0,0.0,0.0,0.0,0.0,0.0,0.0
4,"POLYGON Z ((121.25143 30.79177 0.00000, 121.25...",55.066247,0,0.0,0.0,0.0,0.0,0.0,0.0
5,"POLYGON Z ((121.36179 30.80039 0.00000, 121.36...",43.863191,1,1.666667,2.428571,1.285714,1.285714,1.6,1.444444
6,"POLYGON Z ((121.19465 30.81453 0.00000, 121.19...",46.804458,0,0.0,0.0,0.0,0.0,0.0,0.0
7,"POLYGON Z ((121.29682 30.81891 0.00000, 121.29...",34.93048,3,1.333333,2.095238,0.952381,0.952381,1.266667,1.111111
8,"POLYGON Z ((121.36713 30.84311 0.00000, 121.36...",58.785377,2,1.666667,2.428571,1.285714,1.285714,1.6,1.444444
9,"POLYGON Z ((121.17453 30.87610 0.00000, 121.17...",59.495413,0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# Split the districts: those containing at least one target point, and the rest.
df_model_keep = df_model.loc[df_model['nb_target'].gt(0)]
df_model_drop = df_model.loc[df_model['nb_target'].le(0)]

In [64]:
# Sizes of the two groups: kept districts first, dropped districts second.
print(len(df_model_keep))
print(len(df_model_drop))

129
137


In [65]:
# From here on df_model refers to the kept districts only (nb_target > 0).
df_model = df_model_keep

In [66]:
# Renumber the rows from 0. The previous labels are kept in a new 'index' column,
# which a later cell deletes.
# NOTE(review): not idempotent - a second run adds another column for the old index.
df_model = df_model.reset_index()

In [67]:
# Preview the renumbered table (the 'index' column holds the former row labels).
df_model.head()

Unnamed: 0,index,geometry,surface,nb_target,happiness_equalCoff,happiness_clean,happiness_smell,happiness_noise,happiness_perso,happiness_other
0,5,"POLYGON Z ((121.36179 30.80039 0.00000, 121.36...",43.863191,1,1.666667,2.428571,1.285714,1.285714,1.6,1.444444
1,7,"POLYGON Z ((121.29682 30.81891 0.00000, 121.29...",34.93048,3,1.333333,2.095238,0.952381,0.952381,1.266667,1.111111
2,8,"POLYGON Z ((121.36713 30.84311 0.00000, 121.36...",58.785377,2,1.666667,2.428571,1.285714,1.285714,1.6,1.444444
3,10,"POLYGON Z ((121.76851 30.88156 0.00000, 121.76...",105.243132,1,1.333333,1.714286,1.142857,1.142857,1.3,1.222222
4,11,"POLYGON Z ((121.50531 30.88954 0.00000, 121.50...",95.484156,1,1.0,1.571429,1.0,0.428571,1.1,0.666667


In [72]:
# Remove the 'index' column left behind by reset_index().
# NOTE(review): raises KeyError if this cell is run a second time.
del df_model['index']

### Save the model

In [75]:
# Persist the per-district targets.
# NOTE(review): the same path is written again further down with the merged
# features + targets table, which overwrites this file.
df_model.to_pickle("./df_model.pickle")

# 2. Collecting the features

## POI

In [69]:
# Load the per-district point-of-interest counts.
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
df_poi = pd.read_pickle("./variables/poi_per_district.pkl")
df_poi.head()

Unnamed: 0,geometry,nb_transportation,nb_shopping,nb_restaurant,nb_scenicSpot,nb_stadiumAndGym
0,"POLYGON Z ((121.42168 30.68471 0.00000, 121.42...",0,0,0,0,0
1,"POLYGON Z ((121.41199 30.69374 0.00000, 121.41...",0,0,0,0,0
2,"POLYGON Z ((121.40163 30.70717 0.00000, 121.40...",0,0,0,0,0
3,"POLYGON Z ((121.33687 30.73806 0.00000, 121.33...",1087,1789,748,28,32
4,"POLYGON Z ((121.25143 30.79177 0.00000, 121.25...",1428,1762,616,9,12


Merge df_poi with df_model to keep only the districts that contain at least one target point

In [76]:
# Join on the district geometry. pd.merge defaults to an inner join, so only
# geometries present in both tables are kept.
df_features = pd.merge(df_poi, df_model, on='geometry')

In [77]:
# Preview the merged POI counts and targets.
df_features.head()

Unnamed: 0,geometry,nb_transportation,nb_shopping,nb_restaurant,nb_scenicSpot,nb_stadiumAndGym,surface,nb_target,happiness_equalCoff,happiness_clean,happiness_smell,happiness_noise,happiness_perso,happiness_other
0,"POLYGON Z ((121.36179 30.80039 0.00000, 121.36...",1622,2152,1131,14,28,43.863191,1,1.666667,2.428571,1.285714,1.285714,1.6,1.444444
1,"POLYGON Z ((121.29682 30.81891 0.00000, 121.29...",835,374,177,12,7,34.93048,3,1.333333,2.095238,0.952381,0.952381,1.266667,1.111111
2,"POLYGON Z ((121.36713 30.84311 0.00000, 121.36...",767,346,157,2,3,58.785377,2,1.666667,2.428571,1.285714,1.285714,1.6,1.444444
3,"POLYGON Z ((121.76851 30.88156 0.00000, 121.76...",1267,529,1030,67,38,105.243132,1,1.333333,1.714286,1.142857,1.142857,1.3,1.222222
4,"POLYGON Z ((121.50531 30.88954 0.00000, 121.50...",1235,747,398,11,11,95.484156,1,1.0,1.571429,1.0,0.428571,1.1,0.666667


In [87]:
# Start the feature table from a copy of the geometry and surface columns.
df_features_proportion = pd.DataFrame.copy(df_features[['geometry', 'surface']])

In [88]:
# Turn each raw POI count into a density: count divided by the district surface.
# Whole-column division replaces five copy-pasted row-wise apply() calls; the two
# frames share the same index, so the values line up row by row.
for poi_col in ('nb_transportation', 'nb_shopping', 'nb_restaurant',
                'nb_scenicSpot', 'nb_stadiumAndGym'):
    df_features_proportion[poi_col + '_prop'] = df_features[poi_col] / df_features['surface']

In [89]:
# Display the density table.
df_features_proportion

Unnamed: 0,geometry,surface,nb_transportation_prop,nb_shopping_prop,nb_restaurant_prop,nb_scenicSpot_prop,nb_stadiumAndGym_prop
0,"POLYGON Z ((121.36179 30.80039 0.00000, 121.36...",43.863191,36.978614,49.061638,25.784718,0.319174,0.638348
1,"POLYGON Z ((121.29682 30.81891 0.00000, 121.29...",34.930480,23.904625,10.706982,5.067208,0.343540,0.200398
2,"POLYGON Z ((121.36713 30.84311 0.00000, 121.36...",58.785377,13.047462,5.885817,2.670732,0.034022,0.051033
3,"POLYGON Z ((121.76851 30.88156 0.00000, 121.76...",105.243132,12.038790,5.026456,9.786862,0.636621,0.361069
4,"POLYGON Z ((121.50531 30.88954 0.00000, 121.50...",95.484156,12.934083,7.823287,4.168231,0.115202,0.115202
...,...,...,...,...,...,...,...
124,"POLYGON Z ((121.38541 31.40271 0.00000, 121.38...",40.414199,42.856225,53.817718,27.812007,0.470132,0.692826
125,"POLYGON Z ((121.21409 31.47733 0.00000, 121.21...",104.731772,24.061466,15.019320,9.586394,0.133675,0.076386
126,"POLYGON Z ((121.70336 31.53607 0.00000, 121.70...",56.459488,20.280028,14.612247,4.286259,0.141695,0.035424
127,"POLYGON Z ((121.63335 31.63086 0.00000, 121.63...",55.313580,12.329703,3.615749,0.668913,0.018079,0.000000


In [90]:
# Summary statistics of the surface and density columns.
df_features_proportion.describe()

Unnamed: 0,surface,nb_transportation_prop,nb_shopping_prop,nb_restaurant_prop,nb_scenicSpot_prop,nb_stadiumAndGym_prop
count,129.0,129.0,129.0,129.0,129.0,129.0
mean,26.371432,159.332327,323.116293,212.9638,5.637502,7.969891
std,30.643329,162.227566,459.794018,260.331434,13.152526,9.845168
min,0.248091,10.44679,3.388763,0.668913,0.0,0.0
25%,3.266643,37.658516,47.551865,29.842439,0.392347,0.736343
50%,10.990082,85.489725,168.817467,110.288018,1.337835,3.786046
75%,39.461297,243.677076,437.143574,304.575599,4.315839,11.710965
max,138.522432,663.899651,2721.061981,1230.698177,92.470745,40.304127


In [99]:
# Display the density table again (same content as the earlier display).
df_features_proportion

Unnamed: 0,geometry,surface,nb_transportation_prop,nb_shopping_prop,nb_restaurant_prop,nb_scenicSpot_prop,nb_stadiumAndGym_prop
0,"POLYGON Z ((121.36179 30.80039 0.00000, 121.36...",43.863191,36.978614,49.061638,25.784718,0.319174,0.638348
1,"POLYGON Z ((121.29682 30.81891 0.00000, 121.29...",34.930480,23.904625,10.706982,5.067208,0.343540,0.200398
2,"POLYGON Z ((121.36713 30.84311 0.00000, 121.36...",58.785377,13.047462,5.885817,2.670732,0.034022,0.051033
3,"POLYGON Z ((121.76851 30.88156 0.00000, 121.76...",105.243132,12.038790,5.026456,9.786862,0.636621,0.361069
4,"POLYGON Z ((121.50531 30.88954 0.00000, 121.50...",95.484156,12.934083,7.823287,4.168231,0.115202,0.115202
...,...,...,...,...,...,...,...
124,"POLYGON Z ((121.38541 31.40271 0.00000, 121.38...",40.414199,42.856225,53.817718,27.812007,0.470132,0.692826
125,"POLYGON Z ((121.21409 31.47733 0.00000, 121.21...",104.731772,24.061466,15.019320,9.586394,0.133675,0.076386
126,"POLYGON Z ((121.70336 31.53607 0.00000, 121.70...",56.459488,20.280028,14.612247,4.286259,0.141695,0.035424
127,"POLYGON Z ((121.63335 31.63086 0.00000, 121.63...",55.313580,12.329703,3.615749,0.668913,0.018079,0.000000


## Green Space

In [135]:
# Load the green-space table: one polygon per row with its green area and proportion.
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
df_greenspace = pd.read_pickle("./variables/greenspaces_per_district.pickle")
df_greenspace

Unnamed: 0,new_geometry,green_area,green_area_proportion
0,"POLYGON ((121.44356 31.00937, 121.44338 31.009...",0.000076,0.026719
1,"POLYGON ((121.38734 31.03991, 121.38699 31.040...",0.000001,0.000759
2,"POLYGON ((121.42360 31.07339, 121.42303 31.073...",0.000004,0.001591
3,"POLYGON ((121.45789 31.10380, 121.45879 31.101...",0.000072,0.020270
4,"POLYGON ((121.46062 31.11422, 121.46045 31.113...",0.000058,0.083765
...,...,...,...
116,"POLYGON ((121.47736 31.38147, 121.47782 31.380...",0.000328,0.087659
117,"POLYGON ((121.50581 31.39805, 121.50581 31.398...",0.000083,0.099504
118,"POLYGON ((121.38541 31.40271, 121.38550 31.402...",0.000228,0.059405
119,"POLYGON ((121.47005 31.41888, 121.47017 31.418...",0.000043,0.010184


In [113]:
# Rename the polygon column from 'new_geometry' to 'geometry'.
df_greenspace = df_greenspace.rename(columns={"new_geometry": "geometry"})

In [118]:
# Plain-dict form of the table ({column: {row label: value}}).
# NOTE(review): not referenced by any other cell visible in this notebook.
dict_greenspace = df_greenspace.to_dict()

Add the green-space proportion only for the districts that contain target points

In [145]:
# Attach to each district the green-area proportion of the green-space polygon(s)
# lying within it (if several match, the last one wins).

# The polygon column is called 'new_geometry' in the pickle and 'geometry' once the
# rename cell has run; accept both so this cell works in either execution order
# instead of failing on a missing attribute.
green_geom_col = 'new_geometry' if 'new_geometry' in df_greenspace.columns else 'geometry'

# Start from an all-missing column so that districts without a match are explicitly
# NaN rather than keeping whatever a previous run left in the column.
df_features_proportion['green_space_prop'] = np.nan

for index, district in df_features_proportion['geometry'].items():
    for greenspace, green_prop in zip(df_greenspace[green_geom_col],
                                      df_greenspace['green_area_proportion']):
        if greenspace.within(district):
            df_features_proportion.loc[index, 'green_space_prop'] = green_prop

In [147]:
# Distinct green-space proportions assigned to the districts.
df_features_proportion['green_space_prop'].unique()

array([0.05940506, 0.02671868, 0.00075854, 0.0202701 , 0.0837652 ,
       0.06476771, 0.04133348, 0.13428385, 0.00335503, 0.00810872,
       0.10335767, 0.02555195, 0.10193514, 0.06165221, 0.02313382,
       0.02162003, 0.01373103, 0.06099939, 0.00616982, 0.06262897,
       0.02209065, 0.02270185, 0.03321141, 0.01116084, 0.04238992,
       0.02736485, 0.14818171, 0.02374664, 0.04728594, 0.1334503 ,
       0.06891046, 0.0331211 , 0.00201168, 0.03661703, 0.11267493,
       0.11257117, 0.19405408, 0.00348518, 0.11837304, 0.00891548,
       0.00632928, 0.04146645, 0.01357381, 0.0280034 , 0.00044925,
       0.00126697, 0.01197261, 0.00134831, 0.07210172, 0.02306575,
       0.00850668, 0.09172931, 0.03197232, 0.03110707, 0.02085812,
       0.06254409, 0.06417438, 0.12752838, 0.00260702, 0.04491543,
       0.06922475, 0.00138085, 0.03633956, 0.02830929, 0.02743967,
       0.03516942, 0.01900143, 0.06374445, 0.00822663, 0.00137135,
       0.03604528, 0.07331035, 0.00749972, 0.07768961, 0.01341

## Mobike 

In [149]:
# Load the per-district Mobike counts and proportions.
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
df_mobike = pd.read_pickle("./variables/mobike_per_district.pickle")
df_mobike.head()

Unnamed: 0,geometry,nb_mobike,nb_mobike_proportion
0,"POLYGON Z ((121.42168 30.68471 0.00000, 121.42...",0,0.0
1,"POLYGON Z ((121.41199 30.69374 0.00000, 121.41...",0,0.0
2,"POLYGON Z ((121.40163 30.70717 0.00000, 121.40...",0,0.0
3,"POLYGON Z ((121.33687 30.73806 0.00000, 121.33...",0,0.0
4,"POLYGON Z ((121.25143 30.79177 0.00000, 121.25...",0,0.0


In [152]:
# Copy the Mobike proportion onto every district whose geometry equals a geometry of
# the Mobike table (if several rows match, the last one wins).
for feature_idx, district in df_features_proportion['geometry'].items():
    for mobike_geom, mobike_prop in zip(df_mobike['geometry'],
                                        df_mobike['nb_mobike_proportion']):
        if district == mobike_geom:
            df_features_proportion.loc[feature_idx, 'nb_mobike_prop'] = mobike_prop

In [153]:
# Preview the feature table with the green-space and Mobike columns.
df_features_proportion.head()

Unnamed: 0,geometry,surface,nb_transportation_prop,nb_shopping_prop,nb_restaurant_prop,nb_scenicSpot_prop,nb_stadiumAndGym_prop,green_space_prop,nb_mobike_prop
0,"POLYGON Z ((121.36179 30.80039 0.00000, 121.36...",43.863191,36.978614,49.061638,25.784718,0.319174,0.638348,0.059405,0.0
1,"POLYGON Z ((121.29682 30.81891 0.00000, 121.29...",34.93048,23.904625,10.706982,5.067208,0.34354,0.200398,0.059405,0.0
2,"POLYGON Z ((121.36713 30.84311 0.00000, 121.36...",58.785377,13.047462,5.885817,2.670732,0.034022,0.051033,0.059405,0.034022
3,"POLYGON Z ((121.76851 30.88156 0.00000, 121.76...",105.243132,12.03879,5.026456,9.786862,0.636621,0.361069,0.059405,0.199538
4,"POLYGON Z ((121.50531 30.88954 0.00000, 121.50...",95.484156,12.934083,7.823287,4.168231,0.115202,0.115202,0.059405,0.104729


# Clean features

In [162]:
# Drop the rows where every feature is missing, counting a zero as missing.
# np.nan replaces the pd.np alias, which was deprecated in pandas 1.0 and removed in
# pandas 2.0 (numpy is already imported as np at the top of the notebook).
columns = ['nb_mobike_prop', 'nb_transportation_prop', 'nb_shopping_prop', 'nb_restaurant_prop', 'nb_scenicSpot_prop', 'nb_stadiumAndGym_prop', 'green_space_prop']
df_features_proportion = df_features_proportion.replace(0, np.nan).dropna(axis=0, how='all', subset=columns)

In [163]:
# Put zeros back for the remaining missing feature values and store the feature
# columns as floats.
df_features_proportion[columns] = df_features_proportion[columns].fillna(0).astype(float)

In [165]:
# Persist the cleaned feature table.
df_features_proportion.to_pickle("./variables/df_features.pickle")

# 3. Model - Final dataset ready for prediction

In [171]:
# Join the features with the per-district targets on the geometry (inner join).
# NOTE(review): both inputs carry a 'surface' column when built by the cells above,
# so the result holds surface_x / surface_y. The cell also overwrites df_model with
# the merged result, so re-running it merges an already merged table.
df_model = pd.merge(df_features_proportion, df_model, on='geometry')

In [173]:
# Preview the merged features + targets table.
df_model.head()

Unnamed: 0,geometry,nb_transportation_prop,nb_shopping_prop,nb_restaurant_prop,nb_scenicSpot_prop,nb_stadiumAndGym_prop,green_space_prop,nb_mobike_prop,happiness_equalCoff,happiness_clean,happiness_smell,happiness_noise,happiness_perso,happiness_other
0,"POLYGON Z ((121.36179 30.80039 0.00000, 121.36...",36.978614,49.061638,25.784718,0.319174,0.638348,0.059405,0.0,1.666667,2.428571,1.285714,1.285714,1.6,1.444444
1,"POLYGON Z ((121.29682 30.81891 0.00000, 121.29...",23.904625,10.706982,5.067208,0.34354,0.200398,0.059405,0.0,1.333333,2.095238,0.952381,0.952381,1.266667,1.111111
2,"POLYGON Z ((121.36713 30.84311 0.00000, 121.36...",13.047462,5.885817,2.670732,0.034022,0.051033,0.059405,0.034022,1.666667,2.428571,1.285714,1.285714,1.6,1.444444
3,"POLYGON Z ((121.76851 30.88156 0.00000, 121.76...",12.03879,5.026456,9.786862,0.636621,0.361069,0.059405,0.199538,1.333333,1.714286,1.142857,1.142857,1.3,1.222222
4,"POLYGON Z ((121.50531 30.88954 0.00000, 121.50...",12.934083,7.823287,4.168231,0.115202,0.115202,0.059405,0.104729,1.0,1.571429,1.0,0.428571,1.1,0.666667


In [226]:
# (rows, columns) of the merged table.
df_model.shape

(129, 11)

129 observations and 7 features

In [174]:
# Persist the merged features + targets table (same path as the earlier targets-only save).
df_model.to_pickle("./df_model.pickle")

## Scale features

In [175]:
# Column names available before scaling.
df_model.columns

Index(['geometry', 'nb_transportation_prop', 'nb_shopping_prop',
       'nb_restaurant_prop', 'nb_scenicSpot_prop', 'nb_stadiumAndGym_prop',
       'green_space_prop', 'nb_mobike_prop', 'happiness_equalCoff',
       'happiness_clean', 'happiness_smell', 'happiness_noise',
       'happiness_perso', 'happiness_other'],
      dtype='object')

In [177]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature: centred on its mean and scaled to unit variance.
scaler = StandardScaler()

# fit_transform expects 2-D input, hence the double brackets selecting a one-column frame.
# The scaler is re-fitted for each column in turn, so after the loop it only holds the
# statistics of the last column processed (green_space_prop).
scaling_plan = [
    ('nb_transportation_prop', 'nb_transportation_scaled'),
    ('nb_shopping_prop', 'nb_shopping_scaled'),
    ('nb_restaurant_prop', 'nb_restaurant_scaled'),
    ('nb_scenicSpot_prop', 'nb_scenicSpot_scaled'),
    ('nb_stadiumAndGym_prop', 'nb_stadiumAndGym_scaled'),
    ('nb_mobike_prop', 'nb_mobike_scaled'),
    ('green_space_prop', 'green_space_scaled'),
]
for source_col, scaled_col in scaling_plan:
    df_model[scaled_col] = scaler.fit_transform(df_model[[source_col]])

In [178]:
# Mean learned by the most recent fit only: the scaler was re-fitted for every
# column, so this is the mean of green_space_prop.
print(scaler.mean_)

[0.05048683]


In [181]:
# Final modelling table: the seven scaled features followed by the six candidate targets.
scaled_feature_cols = ['nb_transportation_scaled', 'nb_shopping_scaled', 'nb_restaurant_scaled',
                       'nb_scenicSpot_scaled', 'nb_stadiumAndGym_scaled', 'nb_mobike_scaled',
                       'green_space_scaled']
happiness_target_cols = ['happiness_equalCoff', 'happiness_clean', 'happiness_smell',
                         'happiness_noise', 'happiness_perso', 'happiness_other']
df_predict = df_model[scaled_feature_cols + happiness_target_cols]

In [182]:
# Persist the modelling table (relative path, resolved against the working directory).
df_predict.to_pickle("df_prediction.pkl")