In [144]:
from datetime import time

import pandas as pd
import numpy as np

from DataInterface.TrafficDataInterface import CityTrafficData, TrafficType
from DataInterface.GeoDataInterface import GeoData, GeoDataType
from DataInterface.AdminDataInterface import AdminData
from DataInterface.ElectionDataInterface import ElectionData
from Utils import City
from FeatureExtraction.SessionDistribution.SessionDistributionCalculator import SessionDistributionCalculator
from FeatureExtraction.IrisFeatureCalculator import IrisFeatureCalculator
from FeatureExtraction.ServiceConsumptionFeatureCalculator import ServiceConsumptionFeatureCalculator, ServiceConsumptionFeatureName
from FeatureExtraction.ElectionFeatureCalculator import ElectionFeatureCalculator, ElectionFeatureName
from FeatureSelection.Regression import Regression

In [145]:
# Pick up edits to the project modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# DATA

In [4]:
# User-traffic data for Paris at IRIS granularity.
# Loading is slow: the recorded run needed a little over a minute.
traffic_data = CityTrafficData(
    city=City.PARIS,
    geo_data_type=GeoDataType.IRIS,
    traffic_type=TrafficType.USERS,
)

100%|██████████| 77/77 [01:13<00:00,  1.04it/s]


In [5]:
# Geographic layer at IRIS level; reused below for map plots, feature construction
# and the election table.
geo_data = GeoData()
geo_data.load(GeoDataType.IRIS)

In [8]:
# Administrative table; source of the income columns and of the IRIS features below.
admin_data = AdminData()

# SLEEP DATA ANALYSIS

In [11]:
# Restrict the session analysis to the 21:30 -> 03:30 window.
# NOTE(review): `end` is earlier than `start` on the clock, so the window is
# presumably meant to wrap past midnight — confirm the calculator handles that.
session_distribution_calculator = SessionDistributionCalculator(
    city_traffic_data=traffic_data,
    start=time(21, 30),
    end=time(3, 30),
)

In [12]:
# Run the calculator configured above; the result is plotted and summarised in the next cells.
session_distribution = session_distribution_calculator.calculate_session_distribution()

In [13]:
# Visual check of the computed session distribution.
session_distribution.distribution_plot()

In [97]:
# Per-location summary of the distribution. Its `.data` table carries the
# `session_expectation`, `std` and `n_obs` columns that are merged further down.
session_expectation = session_distribution.expectation_by_location()

In [21]:
# Draw the per-location expectation using the IRIS geographic layer.
session_expectation.geo_plot(geo_data=geo_data)

In [22]:
# Scatter view of the expectation with confidence intervals switched on.
session_expectation.scatter_plot(confidence_intervals=True)

In [52]:
# Index values of the expectation table; passed as `subset=` to every feature call
# below so that all features cover the same locations.
iris_subset = session_expectation.data.index.values

In [53]:
# Single entry point for all IRIS-level covariates built in the cells below.
iris_feature_constructor = IrisFeatureCalculator(
    admin_data=admin_data,
    geo_data=geo_data,
)

### Variables considered

In [54]:
# Centrality feature for the selected IRIS; `.data` exposes the table that is merged later.
iris_centrality = iris_feature_constructor.centrality(subset=iris_subset).data

In [79]:
# City-density feature; later used both as a merged column and as the colour
# dimension of the income regression plot.
iris_density_of_city = iris_feature_constructor.density_of_city(subset=iris_subset).data

In [81]:
# Density of the `P19_POP` variable, exposed under the readable label `POP_DEN`.
iris_pop_density = (
    iris_feature_constructor
    .var_density(subset=iris_subset, var_names=['P19_POP'])
    .rename(columns={'P19_POP': 'POP_DEN'})
)

In [82]:
# Business-density feature; it is shifted by its 10th percentile before the log
# transform in the merge cell.
iris_business_density = iris_feature_constructor.business_density(subset=iris_subset).data

In [83]:
# Shares for the age-bracket variables (15-29 up to 75+, judging by the column codes).
age_shares = iris_feature_constructor.var_shares(
    subset=iris_subset,
    var_names=[
        'P19_POP1529',
        'P19_POP3044',
        'P19_POP4559',
        'P19_POP6074',
        'P19_POP75P',
    ],
)

In [84]:
# Share for `P19_POP15P_PSEUL` (presumably people aged 15+ living alone — confirm
# against the census data dictionary).
lonely_shares = iris_feature_constructor.var_shares(subset=iris_subset, var_names=['P19_POP15P_PSEUL'])

In [85]:
# Income columns taken straight from the admin table (no share/density transform).
# NOTE(review): presumably median income and an inequality index — confirm.
income = admin_data.get_admin_data(subset=iris_subset)[['DEC_MED19', 'DEC_GI19']]

In [86]:
# Share for `P19_ACTOCC15P_ILT2P`, used here as a proxy for workers whose job is far from home.
shares_of_workers_with_far_work = iris_feature_constructor.var_shares(subset=iris_subset, var_names=['P19_ACTOCC15P_ILT2P'])

In [87]:
# Shares for three education-level variables.
education = iris_feature_constructor.var_shares(
    subset=iris_subset,
    var_names=[
        'P19_ACT_DIPLMIN',
        'P19_ACT_BAC',
        'P19_ACT_SUP2',
    ],
)

In [88]:
# Shares for the unemployment variables, split into three age brackets.
unemployment = iris_feature_constructor.var_shares(
    subset=iris_subset,
    var_names=[
        'P19_CHOM1524',
        'P19_CHOM2554',
        'P19_CHOM5564',
    ],
)

In [89]:
# Share for the retirees variable `P19_RETR1564`.
retirees = iris_feature_constructor.var_shares(subset=iris_subset, var_names=['P19_RETR1564'])

In [90]:
# Share for `P19_POP_ETR` (presumably foreign-national population — confirm).
nationality = iris_feature_constructor.var_shares(subset=iris_subset, var_names=['P19_POP_ETR'])

In [91]:
# Shares for the dwelling-size classes (under 30 m2 up to 120 m2 and more, per the column codes).
house_sizes = iris_feature_constructor.var_shares(
    subset=iris_subset,
    var_names=[
        'P19_RP_M30M2',
        'P19_RP_3040M2',
        'P19_RP_4060M2',
        'P19_RP_6080M2',
        'P19_RP_80100M2',
        'P19_RP_100120M2',
        'P19_RP_120M2P',
    ],
)

In [92]:
# Share for `P19_POPH`, used here as the share of men.
men = iris_feature_constructor.var_shares(subset=iris_subset, var_names=['P19_POPH'])

In [93]:
# Shares for the two family-related variables (`P19_POP15P_MARIEE`, `C19_FAM`).
families = iris_feature_constructor.var_shares(subset=iris_subset, var_names=['P19_POP15P_MARIEE', 'C19_FAM'])

In [94]:
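# TODO: the services feature is currently disabled. The call below uses names
# (`feature_constructor`, `iris_shares`, `iris=`) that do not match the other
# cells of this notebook and must be updated before it is re-enabled.
# services = feature_constructor.iris_shares(iris=iris_subset, var_names=['EQUIP_E107', 'EQUIP_E108', 'EQUIP_E109', 'EQUIP_A504'])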

In [95]:
# Shares for the commuting-mode variables of the employed population.
displacement_to_work = iris_feature_constructor.var_shares(
    subset=iris_subset,
    var_names=[
        'C19_ACTOCC15P_MAR',
        'C19_ACTOCC15P_TCOM',
        'C19_ACTOCC15P_PAS',
        'C19_ACTOCC15P_VELO',
        'C19_ACTOCC15P_VOIT',
    ],
)

In [98]:
# Underlying table of the expectation object; this is what gets merged with the covariates.
session_expectation_data = session_expectation.data

In [99]:
# Build the modelling table in two stages, joining everything on the shared index.
#
# Stage 1: frames that enter in log scale. Business density is shifted by its
# 10th percentile before the log so that zero-density rows do not hit log(0).
log_scale_frames = [
    iris_pop_density,
    session_expectation_data,
    iris_density_of_city,
    iris_centrality,
    income,
    iris_business_density + np.quantile(iris_business_density, 0.1),
]
# Stage 2: outputs of `var_shares`, merged untransformed after the log step.
linear_scale_frames = [
    age_shares,
    lonely_shares,
    shares_of_workers_with_far_work,
    education,
    unemployment,
    retirees,
    nationality,
    house_sizes,
    men,
    families,
    displacement_to_work,
]

data = log_scale_frames[0]
for frame in log_scale_frames[1:]:
    data = pd.merge(data, frame, left_index=True, right_index=True, how='inner')
data = np.log(data)

for frame in linear_scale_frames:
    data = pd.merge(data, frame, left_index=True, right_index=True, how='inner')

# np.log maps 0 to -inf, and dropna() does not treat +/-inf as missing. Convert
# them to NaN first so that rows with a non-finite value are removed together
# with the genuinely missing ones instead of leaking into the regression.
data = data.replace([np.inf, -np.inf], np.nan).dropna()
data.shape, np.array(data.columns)

((2684, 39),
 array(['POP_DEN', 'session_expectation', 'std', 'n_obs',
        'density_of_city', 'centrality', 'DEC_MED19', 'DEC_GI19',
        'business_density', 'P19_POP1529', 'P19_POP3044', 'P19_POP4559',
        'P19_POP6074', 'P19_POP75P', 'P19_POP15P_PSEUL',
        'P19_ACTOCC15P_ILT2P', 'P19_ACT_DIPLMIN', 'P19_ACT_BAC',
        'P19_ACT_SUP2', 'P19_CHOM1524', 'P19_CHOM2554', 'P19_CHOM5564',
        'P19_RETR1564', 'P19_POP_ETR', 'P19_RP_M30M2', 'P19_RP_3040M2',
        'P19_RP_4060M2', 'P19_RP_6080M2', 'P19_RP_80100M2',
        'P19_RP_100120M2', 'P19_RP_120M2P', 'P19_POPH',
        'P19_POP15P_MARIEE', 'C19_FAM', 'C19_ACTOCC15P_MAR',
        'C19_ACTOCC15P_TCOM', 'C19_ACTOCC15P_PAS', 'C19_ACTOCC15P_VELO',
        'C19_ACTOCC15P_VOIT'], dtype=object))

In [100]:
# Regress the session expectation on a single covariate (`x_axis`).
# `features` keeps every candidate covariate, but only the `x_axis` column is
# handed to the regression; `density_of_city` is used purely to colour the plot.
x_axis = 'DEC_MED19'
features = data.loc[:, ~data.columns.isin(['session_expectation', 'density_of_city'])]
labels = data[['session_expectation']]
regression = Regression(features=features[[x_axis]], labels=labels)
regression.plot(x_axis=x_axis, color=data[['density_of_city']])

# ELECTION DATA ANALYSIS

In [102]:
# Election results source; aggregated to IRIS level a few cells below.
election_data = ElectionData()

In [105]:
# Service consumption per location, restricted to the 18:30 -> 23:00 window.
service_consumption_by_location = traffic_data.get_service_consumption_by_location(
    start=time(18, 30),
    end=time(23),
)

In [147]:
# Feature calculator operating on the evening service-consumption table.
service_consumption_feature_calculator = ServiceConsumptionFeatureCalculator(
    service_consumption_by_location=service_consumption_by_location,
)

In [107]:
# Mean of `pct_votes_to_list_among_votes` per `list_number`, for the IRIS that
# appear in the traffic data. The final division by 100 rescales the percentage
# column (presumably to fractions in [0, 1]).
election_data_by_iris = election_data.get_election_data_table_iris_by_column(
    geo_data=geo_data,
    subset=traffic_data.data.coords[GeoDataType.IRIS.value].values,
    column='list_number',
    value='pct_votes_to_list_among_votes',
    aggregation_method='mean',
) / 100

In [113]:
# Feature calculator operating on the per-IRIS vote-share table.
election_feature_calculator = ElectionFeatureCalculator(election_result_by_location=election_data_by_iris)

In [139]:
# Entropy of the election result per location; the column is renamed so it does
# not clash with the service-consumption entropy when the two tables are merged.
entropy_election_result = (
    election_feature_calculator
    .get_election_feature(feature=ElectionFeatureName.ENTROPY)
    .data
    .rename(columns={'entropy': 'entropy_election_result'})
)

In [151]:
# Entropy of service consumption per location, renamed to `entropy_service`
# ahead of the merge with the election entropy.
entropy_service_consumption = (
    service_consumption_feature_calculator
    .get_consumption_feature(feature=ServiceConsumptionFeatureName.ENTROPY)
    .data
    .rename(columns={'entropy': 'entropy_service'})
)

In [156]:
# Align both entropy tables on their shared index and keep only complete rows.
data_election = pd.merge(
    entropy_service_consumption,
    entropy_election_result,
    left_index=True,
    right_index=True,
    how='inner',
).dropna()
data_election.shape, np.array(data_election.columns)

((2930, 2),
 array(['entropy_service', 'entropy_election_result'], dtype=object))

In [157]:
# Regress election-result entropy on service-consumption entropy.
# Note: this rebinds `x_axis`, `features`, `labels` and `regression` from the
# income regression earlier in the notebook.
x_axis = 'entropy_service'
features = data_election[[x_axis]]
labels = data_election[['entropy_election_result']]
regression = Regression(features=features, labels=labels)
regression.plot(x_axis=x_axis)