In [135]:
import os
import mlflow
import numpy as np
import pandas as pd

from math import sqrt
from joblib import delayed
from joblib import Parallel

from warnings import catch_warnings
from warnings import filterwarnings
from multiprocessing import cpu_count
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from matplotlib import pyplot
import matplotlib.dates as mdates

from pathlib import Path
from functools import reduce
from datetime import datetime

import random
from scipy.stats import ttest_ind
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression, RidgeCV, Ridge, LogisticRegression
from regressors import stats
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

Load the unemployment rate

In [136]:
# Unemployment-rate observations indexed by the `ags5` code, restricted to
# dates from 2007-05-01 onwards and to the two columns used below.
df = (
    pd.read_csv(
        './../../final_dfs/for_modeling/df_final_date_wide_2007.csv',
        converters={'ags2': str, 'ags5': str},
    )
    .set_index('ags5', drop=True)
)
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df = df.loc[df['date'] >= '2007-05-01', ['date', 'unemployment_rate']]
df

Unnamed: 0_level_0,date,unemployment_rate
ags5,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,2007-05-01,12.7
1001,2007-06-01,12.2
1001,2007-07-01,12.5
1001,2007-08-01,12.3
1001,2007-09-01,11.7
...,...,...
16073,2021-05-01,6.0
16074,2021-05-01,4.6
16075,2021-05-01,4.8
16076,2021-05-01,4.9


Load the structural variables

In [205]:
# Structural variables indexed by the `ags5` code, without the columns that
# are not used in the propensity model.
df_stuctural = (
    pd.read_csv(
        './../../final_dfs/for_modeling/df_final_stationery.csv',
        converters={'ags2': str, 'ags5': str},
    )
    .set_index('ags5', drop=True)
    .drop(['grw_funding_framework', "Unnamed:_0", 'support_area_status', 'debtor_quota'], axis=1)
)
# Shift both indicators down by one (presumably stored as 1/2 in the CSV so
# that they become 0/1 here -- TODO confirm against the data dictionary).
df_stuctural[['urban_/_rural', 'east_west']] -= 1

In [193]:
# Names of the structural variables that remain after loading.
df_stuctural.columns.tolist()

['cluster',
 'kreis',
 'ags2',
 'supermarkets_population',
 'supermarkets_average_distance',
 'public_transport_availability',
 'average_distance_bus_stop',
 'average_distance_train_station',
 'average_distance_public_transport',
 'number_of_students',
 'number_of_hospitals',
 'number_of_hospital_beds',
 'number_of_hospital_beds_adj',
 'hospital_patiants',
 'households_of_1_person',
 'households_of_2_person',
 'households_of_3_person',
 'households_of_4_person',
 'households_of_5_person_or_more',
 'household_with_kids',
 'household_with_kids_under_3',
 'household_with_kids_over_3_under_6',
 'household_with_kids_over_6_under_10',
 'household_with_kids_over_10_under_15',
 'household_with_kids_over_15_under_18',
 'household_with_double_income_no_kids',
 'car_density',
 'no_of_paths_per_person_and_day',
 'kilometers_per_person_and_day',
 '_percentage_out_of_home',
 'share_of_journeys_on_foot',
 'share_of_journeys_on_bike',
 'proportion_of_motorised_vehicle_passenger',
 'share_of_motorised_

## Average treatment effect (ATE) for the binary columns

In [194]:
def ATE_IPTW(df_stuctural, df, column, ps_clip=None, verbose=True):
    """Estimate the average treatment effect of a binary column by IPTW.

    A logistic regression of ``column`` on the remaining structural variables
    yields one propensity score per row of ``df_stuctural``. The scores are
    joined onto ``df`` by index and the unemployment rate is weighted by the
    inverse propensity: mean(T * Y / ps) - mean((1 - T) * Y / (1 - ps)).

    Parameters
    ----------
    df_stuctural : pandas.DataFrame
        Structural variables, one row per index value. Must contain 'kreis',
        'cluster', 'east_west', 'eligible_area', 'urban_/_rural' and
        ``column``. The frame is NOT modified.
    df : pandas.DataFrame
        Outcome frame with an 'unemployment_rate' column; joined to
        ``df_stuctural`` on the index.
    column : str
        Name of the treatment column; it must be coded 0/1.
    ps_clip : float, optional
        If given (0 < ps_clip < 0.5), propensity scores are clipped to
        [ps_clip, 1 - ps_clip] before weighting, which avoids infinite
        weights when the model separates the groups. ``None`` (default)
        applies no clipping.
    verbose : bool, default True
        Print the sorted coefficient table and the propensity scores.

    Returns
    -------
    float
        Weighted mean outcome under treatment minus weighted mean outcome
        under control.

    Raises
    ------
    ValueError
        If ``column`` is not coded exactly as 0/1, or ``ps_clip`` is outside
        the open interval (0, 0.5).

    Notes
    -----
    NOTE(review): the features are not scaled, so the solver may stop at
    ``max_iter`` and return scores extremely close to 0 or 1; consider
    standardising the covariates and/or passing ``ps_clip``.
    """
    treatment = df_stuctural[column]
    # The estimator below multiplies by T and (1 - T), so anything other than
    # a 0/1 coding would silently produce a meaningless number.
    if set(treatment.unique()) != {0, 1}:
        raise ValueError(f"column {column!r} must be coded as 0/1")
    if ps_clip is not None and not 0 < ps_clip < 0.5:
        raise ValueError("ps_clip must lie strictly between 0 and 0.5")

    # First, calculate the propensity score.
    X = df_stuctural.drop(['kreis', column], axis=1)
    # 'pred' and 'diff' are scratch columns that earlier runs of the helpers
    # in this notebook may have written into the shared frame; using them as
    # covariates would leak a previous model's output into this one.
    X = X.drop(columns=['pred', 'diff'], errors='ignore')

    # Dummy-encode the categorical variables in X.
    categorical = [
        name
        for name in ('east_west', 'eligible_area', 'urban_/_rural', 'cluster')
        if name != column
    ]
    X = X.astype({name: str for name in categorical})
    X = pd.get_dummies(data=X, drop_first=True)

    # scikit-learn expects a 1-d target; a one-column frame triggers a
    # DataConversionWarning.
    y = treatment.astype(int).to_numpy()

    regr = LogisticRegression(random_state=0, max_iter=10000)
    regr.fit(X, y, sample_weight=None)

    coefficients = pd.DataFrame(
        {'variable': X.columns, 'coef': regr.coef_.ravel()}
    ).sort_values(by=['coef'])

    pred = regr.predict_proba(X)[:, 1]
    if ps_clip is not None:
        pred = np.clip(pred, ps_clip, 1 - ps_clip)

    if verbose:
        print(coefficients)
        print(pred)

    # Join only what the estimator needs, on a fresh frame, so the caller's
    # structural frame stays untouched.
    scores = df_stuctural[[column]].assign(pred=pred)
    df_mixed = pd.merge(df, scores, left_index=True, right_index=True)

    treated = df_mixed[column]
    outcome = df_mixed['unemployment_rate']
    Y_1 = (treated * outcome / df_mixed['pred']).mean()
    Y_0 = ((1 - treated) * outcome / (1 - df_mixed['pred'])).mean()
    return Y_1 - Y_0

In [195]:
 # IPTW estimate for 'eligible_area' on the unemployment rate.
 ATE_IPTW(df_stuctural=df_stuctural, df=df, column='eligible_area')

  return f(*args, **kwargs)


                                      variable      coef
99      number_of_companies_arts_entertainment -0.011984
112                                no_of_clubs -0.011830
46                    2018_population_15_to_18 -0.010266
49                    2018_population_25_to_30 -0.009818
35                    municipal_tax_per_capita -0.009812
..                                         ...       ...
88   number_of_companies_repair_motor_vehicles  0.009956
117                       no_of_tourism_points  0.010136
20        household_with_kids_over_10_under_15  0.011993
50                    2018_population_30_to_35  0.015538
21        household_with_kids_over_15_under_18  0.022641

[187 rows x 2 columns]
[8.97476284e-001 9.99906211e-001 1.00000000e+000 9.99867215e-001
 9.99950955e-001 7.32492381e-001 9.99999947e-001 9.99998971e-001
 8.26946740e-001 9.99308740e-001 9.83227894e-001 9.98840139e-001
 2.96495740e-001 9.99520530e-001 6.03737174e-007 5.64461337e-119
 1.07958467e-002 1.56821422e-001

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


8.576648405345228

In [160]:
 # IPTW estimate for 'east_west' on the unemployment rate.
 ATE_IPTW(df_stuctural=df_stuctural, df=df, column='east_west')

                        variable      coef
65         no_of_foreigners_2018 -0.004596
48      2018_population_20_to_25 -0.004028
61      2019_population_15_to_35 -0.003719
12        households_of_2_person -0.003403
78                number_of_beds -0.002456
..                           ...       ...
154  household_in_income_calss_2  0.001886
11        households_of_1_person  0.002074
62      2019_population_35_to_60  0.002281
51      2018_population_35_to_40  0.003487
50      2018_population_30_to_35  0.004234

[197 rows x 2 columns]
[3.54633371e-023 5.47666237e-031 5.28941821e-007 1.87804411e-012
 1.50824775e-034 1.92250230e-027 7.10666996e-082 3.37732791e-040
 1.15890909e-063 4.40377512e-020 3.48996782e-031 1.52226716e-023
 2.64691290e-045 2.92341443e-021 1.05815107e-043 3.74612965e-181
 1.35207024e-012 2.84247294e-025 3.61192385e-015 6.63638676e-021
 1.27195403e-016 3.68692380e-016 1.60471096e-007 9.59496925e-016
 1.92701942e-011 1.54750618e-029 1.40745052e-105 5.61023557e-038
 3.51

  return f(*args, **kwargs)


7.7274163257985125

In [161]:
 # IPTW estimate for 'urban_/_rural' on the unemployment rate.
 ATE_IPTW(df_stuctural=df_stuctural, df=df, column='urban_/_rural')

  return f(*args, **kwargs)


                                  variable      coef
87        number_of_companies_construction -0.023412
91       number_of_companies_communication -0.023001
84       number_of_companies_manufacturing -0.021572
19     household_with_kids_over_6_under_10 -0.019404
23                             car_density -0.017803
..                                     ...       ...
117                   no_of_tourism_points  0.023987
47                2018_population_18_to_20  0.026298
82         number_of_companies_agriculture  0.027081
32   public_transport_per_1000_inhabitants  0.040370
109                         no_of_branches  0.046682

[197 rows x 2 columns]
[9.96484745e-001 6.62303711e-005 1.79610987e-008 9.94983993e-001
 1.00000000e+000 1.00000000e+000 1.00000000e+000 9.91365069e-001
 1.57659861e-005 9.99999953e-001 8.28894083e-001 1.00000000e+000
 9.37816385e-001 9.99999974e-001 3.24288996e-006 2.75366464e-172
 2.41026199e-026 2.77047566e-001 6.33155375e-006 1.00000000e+000
 8.94519946e-00

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


3.782704038993835

In [182]:
def perfct_sep(df_stuctural, df, column):
    """Summarise the gap between observed treatment and fitted propensity.

    Fits the same logistic propensity model as the IPTW estimate (``column``
    regressed on the remaining structural variables) and describes
    ``treatment - propensity`` rounded to four decimals. A distribution
    concentrated at 0 means the model reproduces the treatment almost
    exactly, i.e. the groups are (nearly) perfectly separated.

    Parameters
    ----------
    df_stuctural : pandas.DataFrame
        Structural variables. Must contain 'kreis', 'cluster', 'east_west',
        'eligible_area', 'urban_/_rural' and ``column``. The frame is NOT
        modified.
    df : pandas.DataFrame
        Unused; kept so the signature mirrors ``ATE_IPTW``.
    column : str
        Name of the treatment column; it must be coded 0/1.

    Returns
    -------
    pandas.Series
        ``describe()`` of the rounded differences, named 'diff'.

    Raises
    ------
    ValueError
        If ``column`` is not coded exactly as 0/1.
    """
    treatment = df_stuctural[column]
    # The difference below is only meaningful against a 0/1 indicator.
    if set(treatment.unique()) != {0, 1}:
        raise ValueError(f"column {column!r} must be coded as 0/1")

    # First, calculate the propensity score.
    X = df_stuctural.drop(['kreis', column], axis=1)
    # 'pred' and 'diff' are scratch columns that earlier runs of the helpers
    # in this notebook may have written into the shared frame; keeping them
    # as covariates would leak a previous model's output into this one.
    X = X.drop(columns=['pred', 'diff'], errors='ignore')

    # Dummy-encode the categorical variables in X.
    categorical = [
        name
        for name in ('east_west', 'eligible_area', 'urban_/_rural', 'cluster')
        if name != column
    ]
    X = X.astype({name: str for name in categorical})
    X = pd.get_dummies(data=X, drop_first=True)

    # scikit-learn expects a 1-d target; a one-column frame triggers a
    # DataConversionWarning.
    y = treatment.astype(int).to_numpy()

    regr = LogisticRegression(random_state=0, max_iter=10000)
    regr.fit(X, y, sample_weight=None)
    pred = regr.predict_proba(X)[:, 1]

    # Computed on a detached Series so the caller's frame gains no columns.
    diff = (treatment - pred).round(4).rename('diff')
    return diff.describe()

In [183]:
# Separation check for the 'urban_/_rural' propensity model.
perfct_sep(df_stuctural=df_stuctural, df=df, column='urban_/_rural')

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


count    401.000000
mean      -0.000422
std        0.066652
min       -0.578600
25%        0.000000
50%        0.000000
75%        0.000000
max        0.459300
Name: diff, dtype: float64

In [184]:
# Separation check for the 'east_west' propensity model.
perfct_sep(df_stuctural=df_stuctural, df=df, column='east_west')

  return f(*args, **kwargs)


count    401.0
mean       0.0
std        0.0
min       -0.0
25%        0.0
50%        0.0
75%        0.0
max       -0.0
Name: diff, dtype: float64

In [196]:
# Separation check for the 'eligible_area' propensity model.
perfct_sep(df_stuctural=df_stuctural, df=df, column='eligible_area')

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


count    401.000000
mean      -0.000192
std        0.119332
min       -0.561200
25%        0.000000
50%        0.000000
75%        0.000000
max        0.999900
Name: diff, dtype: float64

In [206]:
# Structural variables whose correlation with 'eligible_area' exceeds 0.35.
# The correlation matrix is built once, and only from numeric/boolean columns:
# 'ags2' is loaded as a string, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0 instead of silently dropping them.
eligible_area_corr = (
    df_stuctural.select_dtypes(include=['number', 'bool']).corr()['eligible_area']
)
eligible_area_corr[eligible_area_corr > 0.35]

Commute_within_150km    0.396218
Commute_within_300km    0.402465
labor_market_type       0.374534
east_west               0.568270
eligible_area           1.000000
Name: eligible_area, dtype: float64