In [1]:
import os
import mlflow
import numpy as np
import pandas as pd

from math import sqrt
from joblib import delayed
from joblib import Parallel

from warnings import catch_warnings
from warnings import filterwarnings
from multiprocessing import cpu_count
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from matplotlib import pyplot
import matplotlib.dates as mdates

from pathlib import Path
from functools import reduce
from datetime import datetime

import random
from scipy.stats import ttest_ind
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression, RidgeCV, Ridge, LogisticRegression
from regressors import stats
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

## Upload unemployment rate

In [49]:
df = pd.read_csv('./../../final_dfs/for_modeling/df_final_date_wide_2007.csv', converters={'ags2': str, 'ags5': str} )
df.set_index('ags5', drop=True, inplace=True)
df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d')
df = df[df['date'] >= '2007-05-01']
df = df[['date','unemployment_rate']]
df

Unnamed: 0_level_0,date,unemployment_rate
ags5,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,2007-05-01,12.7
1001,2007-06-01,12.2
1001,2007-07-01,12.5
1001,2007-08-01,12.3
1001,2007-09-01,11.7
...,...,...
16073,2021-05-01,6.0
16074,2021-05-01,4.6
16075,2021-05-01,4.8
16076,2021-05-01,4.9


## Upload structural variables 

In [90]:
df_stuctural = pd.read_csv('./../../final_dfs/for_modeling/df_final_stationery.csv', converters={'ags2': str, 'ags5': str} )
#df_stuctural.set_index('ags5', drop=True, inplace=True)
df_stuctural = df_stuctural.drop(['grw_funding_framework',"Unnamed:_0",'support_area_status','debtor_quota'], axis=1)
df_stuctural['urban_/_rural'] = df_stuctural['urban_/_rural'] - 1
df_stuctural['east_west'] = df_stuctural['east_west'] - 1
df_stuctural

Unnamed: 0,cluster,kreis,ags5,ags2,supermarkets_population,supermarkets_average_distance,public_transport_availability,average_distance_bus_stop,average_distance_train_station,average_distance_public_transport,...,settlement_structure_type_of_labor_market_region,room_type_location,district_settlement_structure,type_of_settlement_structure,urban_/_rural,metropolitan_region,metropolitan_area,east_west,border_proximity,eligible_area
0,0,"Flensburg, Stadt",01001,01,92,500,35,240,2901,240,...,3,2,4,3,1,99,99,0,1,1
1,2,"Kiel, Landeshauptstadt",01002,01,92,460,37,268,2037,265,...,1,2,1,2,0,99,99,0,0,1
2,2,"Lübeck, Hansestadt",01003,01,90,532,37,297,1927,294,...,1,1,1,2,0,5,99,0,0,1
3,0,"Neumünster, Stadt",01004,01,85,588,37,316,1648,313,...,1,2,3,2,1,5,99,0,0,1
4,0,Dithmarschen,01051,01,51,1864,35,448,3517,443,...,3,4,4,3,1,5,99,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,0,Saalfeld-Rudolstadt,16073,16,62,1423,37,341,2492,322,...,3,3,4,2,1,99,99,1,0,1
397,0,Saale-Holzland-Kreis,16074,16,54,1677,38,343,3419,340,...,1,3,3,2,1,99,99,1,0,1
398,0,Saale-Orla-Kreis,16075,16,54,1686,35,449,3172,444,...,3,3,4,2,1,99,99,1,2,1
399,0,Greiz,16076,16,55,1654,38,608,2779,563,...,1,3,2,2,0,99,99,1,0,1


In [31]:
list(df_stuctural.columns)

['cluster',
 'kreis',
 'ags5',
 'ags2',
 'supermarkets_population',
 'supermarkets_average_distance',
 'public_transport_availability',
 'average_distance_bus_stop',
 'average_distance_train_station',
 'average_distance_public_transport',
 'number_of_students',
 'number_of_hospitals',
 'number_of_hospital_beds',
 'number_of_hospital_beds_adj',
 'hospital_patiants',
 'households_of_1_person',
 'households_of_2_person',
 'households_of_3_person',
 'households_of_4_person',
 'households_of_5_person_or_more',
 'household_with_kids',
 'household_with_kids_under_3',
 'household_with_kids_over_3_under_6',
 'household_with_kids_over_6_under_10',
 'household_with_kids_over_10_under_15',
 'household_with_kids_over_15_under_18',
 'household_with_double_income_no_kids',
 'car_density',
 'no_of_paths_per_person_and_day',
 'kilometers_per_person_and_day',
 '_percentage_out_of_home',
 'share_of_journeys_on_foot',
 'share_of_journeys_on_bike',
 'proportion_of_motorised_vehicle_passenger',
 'share_of_m

## ATE for binary columns

In [84]:
def ATE_IPTW(df_stuctural, df, column):
    #first, caculate PS 
    X = df_stuctural.drop(['kreis',column], axis=1)
    Y = df_stuctural[column].astype(str)
        
    #dummy the categorical vars in X 
    dummies = ['east_west', 'eligible_area', 'urban_/_rural']
    if column in dummies:
        dummies.remove(column)
    for dummy in dummies:
        X[dummy] = X[dummy].astype(str)
    X['cluster'] = X['cluster'].astype(str)

    X = pd.get_dummies(data=X, drop_first=True)
    Y = pd.get_dummies(data=Y, drop_first=True)

    regr = LogisticRegression(random_state=0, max_iter=10000)
    regr.fit(X, Y, sample_weight=None)
    coefficients = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(np.transpose(regr.coef_))], axis = 1)
    coefficients.columns = ['variable', 'coef']
    coefficients = coefficients.sort_values(by=['coef'])
    pred = regr.predict_proba(X)[:, 1]
    df_stuctural['pred'] = pred.tolist()
    
    df_mixed = pd.merge(df, df_stuctural, left_index=True, right_index=True)

    Y_1 = np.mean(df_mixed[column]*df_mixed['unemployment_rate']/df_mixed['pred'])
    Y_0 = np.mean((1-df_mixed[column])*df_mixed['unemployment_rate']/(1-df_mixed['pred']))
    return Y_1 - Y_0

In [85]:
 ATE_IPTW(df_stuctural, df, 'eligible_area')

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


nan

In [86]:
 ATE_IPTW(df_stuctural, df, 'east_west')

  return f(*args, **kwargs)


nan

In [87]:
 ATE_IPTW(df_stuctural, df, 'urban_/_rural')

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


nan

## Check for prefect seperation 

In [70]:
def perfct_sep(df_stuctural, df, column):
    #first, caculate PS 
    X = df_stuctural.drop(['kreis',column], axis=1)
    Y = df_stuctural[column].astype(str)
    
    #dummy the categorical vars in X 
    dummies = ['east_west', 'eligible_area', 'urban_/_rural']
    if column in dummies:
        dummies.remove(column)
    for dummy in dummies:
        X[dummy] = X[dummy].astype(str)
    X['cluster'] = X['cluster'].astype(str)

    X = pd.get_dummies(data=X, drop_first=True)
    Y = pd.get_dummies(data=Y, drop_first=True)

    regr = LogisticRegression(random_state=0, max_iter=10000)
    regr.fit(X, Y, sample_weight=None)
    coefficients = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(np.transpose(regr.coef_))], axis = 1)
    coefficients.columns = ['variable', 'coef']
    coefficients = coefficients.sort_values(by=['coef'])
    pred = regr.predict_proba(X)[:, 1]
    df_stuctural['pred'] = pred.tolist()
     
    df_stuctural['diff'] = df_stuctural[column] - df_stuctural['pred']
    df_stuctural['diff'] = df_stuctural['diff'].apply(lambda x: round(x, 4))
    #return pd.value_counts(df_stuctural['diff'].astype(object)).to_frame().reset_index()
    return df_stuctural['diff'].describe()

In [71]:
perfct_sep(df_stuctural, df, 'urban_/_rural')

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


count    401.000000
mean       0.000164
std        0.089726
min       -0.700600
25%        0.000000
50%        0.000000
75%        0.000000
max        0.497700
Name: diff, dtype: float64

In [12]:
perfct_sep(df_stuctural, df, 'east_west')

  return f(*args, **kwargs)


count    401.0
mean       0.0
std        0.0
min       -0.0
25%        0.0
50%        0.0
75%        0.0
max       -0.0
Name: diff, dtype: float64

In [13]:
perfct_sep(df_stuctural, df, 'eligible_area')

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


count    401.000000
mean       0.000553
std        0.146501
min       -0.675700
25%        0.000000
50%        0.000000
75%        0.000000
max        0.985200
Name: diff, dtype: float64

In [14]:
df_stuctural.corr()['eligible_area'][df_stuctural.corr()['eligible_area'] > 0.35]

Commute_within_150km    0.396218
Commute_within_300km    0.402465
labor_market_type       0.374534
east_west               0.568270
eligible_area           1.000000
pred                    0.955556
diff                    0.381998
Name: eligible_area, dtype: float64

## Covid measurements 

In [88]:
df_covid = pd.read_csv('./covid_measurements_main.csv', converters={'ags2': str, 'ags5': str} )
df_covid = df_covid.drop(['bundesland'], axis=1)

In [89]:
df_covid_mean = df_covid.groupby(['ags5','ags2','kreis']).mean()
df_covid_mean = df_covid_mean.reset_index(drop=False)

#### merge into structural

In [53]:
df_stuctural = pd.merge(df_stuctural, df_covid_mean, left_on = ['kreis', 'ags2', 'ags5'], right_on = ['kreis', 'ags2', 'ags5'])
df_stuctural

Unnamed: 0,cluster,kreis,ags5,ags2,supermarkets_population,supermarkets_average_distance,public_transport_availability,average_distance_bus_stop,average_distance_train_station,average_distance_public_transport,...,public_and_indoor_events,public_and_outdoor_events,secondary_schools,service_and_crafts,sport_indoor,sport_outdoor,test_measures,travel_abroad,travel_domestic,wholesale_retail
0,0,"Flensburg, Stadt",01001,01,92,500,35,240,2901,240,...,0.936508,0.936508,0.496599,0.911565,0.913832,0.913832,0.002268,0.0,0.0,0.922902
1,2,"Kiel, Landeshauptstadt",01002,01,92,460,37,268,2037,265,...,0.891156,0.891156,0.315193,0.907029,0.907029,0.907029,0.034014,0.0,0.0,0.916100
2,2,"Lübeck, Hansestadt",01003,01,90,532,37,297,1927,294,...,0.916100,0.916100,0.469388,0.916100,0.907029,0.907029,0.002268,0.0,0.0,0.916100
3,0,"Neumünster, Stadt",01004,01,85,588,37,316,1648,313,...,0.891156,0.891156,0.315193,0.907029,0.907029,0.907029,0.002268,0.0,0.0,0.916100
4,0,Dithmarschen,01051,01,51,1864,35,448,3517,443,...,0.927438,0.927438,0.532880,0.907029,0.907029,0.907029,0.002268,0.0,0.0,0.916100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,0,Saalfeld-Rudolstadt,16073,16,62,1423,37,341,2492,322,...,0.922902,0.922902,0.326531,0.918367,0.596372,0.596372,0.149660,0.0,0.0,0.591837
397,0,Saale-Holzland-Kreis,16074,16,54,1677,38,343,3419,340,...,0.922902,0.922902,0.326531,0.918367,0.614512,0.614512,0.149660,0.0,0.0,0.591837
398,0,Saale-Orla-Kreis,16075,16,54,1686,35,449,3172,444,...,0.922902,0.922902,0.383220,0.918367,0.596372,0.596372,0.149660,0.0,0.0,0.591837
399,0,Greiz,16076,16,55,1654,38,608,2779,563,...,0.922902,0.922902,0.358277,0.918367,0.596372,0.596372,0.149660,0.0,0.0,0.591837


#### Check for perfect seperation for covid restrictions

In [80]:
c = ['accommodation',
       #'capacity_limitation_in_public_traffic',
       'culture_and_educational_institution', 'curfew', 'daycare_centers',
       'distance_regulation ', 'gastronomy', 'job_restriction',
       'mask_requirement', 'meeting_restrictions_private',
       'meeting_restrictions_public', 'nightlife', 'primary_schools',
       'public_and_indoor_events', 'public_and_outdoor_events',
       'secondary_schools', 'service_and_crafts', 'sport_indoor',
       'sport_outdoor', 'test_measures', 'travel_abroad', 'travel_domestic',
       'wholesale_retail']
for i in c:
    df_stuctural['y_val'] = df_stuctural[i].astype(float)
    mode_col = df_stuctural['y_val'].mode()[0]
    df_stuctural['y_val'] = df_stuctural['y_val'] > mode_col
    print(perfct_sep(df_stuctural, df,'y_val'))
    print(i)

  return f(*args, **kwargs)


count    401.00000
mean       0.00000
std        0.00001
min       -0.00010
25%        0.00000
50%        0.00000
75%        0.00000
max        0.00010
Name: diff, dtype: float64
accommodation


  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(*args, **kwargs)


count    401.000000
mean      -0.001316
std        0.189895
min       -0.974600
25%       -0.000900
50%        0.000000
75%        0.000300
max        0.988500
Name: diff, dtype: float64
culture_and_educational_institution


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(*args, **kwargs)


count    401.000000
mean      -0.000082
std        0.195169
min       -0.928800
25%       -0.001100
50%        0.000000
75%        0.007200
max        0.995100
Name: diff, dtype: float64
curfew


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


count    401.000000
mean      -0.000163
std        0.125571
min       -0.718300
25%        0.000000
50%        0.000000
75%        0.000000
max        0.999800
Name: diff, dtype: float64
daycare_centers
count    401.0
mean       0.0
std        0.0
min       -0.0
25%        0.0
50%        0.0
75%        0.0
max       -0.0
Name: diff, dtype: float64
distance_regulation 
count    401.0
mean       0.0
std        0.0
min       -0.0
25%        0.0
50%        0.0
75%        0.0
max       -0.0
Name: diff, dtype: float64
gastronomy


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(*args, **kwargs)
  return f(*args, **kwargs)


count    401.000000
mean       0.000813
std        0.176850
min       -0.920800
25%       -0.005500
50%        0.000000
75%        0.000500
max        0.991700
Name: diff, dtype: float64
job_restriction
count    401.0
mean       0.0
std        0.0
min       -0.0
25%        0.0
50%        0.0
75%        0.0
max       -0.0
Name: diff, dtype: float64
mask_requirement


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(*args, **kwargs)


count    401.000000
mean      -0.000199
std        0.147217
min       -0.739700
25%       -0.000300
50%        0.000000
75%        0.000000
max        0.982100
Name: diff, dtype: float64
meeting_restrictions_private


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(*args, **kwargs)


count    401.000000
mean       0.000020
std        0.012227
min       -0.111000
25%        0.000000
50%        0.000000
75%        0.000000
max        0.150300
Name: diff, dtype: float64
meeting_restrictions_public
count    401.0
mean       0.0
std        0.0
min       -0.0
25%        0.0
50%        0.0
75%        0.0
max       -0.0
Name: diff, dtype: float64
nightlife


  return f(*args, **kwargs)


count    401.0
mean       0.0
std        0.0
min       -0.0
25%        0.0
50%        0.0
75%        0.0
max       -0.0
Name: diff, dtype: float64
primary_schools


  return f(*args, **kwargs)


count    4.010000e+02
mean     4.987531e-07
std      3.239986e-05
min     -2.000000e-04
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      3.000000e-04
Name: diff, dtype: float64
public_and_indoor_events


  return f(*args, **kwargs)


count    4.010000e+02
mean    -2.493766e-07
std      2.597956e-05
min     -2.000000e-04
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      2.000000e-04
Name: diff, dtype: float64
public_and_outdoor_events
count    401.0
mean       0.0
std        0.0
min       -0.0
25%        0.0
50%        0.0
75%        0.0
max       -0.0
Name: diff, dtype: float64
secondary_schools


  return f(*args, **kwargs)
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(*args, **kwargs)


count    401.000000
mean      -0.001759
std        0.247419
min       -0.970600
25%       -0.024400
50%        0.000000
75%        0.020600
max        0.968100
Name: diff, dtype: float64
service_and_crafts


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(*args, **kwargs)


count    401.000000
mean       0.000005
std        0.132805
min       -0.806000
25%       -0.001300
50%        0.000000
75%        0.000000
max        0.854100
Name: diff, dtype: float64
sport_indoor


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(*args, **kwargs)


count    401.000000
mean       0.000001
std        0.002061
min       -0.012400
25%        0.000000
50%        0.000000
75%        0.000000
max        0.024400
Name: diff, dtype: float64
sport_outdoor


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(*args, **kwargs)


count    401.000000
mean       0.000245
std        0.192299
min       -0.798600
25%       -0.024700
50%       -0.000600
75%        0.001900
max        0.990000
Name: diff, dtype: float64
test_measures
count    401.0
mean       0.0
std        0.0
min       -0.0
25%        0.0
50%        0.0
75%        0.0
max       -0.0
Name: diff, dtype: float64
travel_abroad


  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(*args, **kwargs)


count    401.000000
mean      -0.000291
std        0.157049
min       -0.829000
25%       -0.005500
50%        0.000000
75%        0.000000
max        0.998300
Name: diff, dtype: float64
travel_domestic
count    4.010000e+02
mean     2.493766e-07
std      1.322640e-05
min     -1.000000e-04
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e-04
Name: diff, dtype: float64
wholesale_retail
