Aleksandra Bednarczuk

# Predictors of elderly citizens willingness to stay in the neighbourhood
# Logistic regression

Original dataset, including barris among predictors

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
from imblearn.over_sampling import SMOTE

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import r2_score

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

from scipy import stats

In [3]:
data = pd.read_pickle("survey_transformed.pkl")

In [4]:
data.reset_index(drop=True, inplace=True)

In [5]:
data.head()

Unnamed: 0,Would_live_in_another_neighborhood,District,Woman,Car,Support_with_housework,Owner,House_size,Living_alone,Financial_situation,Disability,...,Barri_comparison,Barri_association,Barri_transport,BCN_transport,BCN_over_barri_evolution,BCN_over_barri_future,BCN_over_barri_satisfaction,BCN_over_barri_transport,BCN_over_barri_management,BCN_over_barri_security
0,0.0,SANTS - MONTJUÏC,0,1,0.0,1.0,51-100 M2,0,GOOD,0.0,...,ONE OF THE BEST,1.0,GOOD,GOOD,0,0,0,0,0,0
1,0.0,SANTS - MONTJUÏC,1,0,0.0,1.0,51-100 M2,1,BAD,0.0,...,ONE OF THE BEST,0.0,GOOD,GOOD,1,0,0,0,0,0
2,0.0,SANTS - MONTJUÏC,1,0,1.0,0.0,51-100 M2,1,GOOD,0.0,...,"NEITHER THE BEST, NOR THE WORST",0.0,BAD,"NEITHER BAD, NOR GOOD",0,0,0,1,0,0
3,1.0,SANTS - MONTJUÏC,0,0,0.0,1.0,< 50 M2,0,BAD,1.0,...,"NEITHER THE BEST, NOR THE WORST",0.0,"NEITHER BAD, NOR GOOD",GOOD,0,0,1,0,0,0
4,0.0,SANTS - MONTJUÏC,1,0,1.0,1.0,51-100 M2,0,GOOD,0.0,...,"NEITHER THE BEST, NOR THE WORST",0.0,GOOD,GOOD,0,1,0,0,0,0


In [6]:
data.shape

(1174, 32)

In [7]:
data.columns.tolist()

['Would_live_in_another_neighborhood',
 'District',
 'Woman',
 'Car',
 'Support_with_housework',
 'Owner',
 'House_size',
 'Living_alone',
 'Financial_situation',
 'Disability',
 'Living_with_disabled_person',
 'Time_living_in_barri',
 'Barri_evolution',
 'BCN_evolution',
 'Barri_future',
 'BCN_future',
 'Barri_satisfaction',
 'BCN_satisfaction',
 'Barri_security',
 'BCN_security',
 'Barri_management',
 'BCN_management',
 'Barri_comparison',
 'Barri_association',
 'Barri_transport',
 'BCN_transport',
 'BCN_over_barri_evolution',
 'BCN_over_barri_future',
 'BCN_over_barri_satisfaction',
 'BCN_over_barri_transport',
 'BCN_over_barri_management',
 'BCN_over_barri_security']

In [8]:
for i in data.columns.tolist():
    display(pd.crosstab(data[i], data.Would_live_in_another_neighborhood))

Would_live_in_another_neighborhood,0.0,1.0
Would_live_in_another_neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1019,0
1.0,0,155


Would_live_in_another_neighborhood,0.0,1.0
District,Unnamed: 1_level_1,Unnamed: 2_level_1
CIUTAT VELLA,61,8
EIXAMPLE,179,20
GRÀCIA,84,12
HORTA - GUINARDÓ,112,21
LES CORTS,104,4
NOU BARRIS,94,23
SANT ANDREU,83,22
SANT MARTÍ,129,18
SANTS - MONTJUÏC,80,22
SARRIÀ - SANT GERVASI,93,5


Would_live_in_another_neighborhood,0.0,1.0
Woman,Unnamed: 1_level_1,Unnamed: 2_level_1
0,399,61
1,620,94


Would_live_in_another_neighborhood,0.0,1.0
Car,Unnamed: 1_level_1,Unnamed: 2_level_1
0,479,63
1,540,92


Would_live_in_another_neighborhood,0.0,1.0
Support_with_housework,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,656,110
1.0,363,45


Would_live_in_another_neighborhood,0.0,1.0
Owner,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,210,28
1.0,809,127


Would_live_in_another_neighborhood,0.0,1.0
House_size,Unnamed: 1_level_1,Unnamed: 2_level_1
101-150 M2,126,15
51-100 M2,724,115
< 50 M2,57,14
> 150 M2,43,5
I DON'T KNOW,69,6


Would_live_in_another_neighborhood,0.0,1.0
Living_alone,Unnamed: 1_level_1,Unnamed: 2_level_1
0,666,103
1,353,52


Would_live_in_another_neighborhood,0.0,1.0
Financial_situation,Unnamed: 1_level_1,Unnamed: 2_level_1
BAD,390,63
GOOD,629,92


Would_live_in_another_neighborhood,0.0,1.0
Disability,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,856,121
1.0,163,34


Would_live_in_another_neighborhood,0.0,1.0
Living_with_disabled_person,Unnamed: 1_level_1,Unnamed: 2_level_1
0,722,105
1,297,50


Would_live_in_another_neighborhood,0.0,1.0
Time_living_in_barri,Unnamed: 1_level_1,Unnamed: 2_level_1
I HAVE ALWAYS LIVED IN THE NEIGHBORHOOD,136,7
LESS THAN 10 YEARS,57,16
MORE THAN 10 YEARS,826,132


Would_live_in_another_neighborhood,0.0,1.0
Barri_evolution,Unnamed: 1_level_1,Unnamed: 2_level_1
IT HAS IMPROVED,411,59
IT HAS WORSENED,230,46
IT IS THE SAME,378,50


Would_live_in_another_neighborhood,0.0,1.0
BCN_evolution,Unnamed: 1_level_1,Unnamed: 2_level_1
IT HAS IMPROVED,272,36
IT HAS WORSENED,517,85
IT IS THE SAME,230,34


Would_live_in_another_neighborhood,0.0,1.0
Barri_future,Unnamed: 1_level_1,Unnamed: 2_level_1
IT WILL IMPROVE,577,79
IT WILL STAY THE SAME,288,45
IT WILL WORSEN,154,31


Would_live_in_another_neighborhood,0.0,1.0
BCN_future,Unnamed: 1_level_1,Unnamed: 2_level_1
IT WILL IMPROVE,546,92
IT WILL STAY THE SAME,241,32
IT WILL WORSEN,232,31


Would_live_in_another_neighborhood,0.0,1.0
Barri_satisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1
FAIRLY SATISFIED,75,25
NOT SATISFIED,1,9
SATISFIED,943,121


Would_live_in_another_neighborhood,0.0,1.0
BCN_satisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1
FAIRLY SATISFIED,118,22
NOT SATISFIED,8,1
SATISFIED,893,132


Would_live_in_another_neighborhood,0.0,1.0
Barri_security,Unnamed: 1_level_1,Unnamed: 2_level_1
BAD,152,40
GOOD,455,49
"NEITHER BAD, NOR GOOD",412,66


Would_live_in_another_neighborhood,0.0,1.0
BCN_security,Unnamed: 1_level_1,Unnamed: 2_level_1
BAD,199,40
GOOD,285,38
"NEITHER BAD, NOR GOOD",535,77


Would_live_in_another_neighborhood,0.0,1.0
Barri_management,Unnamed: 1_level_1,Unnamed: 2_level_1
BAD,65,17
GOOD,509,56
"NEITHER BAD, NOR GOOD",445,82


Would_live_in_another_neighborhood,0.0,1.0
BCN_management,Unnamed: 1_level_1,Unnamed: 2_level_1
BAD,96,21
GOOD,431,46
"NEITHER BAD, NOR GOOD",492,88


Would_live_in_another_neighborhood,0.0,1.0
Barri_comparison,Unnamed: 1_level_1,Unnamed: 2_level_1
"NEITHER THE BEST, NOR THE WORST",341,94
ONE OF THE BEST,649,41
ONE OF THE WORST,29,20


Would_live_in_another_neighborhood,0.0,1.0
Barri_association,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,805,128
1.0,214,27


Would_live_in_another_neighborhood,0.0,1.0
Barri_transport,Unnamed: 1_level_1,Unnamed: 2_level_1
BAD,149,22
GOOD,812,120
"NEITHER BAD, NOR GOOD",58,13


Would_live_in_another_neighborhood,0.0,1.0
BCN_transport,Unnamed: 1_level_1,Unnamed: 2_level_1
BAD,95,21
GOOD,824,125
"NEITHER BAD, NOR GOOD",100,9


Would_live_in_another_neighborhood,0.0,1.0
BCN_over_barri_evolution,Unnamed: 1_level_1,Unnamed: 2_level_1
0,913,140
1,106,15


Would_live_in_another_neighborhood,0.0,1.0
BCN_over_barri_future,Unnamed: 1_level_1,Unnamed: 2_level_1
0,871,126
1,148,29


Would_live_in_another_neighborhood,0.0,1.0
BCN_over_barri_satisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1
0,968,126
1,51,29


Would_live_in_another_neighborhood,0.0,1.0
BCN_over_barri_transport,Unnamed: 1_level_1,Unnamed: 2_level_1
0,921,146
1,98,9


Would_live_in_another_neighborhood,0.0,1.0
BCN_over_barri_management,Unnamed: 1_level_1,Unnamed: 2_level_1
0,983,147
1,36,8


Would_live_in_another_neighborhood,0.0,1.0
BCN_over_barri_security,Unnamed: 1_level_1,Unnamed: 2_level_1
0,973,145
1,46,10


In [10]:
data.columns.tolist()

['Would_live_in_another_neighborhood',
 'District',
 'Woman',
 'Car',
 'Support_with_housework',
 'Owner',
 'House_size',
 'Living_alone',
 'Financial_situation',
 'Disability',
 'Living_with_disabled_person',
 'Time_living_in_barri',
 'Barri_evolution',
 'BCN_evolution',
 'Barri_future',
 'BCN_future',
 'Barri_satisfaction',
 'BCN_satisfaction',
 'Barri_security',
 'BCN_security',
 'Barri_management',
 'BCN_management',
 'Barri_comparison',
 'Barri_association',
 'Barri_transport',
 'BCN_transport',
 'BCN_over_barri_evolution',
 'BCN_over_barri_future',
 'BCN_over_barri_satisfaction',
 'BCN_over_barri_transport',
 'BCN_over_barri_management',
 'BCN_over_barri_security']

In [135]:
x_var = [#'District',
 #'Woman',
 'Car',
 #'Support_with_housework',
 #'Owner',
 #'House_size',
 #'Living_alone',
 #'Financial_situation',
 #'Disability',
 #'Living_with_disabled_person',
 'Time_living_in_barri',
 #'Barri_evolution',
 #'BCN_evolution',
 #'Barri_future',
 #'BCN_future',
 'Barri_satisfaction',
 #'BCN_satisfaction',
 'Barri_security',
 #'BCN_security',
 #'Barri_management',
 #'BCN_management',
 'Barri_comparison']
 #'Barri_association']
 #'Barri_transport']
 #'BCN_transport',
 #'BCN_over_barri_evolution',
 #'BCN_over_barri_future',
 #'BCN_over_barri_satisfaction',
 #'BCN_over_barri_transport',
 #'BCN_over_barri_management',
 #'BCN_over_barri_security']

In [136]:
y = data['Would_live_in_another_neighborhood']

In [137]:
x = pd.get_dummies(data[x_var])

In [138]:
for i in [#District_EIXAMPLE',
          #'House_size_< 50 M2',
          #'Financial_situation_GOOD',
          'Time_living_in_barri_LESS THAN 10 YEARS',
          #'Barri_evolution_IT HAS IMPROVED',
          #'Barri_future_IT WILL IMPROVE',
          'Barri_satisfaction_SATISFIED',
          #'Barri_security_GOOD',
          #'Barri_management_GOOD',
          'Barri_comparison_ONE OF THE BEST']:
          #'Barri_transport_GOOD']:
    del x[i]

In [139]:
x = sm.add_constant(x)

In [140]:
# logit_model = sm.Logit(y,x)
# result=logit_model.fit()
# print(result.summary2())

In [141]:
# logit_roc_auc = roc_auc_score(y, result.predict(x))
# fpr, tpr, thresholds = roc_curve(y, result.predict(x))
# plt.figure()
# plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# plt.plot([0, 1], [0, 1],'r--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver operating characteristic')
# plt.legend(loc="lower right")
# #plt.savefig('Log_ROC')
# plt.show()

In [142]:
# result.pvalues.sort_values(ascending=False)

In [143]:
# display(result.get_margeff().summary())

In [144]:
# result.pred_table()

# Logit

In [145]:
glm_logit = sm.GLM(y, x, family=sm.families.Binomial())
glm_logit_results = glm_logit.fit()
glm_logit_results.summary()

0,1,2,3
Dep. Variable:,Would_live_in_another_neighborhood,No. Observations:,1174.0
Model:,GLM,Df Residuals:,1163.0
Model Family:,Binomial,Df Model:,10.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-395.88
Date:,"Sat, 08 May 2021",Deviance:,791.75
Time:,23:03:03,Pearson chi2:,1210.0
No. Iterations:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.5363,0.257,-5.975,0.000,-2.040,-1.032
Car,0.2615,0.189,1.381,0.167,-0.110,0.633
Disability,0.2377,0.233,1.021,0.307,-0.219,0.694
Time_living_in_barri_I HAVE ALWAYS LIVED IN THE NEIGHBORHOOD,-1.9861,0.513,-3.868,0.000,-2.992,-0.980
Time_living_in_barri_MORE THAN 10 YEARS,-0.8330,0.321,-2.599,0.009,-1.461,-0.205
Barri_satisfaction_FAIRLY SATISFIED,0.5267,0.275,1.915,0.056,-0.013,1.066
Barri_satisfaction_NOT SATISFIED,3.2966,1.106,2.980,0.003,1.129,5.465
Barri_security_BAD,-0.3075,0.192,-1.600,0.110,-0.684,0.069
Barri_security_GOOD,-0.6790,0.143,-4.735,0.000,-0.960,-0.398


In [146]:
glm_logit_results.aic

813.7514278427351

In [147]:
glm_logit_results.bic

-7428.532608608557

In [148]:
glm_logit_results.pvalues.sort_values(ascending=False)

Disability                                                      3.074641e-01
Car                                                             1.671686e-01
Barri_security_BAD                                              1.096560e-01
Barri_satisfaction_FAIRLY SATISFIED                             5.555611e-02
Time_living_in_barri_MORE THAN 10 YEARS                         9.349505e-03
Barri_satisfaction_NOT SATISFIED                                2.878655e-03
Barri_security_NEITHER BAD, NOR GOOD                            3.277847e-04
Time_living_in_barri_I HAVE ALWAYS LIVED IN THE NEIGHBORHOOD    1.096507e-04
Barri_comparison_ONE OF THE WORST                               1.496911e-05
Barri_security_GOOD                                             2.189123e-06
const                                                           2.303120e-09
Barri_comparison_NEITHER THE BEST, NOR THE WORST                3.975518e-11
dtype: float64

# Probit

In [46]:
glm_probit = sm.GLM(y, x, family=sm.families.Binomial(link=sm.genmod.families.links.probit))
glm_probit_results = glm_probit.fit()
glm_probit_results.summary()

Use an instance of a link class instead.
  """Entry point for launching an IPython kernel.


0,1,2,3
Dep. Variable:,Would_live_in_another_neighborhood,No. Observations:,1174.0
Model:,GLM,Df Residuals:,1135.0
Model Family:,Binomial,Df Model:,38.0
Link Function:,probit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-383.63
Date:,"Sat, 08 May 2021",Deviance:,767.26
Time:,22:54:47,Pearson chi2:,1290.0
No. Iterations:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.0424,0.316,-3.294,0.001,-1.663,-0.422
Woman,-0.0187,0.111,-0.169,0.866,-0.237,0.199
Car,0.1950,0.123,1.592,0.111,-0.045,0.435
Support_with_housework,-0.0025,0.119,-0.021,0.984,-0.236,0.231
Owner,0.0834,0.146,0.571,0.568,-0.203,0.370
Living_alone,0.0608,0.127,0.478,0.633,-0.189,0.310
Disability,0.1915,0.198,0.966,0.334,-0.197,0.580
Living_with_disabled_person,-0.1335,0.169,-0.790,0.430,-0.465,0.198
Barri_association,-0.1860,0.134,-1.390,0.165,-0.448,0.076


In [41]:
glm_probit_results.aic

845.2576345712489

In [42]:
glm_probit_results.bic

-7255.1175858691795

# Scobit

In [27]:
glm_scobit = sm.GLM(y, x, family=sm.families.Binomial(link=sm.genmod.families.links.cloglog))
glm_scobit_results = glm_scobit.fit()
glm_scobit_results.summary()

Use an instance of a link class instead.
  """Entry point for launching an IPython kernel.


0,1,2,3
Dep. Variable:,Would_live_in_another_neighborhood,No. Observations:,1174.0
Model:,GLM,Df Residuals:,1135.0
Model Family:,Binomial,Df Model:,38.0
Link Function:,cloglog,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-382.78
Date:,"Sat, 08 May 2021",Deviance:,765.56
Time:,22:47:26,Pearson chi2:,1250.0
No. Iterations:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.9848,0.502,-3.952,0.000,-2.969,-1.000
Woman,-0.0511,0.182,-0.281,0.779,-0.407,0.305
Car,0.2576,0.202,1.274,0.203,-0.139,0.654
Support_with_housework,0.0509,0.197,0.259,0.796,-0.335,0.437
Owner,0.0999,0.245,0.408,0.683,-0.380,0.580
Living_alone,0.0695,0.211,0.329,0.742,-0.344,0.483
Disability,0.3260,0.327,0.998,0.318,-0.314,0.966
Living_with_disabled_person,-0.2289,0.284,-0.806,0.420,-0.786,0.328
Barri_association,-0.4129,0.226,-1.830,0.067,-0.855,0.029


In [43]:
glm_scobit_results.aic

843.5578864943819

In [44]:
glm_scobit_results.bic

-7256.817333946046

In [28]:
np.exp(glm_scobit_results.params)

const                                                            0.137415
Woman                                                            0.950209
Car                                                              1.293787
Support_with_housework                                           1.052228
Owner                                                            1.105049
Living_alone                                                     1.071970
Disability                                                       1.385367
Living_with_disabled_person                                      0.795433
Barri_association                                                0.661723
District_CIUTAT VELLA                                            1.103064
District_GRÀCIA                                                  1.669316
District_HORTA - GUINARDÓ                                        1.890624
District_LES CORTS                                               0.495594
District_NOU BARRIS                   