Aleksandra Bednarczuk

# Predictors of elderly citizens' willingness to stay in the neighbourhood
# Logistic regression

Original dataset

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import r2_score

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

from scipy import stats

In [None]:
# Load the survey data from a pickle file in the working directory
# (presumably a pre-processed DataFrame -- TODO confirm where it is produced).
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
data = pd.read_pickle("survey_transformed.pkl")

In [None]:
# Preview the first rows of the loaded data.
data.head()

In [None]:
# Dimensions of the loaded data as (rows, columns).
data.shape

In [None]:
# Dataset is imbalanced: count the rows for each value of the target column
# to see how unevenly the classes are represented.
data.groupby(['Would_live_in_another_neighborhood']).size()

In [None]:
# List every column name, as a reference for choosing the predictors below.
data.columns.tolist()

In [None]:
# Candidate columns for the model, in the order they are listed here.
# The target column, 'ANY' and the raw BCN_* columns are still included at
# this point; the following cell removes them from this list.
x_var = [
    "ANY",
    "Would_live_in_another_neighborhood",
    "District",
    "Gender",
    "Place_of_birth",
    "Time_living_in_barri",
    "Barri_evolution",
    "BCN_evolution",
    "Barri_future",
    "BCN_future",
    "Barri_satisfaction_of_living",
    "BCN_satisfaction_of_living",
    "Barri_citizen_security",
    "BCN_citizen_security",
    "Barri_management",
    "BCN_management",
    "Barri_comparison",
    "Barri_transport",
    "BCN_transport",
    "Owns_a_car",
    "House_size",
    "Monthly_family_income",
    "Social_class",
    "Living_alone",
    "BCN_over_barri_evolution",
    "BCN_over_barri_future",
    "BCN_over_barri_satisfaction_of_living",
    "BCN_over_barri_transport",
    "BCN_over_barri_management",
    "BCN_over_barri_citizen_security",
]

In [None]:
# Columns that must not be used as predictors: 'ANY' (presumably the survey
# year -- TODO confirm), the target column itself, and the raw BCN_* columns
# (presumably superseded by the BCN_over_barri_* columns, which are kept --
# TODO confirm).
excluded = {
    'ANY',
    'Would_live_in_another_neighborhood',
    'BCN_evolution', 'BCN_future', 'BCN_satisfaction_of_living',
    'BCN_transport', 'BCN_management', 'BCN_citizen_security',
}

# Filter in place instead of calling list.remove() per name: remove() raises
# ValueError once a name is already gone, so the original cell failed when it
# was executed a second time. The slice assignment keeps the same list object.
x_var[:] = [col for col in x_var if col not in excluded]

In [None]:
# Predictor matrix: only the columns still listed in x_var.
x = data[x_var]

In [None]:
# Target variable. The print statements in the resampling cell treat 1 as
# "would not stay in the neighborhood" and 0 as "would stay".
y = data['Would_live_in_another_neighborhood']

In [None]:
# One-hot encode the categorical predictors. drop_first=True drops the first
# level of each variable, so that level becomes the reference category and
# the dummies are not perfectly collinear with the intercept added later.
x = pd.get_dummies(x, drop_first=True)

In [None]:
# for i in x.columns.tolist():
#     if i[-5:] == '_GOOD':
#         del x[i]        

In [None]:
# for i in ['Place_of_birth_BARCELONA',
 
#  'Time_living_in_barri_I HAVE ALWAYS LIVED IN THE NEIGHBORHOOD',

#  'Barri_evolution_IT IS WORSE',
 
#  'Barri_future_IT WILL GET WORSE',
 
#  'Barri_satisfaction_of_living_NOT SATISFIED',

#  'Barri_comparison_ONE OF THE WORST',

#  'House_size_< 50 M2',

#  'Monthly_family_income_< 1000 EUROS',

#  'Social_class_LOW',

#  'Living_alone_ONE']:
#     del x[i]

In [None]:
# Add an explicit intercept column; sm.Logit does not add one by itself.
x = sm.add_constant(x)

In [None]:
# Balance the two target classes with SMOTETomek (imported from
# imblearn.combine); random_state is fixed so the result is reproducible.
resampling = SMOTETomek(random_state=0)

columns = x.columns
# fit_resample() is the current imbalanced-learn API; the fit_sample() alias
# used previously was deprecated and is no longer available in later releases.
x_resampled, y_resampled = resampling.fit_resample(x.astype(int), y)
# Re-wrap the outputs as labelled DataFrames so the following cells can rely
# on column names whether the sampler returned pandas objects or arrays.
x_resampled = pd.DataFrame(data=x_resampled, columns=columns)
y_resampled = pd.DataFrame(data=y_resampled, columns=['Would_live_in_another_neighborhood'])

# Compute the class counts once instead of re-filtering the frame per print.
target = y_resampled['Would_live_in_another_neighborhood']
n_total = len(x_resampled)
n_would_leave = int((target == 1).sum())
n_would_stay = int((target == 0).sum())

print("length of oversampled data is ", n_total)
print("Number of people who would not stay in the neighborhood in resampled data",
      n_would_leave)
print("Number of people who would stay in the neighborhood in resampled data",
      n_would_stay)
print("Proportion of 'Would_not_stay_in_the_neighborhood' in resampled data is ",
      n_would_leave / n_total)
# The label used to read "subscription data", left over from another dataset.
print("Proportion of 'Would_stay_in_the_neighborhood' in resampled data is ",
      n_would_stay / n_total)

In [None]:
# Display the resampled predictor matrix for a visual check.
x_resampled

In [None]:
# Check which distinct values the target takes after resampling.
y_resampled['Would_live_in_another_neighborhood'].unique()

In [None]:
# Fit a logistic regression (statsmodels Logit) of the resampled target on
# the resampled predictors, then print the coefficient summary table.
logit_model = sm.Logit(y_resampled,x_resampled)
result=logit_model.fit()
print(result.summary2())

In [None]:
# Coefficient p-values, largest (least significant) first.
result.pvalues.sort_values(ascending=False)

In [None]:
# ROC curve and AUC of the fitted model.
# NOTE: the scores come from the same resampled data the model was fitted on,
# so this is an in-sample evaluation.
predicted_prob = result.predict(x_resampled)
logit_roc_auc = roc_auc_score(y_resampled, predicted_prob)
fpr, tpr, thresholds = roc_curve(y_resampled, predicted_prob)

curve_label = 'Logistic Regression (area = %0.2f)' % logit_roc_auc

plt.figure()
plt.plot(fpr, tpr, label=curve_label)
# Red dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Uncomment to also write the figure to disk:
# plt.savefig('Log_ROC')
plt.show()

In [None]:
# Marginal effects of each predictor on the predicted probability, using
# get_margeff() with its default settings.
print(result.get_margeff().summary())

In [None]:
# Prediction (confusion) table for the data the model was fitted on:
# entry [i, j] counts observations with actual class i and predicted class j,
# using pred_table()'s default threshold.
result.pred_table()

In [None]:
# Recompute and redraw the ROC curve of the fitted model (in-sample: scored
# on the resampled data used for fitting).
predicted_prob = result.predict(x_resampled)
logit_roc_auc = roc_auc_score(y_resampled, predicted_prob)
fpr, tpr, thresholds = roc_curve(y_resampled, predicted_prob)

curve_label = 'Logistic Regression (area = %0.2f)' % logit_roc_auc

plt.figure()
plt.plot(fpr, tpr, label=curve_label)
# Red dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Uncomment to also write the figure to disk:
# plt.savefig('Log_ROC')
plt.show()

In [None]:
# Exponentiate the logit coefficients to read them as odds ratios
# (values above 1 raise the odds of the target being 1, below 1 lower them).
np.exp(result.params)