# Logistic Regression

## Import libraries and data

In [82]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns
from seaborn import regplot

import statsmodels.formula.api as smf
import statsmodels.api as sm
import statsmodels.stats.multicomp as multi

import scipy
from scipy import stats
from scipy.stats import pearsonr

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

import researchpy as rp

import plotly.express as px

In [83]:
df = pd.read_csv('data.csv')

In [84]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [85]:
df = df.rename(columns={'PatientId':'patient_id', 'AppointmentID':'appointment_id', 'Gender':'gender', 
                        'ScheduledDay':'scheduled_day', 'AppointmentDay':'appointment_day', 'Age': 'age', 
                        'Neighbourhood':'neighbourhood', 'Scholarship':'scholarship', 'Hipertension':'hipertension', 
                        'Diabetes':'diabetes', 'Alcoholism':'alcoholism', 'Handcap':'handicap', 'SMS_received':'sms', 
                        'No-show':'show_up'})

In [86]:
df.head()

Unnamed: 0,patient_id,appointment_id,gender,scheduled_day,appointment_day,age,neighbourhood,scholarship,hipertension,diabetes,alcoholism,handicap,sms,show_up
0,29872499824296,5642903,0,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,JARDIM DA PENHA,0,1,0,0,0,0,1
1,558997776694438,5642503,1,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,0,0,0,0,0,1
2,4262962299951,5642549,0,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,MATA DA PRAIA,0,0,0,0,0,0,1
3,867951213174,5642828,0,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,PONTAL DE CAMBURI,0,0,0,0,0,0,1
4,8841186448183,5642494,0,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,1,1,0,0,0,1


In [87]:
# dummy variables necessary?
# neighbourhood?
# visualize data?
# improve prediction
# split data
# include dates (different formats)
# repeating patient_id - differences?
# delete appointment id
# format age to bins

In [88]:
df.neighbourhood.value_counts()

JARDIM CAMBURI                 7717
MARIA ORTIZ                    5804
RESISTÊNCIA                    4431
JARDIM DA PENHA                3877
ITARARÉ                        3514
                               ... 
ILHA DO BOI                      35
ILHA DO FRADE                    10
AEROPORTO                         8
ILHAS OCEÂNICAS DE TRINDADE       2
PARQUE INDUSTRIAL                 1
Name: neighbourhood, Length: 81, dtype: int64

## Implement logistic regression model

In [89]:
Y = df.show_up

In [90]:
X 
= df.drop(columns=['patient_id', 'appointment_id', 'scheduled_day', 'appointment_day', 'neighbourhood', 'show_up'])

SyntaxError: invalid syntax (<ipython-input-90-9bc9d3db8192>, line 2)

In [91]:
model = LogisticRegression(solver='liblinear', random_state=0)

In [92]:
results = model.fit(X,Y)

In [93]:
model.classes_

array([0, 1])

In [94]:
model.intercept_

array([1.38337123])

In [95]:
model.coef_

array([[ 0.01791095,  0.00660273, -0.18430819,  0.06641824, -0.0851067 ,
        -0.13930778, -0.02505656, -0.64967114]])

In [96]:
model.score(X,Y)

0.7980799681502728

In [97]:
confusion_matrix(Y, model.predict(X))

array([[    0, 22316],
       [    0, 88203]])

In [98]:
print(classification_report(Y, model.predict(X)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     22316
           1       0.80      1.00      0.89     88203

    accuracy                           0.80    110519
   macro avg       0.40      0.50      0.44    110519
weighted avg       0.64      0.80      0.71    110519



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________

In [107]:
X_small = X.drop(columns=['age', 'hipertension', 'diabetes', 'alcoholism', 'handicap', 'sms'])

In [108]:
model_small = LogisticRegression(solver='liblinear', random_state=0)

In [109]:
results_small = model_small.fit(X_small,Y)

In [110]:
model.coef_

array([[ 0.0041555 , -0.23030906]])

## Implement linear regression model

In [70]:
#X = sm.add_constant(X)

In [71]:
model = sm.OLS(Y,X)

In [72]:
results = model.fit()

In [73]:
results.summary()

0,1,2,3
Dep. Variable:,show_up,R-squared (uncentered):,0.655
Model:,OLS,Adj. R-squared (uncentered):,0.655
Method:,Least Squares,F-statistic:,26260.0
Date:,"Thu, 23 Jun 2022",Prob (F-statistic):,0.0
Time:,12:19:46,Log-Likelihood:,-85502.0
No. Observations:,110519,AIC:,171000.0
Df Residuals:,110511,BIC:,171100.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
gender,0.3217,0.003,105.541,0.000,0.316,0.328
age,0.0137,5.54e-05,247.459,0.000,0.014,0.014
scholarship,0.2782,0.005,53.655,0.000,0.268,0.288
hipertension,-0.1379,0.005,-28.373,0.000,-0.147,-0.128
diabetes,-0.0539,0.007,-7.907,0.000,-0.067,-0.041
alcoholism,-0.0991,0.009,-10.626,0.000,-0.117,-0.081
handicap,0.0124,0.010,1.261,0.207,-0.007,0.032
sms,0.1123,0.003,34.786,0.000,0.106,0.119

0,1,2,3
Omnibus:,7327.709,Durbin-Watson:,1.527
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8896.632
Skew:,-0.694,Prob(JB):,0.0
Kurtosis:,2.943,Cond. No.,272.0


______________________

In [114]:
X_small = sm.add_constant(X_small)

In [115]:
model_OLS_small = sm.OLS(Y,X_small)

In [116]:
results = model_OLS_small.fit()

In [117]:
results.summary()

0,1,2,3
Dep. Variable:,show_up,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,47.02
Date:,"Thu, 23 Jun 2022",Prob (F-statistic):,3.89e-21
Time:,12:27:13,Log-Likelihood:,-55900.0
No. Observations:,110519,AIC:,111800.0
Df Residuals:,110516,BIC:,111800.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.8017,0.002,507.596,0.000,0.799,0.805
gender,0.0006,0.003,0.253,0.801,-0.004,0.006
scholarship,-0.0392,0.004,-9.601,0.000,-0.047,-0.031

0,1,2,3
Omnibus:,22829.012,Durbin-Watson:,1.788
Prob(Omnibus):,0.0,Jarque-Bera (JB):,40722.707
Skew:,-1.483,Prob(JB):,0.0
Kurtosis:,3.206,Cond. No.,3.7
