## Preprocessing

### Viewing Data

In [2]:
#  Import and read the charity_data.csv.
import pandas as pd 
data_df = pd.read_csv("processeddata.csv")
data_df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,WeekDay,TimeDelta
0,29872500000000.0,5642903,1,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,1,4,0
1,558997800000000.0,5642503,0,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,1,4,0
2,4262962000000.0,5642549,1,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,1,4,0
3,867951200000.0,5642828,1,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,1,4,0
4,8841186000000.0,5642494,1,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,1,4,0


In [4]:
data_df.dtypes

PatientId         float64
AppointmentID       int64
Gender              int64
ScheduledDay       object
AppointmentDay     object
Age                 int64
Neighbourhood      object
Scholarship         int64
Hipertension        int64
Diabetes            int64
Alcoholism          int64
Handcap             int64
SMS_received        int64
No-show             int64
WeekDay             int64
TimeDelta           int64
dtype: object

### Datatype Changing

In [7]:
data_df['PatientId'] = data_df['PatientId'].astype('object')
data_df.dtypes

PatientId         object
AppointmentID      int64
Gender             int64
ScheduledDay      object
AppointmentDay    object
Age                int64
Neighbourhood     object
Scholarship        int64
Hipertension       int64
Diabetes           int64
Alcoholism         int64
Handcap            int64
SMS_received       int64
No-show            int64
WeekDay            int64
TimeDelta          int64
dtype: object

In [8]:
data_df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,WeekDay,TimeDelta
0,29872500000000.0,5642903,1,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,1,4,0
1,558998000000000.0,5642503,0,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,1,4,0
2,4262960000000.0,5642549,1,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,1,4,0
3,867951000000.0,5642828,1,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,1,4,0
4,8841190000000.0,5642494,1,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,1,4,0


### Dropping Columns

In [9]:
data_df_test = data_df.drop(['PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay', 'Neighbourhood', 'TimeDelta'],axis=1)
data_df_test.head()

Unnamed: 0,Gender,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,WeekDay
0,1,62,0,1,0,0,0,0,1,4
1,0,56,0,0,0,0,0,0,1,4
2,1,62,0,0,0,0,0,0,1,4
3,1,8,0,0,0,0,0,0,1,4
4,1,56,0,1,1,0,0,0,1,4


## Model Set up

### Import Model

In [16]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=200,
   random_state=1)

### Data Split

In [12]:
X = data_df_test.drop(['No-show'], axis = 1)
y = data_df_test["No-show"]

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

### Model Running

In [17]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [18]:
y_pred = classifier.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.8016395240445249


In [None]:
# The factors of gender, age, scholarship, hyperension, diabetes, alcholoism, handicap, SMS-recieved and weekday
# have impact on the No-Show likelihood