In [1]:
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from config import db_password
import pandas as pd 


  from numpy.core.umath_tests import inner1d


## Loading in Data

In [2]:
db_string = f"postgres://postgres:{db_password}@dataviz.cruszicqidok.us-west-1.rds.amazonaws.com:5432/dataviz"
engine = create_engine(db_string)

In [28]:
appointment_data_df = pd.read_sql_table('Appointments',engine)
appointment_data_df.head()

Unnamed: 0,AppointmentID,PatientId,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,WeekDay,TimeDelta
0,5642903,29872500000000.0,1,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,1,4,0
1,5642503,558998000000000.0,0,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,1,4,0
2,5642549,4262960000000.0,1,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,1,4,0
3,5642828,867951000000.0,1,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,1,4,0
4,5642494,8841190000000.0,1,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,1,4,0


In [29]:
neighbourhood_data_df = pd.read_sql_table('Neighbourhood_data',engine)
neighbourhood_data_df.head()

Unnamed: 0,Neighbourhood,Income,"Lat, Long"
0,AEROPORTO,600,"-20.241510, -40.275436"
1,ANDORINHAS,510,"20.287241, -40.306451"
2,ANTONIO HONORIO,755,"-20.258443, -40.298751"
3,ARIOVALDO FAVALESSA,510,"-20.312391, -40.355819"
4,BARRO VERMELHO,2000,"-20.292924, -40.296295"


## Preprocessing

### Viewing Data

In [4]:
data_df.dtypes

AppointmentID              int64
PatientId                float64
Gender                     int64
ScheduledDay      datetime64[ns]
AppointmentDay    datetime64[ns]
Age                        int64
Neighbourhood             object
Scholarship                int64
Hipertension               int64
Diabetes                   int64
Alcoholism                 int64
Handcap                    int64
SMS_received               int64
No-show                    int64
WeekDay                    int64
TimeDelta                  int64
dtype: object

### Datatype Changing

In [5]:
data_df['PatientId'] = data_df['PatientId'].astype('object')
data_df.dtypes

AppointmentID              int64
PatientId                 object
Gender                     int64
ScheduledDay      datetime64[ns]
AppointmentDay    datetime64[ns]
Age                        int64
Neighbourhood             object
Scholarship                int64
Hipertension               int64
Diabetes                   int64
Alcoholism                 int64
Handcap                    int64
SMS_received               int64
No-show                    int64
WeekDay                    int64
TimeDelta                  int64
dtype: object

In [6]:
data_df.head()

Unnamed: 0,AppointmentID,PatientId,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,WeekDay,TimeDelta
0,5642903,29872500000000.0,1,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,1,4,0
1,5642503,558998000000000.0,0,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,1,4,0
2,5642549,4262960000000.0,1,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,1,4,0
3,5642828,867951000000.0,1,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,1,4,0
4,5642494,8841190000000.0,1,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,1,4,0


### Dropping Columns

In [19]:
data_df_test = data_df.drop(['PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay', 'Neighbourhood'],axis=1)
data_df_test.head()

Unnamed: 0,Gender,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,WeekDay,TimeDelta
0,1,62,0,1,0,0,0,0,1,4,0
1,0,56,0,0,0,0,0,0,1,4,0
2,1,62,0,0,0,0,0,0,1,4,0
3,1,8,0,0,0,0,0,0,1,4,0
4,1,56,0,1,1,0,0,0,1,4,0


## Model Set up

### Data Split

In [20]:
X_ng = data_df_test.drop(['No-show'], axis = 1)
y_ng = data_df_test["No-show"]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_ng, y_ng, test_size=0.33)

### Logistic Regression

In [22]:
classifier = LogisticRegression(solver='lbfgs',
   max_iter=200,
   random_state=1)
classifier.fit(X_train, y_train)
y_pred_log_reg = classifier.predict(X_test)
print(f"Logistic regression accuracy: {accuracy_score(y_test,y_pred_log_reg)} and f-score of: {f1_score(y_test, y_pred_log_reg)}")


Logistic regression accuracy: 0.7948127433240116 and f-score of: 0.8852604789501121


### Random Forest Regression

In [23]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

# Evaluate the model
y_pred_rf = rf_model.predict(X_test)

print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred_rf)} and f-score of: {f1_score(y_test, y_pred_rf)}")

 Random forest predictive accuracy: 0.7597192520699676 and f-score of: 0.8559500328731098


## Without Age Data

In [24]:
data_df_test_no_age = data_df_test.drop(['Age'],axis=1)
data_df_test_no_age.head()

Unnamed: 0,Gender,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,WeekDay,TimeDelta
0,1,0,1,0,0,0,0,1,4,0
1,0,0,0,0,0,0,0,1,4,0
2,1,0,0,0,0,0,0,1,4,0
3,1,0,0,0,0,0,0,1,4,0
4,1,0,1,1,0,0,0,1,4,0


In [25]:
X = data_df_test_no_age.drop(['No-show'], axis = 1)
y = data_df_test_no_age["No-show"]
X_train_ng, X_test_ng, y_train_ng, y_test_ng = train_test_split(X, y, test_size=0.33)

## Logistic Regression

In [26]:
classifier = LogisticRegression(solver='lbfgs',
   max_iter=200,
   random_state=1)
classifier.fit(X_train_ng, y_train_ng)
y_pred_log_reg_ng = classifier.predict(X_test_ng)
print(f"Logistic regression accuracy: {accuracy_score(y_test,y_pred_log_reg)} and f-score of: {f1_score(y_test_ng, y_pred_log_reg_ng)}")


Logistic regression accuracy: 0.7948127433240116 and f-score of: 0.8847999754665196


## Random Forest

In [27]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_ng, y_train_ng)

# Evaluate the model
y_pred_rf = rf_model.predict(X_test_ng)

print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred_rf)} and f-score of: {f1_score(y_test_ng, y_pred_rf)}")

 Random forest predictive accuracy: 0.7721664747491364 and f-score of: 0.8804713804713804


### Neural Network

In [16]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
#  YOUR CODE GOES HERE

nn = tf.keras.models.Sequential()

number_input_features = len(X_train['Gender'])
hidden_nodes_layer1 =  80
hidden_nodes_layer2 = 30

# First hidden layer
#  YOUR CODE GOES HERE
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))
# Second hidden layer
#  YOUR CODE GOES HERE
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
# Output layer
#  YOUR CODE GOES HERE
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 80)                5924320   
_________________________________________________________________
dense_1 (Dense)              (None, 30)                2430      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 31        
Total params: 5,926,781
Trainable params: 5,926,781
Non-trainable params: 0
_________________________________________________________________


In [17]:
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [18]:
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

NameError: name 'X_test_scaled' is not defined