In [4]:
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from config import db_password
import pandas as pd 


In [5]:
db_string = f"postgres://postgres:{db_password}@dataviz.cruszicqidok.us-west-1.rds.amazonaws.com:5432/dataviz"
engine = create_engine(db_string)

In [7]:
mans_df = pd.read_sql_table('Appointments',engine)
mans_df.head()

Unnamed: 0,AppointmentID,PatientId,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,WeekDay,TimeDelta
0,5642903,29872500000000.0,1,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,1,4,0
1,5642503,558998000000000.0,0,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,1,4,0
2,5642549,4262960000000.0,1,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,1,4,0
3,5642828,867951000000.0,1,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,1,4,0
4,5642494,8841190000000.0,1,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,1,4,0


## Preprocessing

### Viewing Data

In [2]:
#  Read the charity_data.csv.
data_df = pd.read_csv("processeddata.csv")
data_df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,WeekDay,TimeDelta
0,29872500000000.0,5642903,1,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,1,4,0
1,558997800000000.0,5642503,0,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,1,4,0
2,4262962000000.0,5642549,1,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,1,4,0
3,867951200000.0,5642828,1,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,1,4,0
4,8841186000000.0,5642494,1,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,1,4,0


In [3]:
data_df.dtypes

PatientId         float64
AppointmentID       int64
Gender              int64
ScheduledDay       object
AppointmentDay     object
Age                 int64
Neighbourhood      object
Scholarship         int64
Hipertension        int64
Diabetes            int64
Alcoholism          int64
Handcap             int64
SMS_received        int64
No-show             int64
WeekDay             int64
TimeDelta           int64
dtype: object

### Datatype Changing

In [4]:
data_df['PatientId'] = data_df['PatientId'].astype('object')
data_df.dtypes

PatientId         object
AppointmentID      int64
Gender             int64
ScheduledDay      object
AppointmentDay    object
Age                int64
Neighbourhood     object
Scholarship        int64
Hipertension       int64
Diabetes           int64
Alcoholism         int64
Handcap            int64
SMS_received       int64
No-show            int64
WeekDay            int64
TimeDelta          int64
dtype: object

In [5]:
data_df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,WeekDay,TimeDelta
0,29872500000000.0,5642903,1,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,1,4,0
1,558998000000000.0,5642503,0,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,1,4,0
2,4262960000000.0,5642549,1,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,1,4,0
3,867951000000.0,5642828,1,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,1,4,0
4,8841190000000.0,5642494,1,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,1,4,0


### Dropping Columns

In [6]:
data_df_test = data_df.drop(['PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay', 'Neighbourhood', 'TimeDelta'],axis=1)
data_df_test.head()

Unnamed: 0,Gender,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,WeekDay
0,1,62,0,1,0,0,0,0,1,4
1,0,56,0,0,0,0,0,0,1,4
2,1,62,0,0,0,0,0,0,1,4
3,1,8,0,0,0,0,0,0,1,4
4,1,56,0,1,1,0,0,0,1,4


## Model Set up

### Data Split

In [7]:
X_ng = data_df_test.drop(['No-show'], axis = 1)
y_ng = data_df_test["No-show"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_ng, y_ng, test_size=0.33)

### Logistic Regression

In [9]:
classifier = LogisticRegression(solver='lbfgs',
   max_iter=200,
   random_state=1)
classifier.fit(X_train, y_train)
y_pred_log_reg = classifier.predict(X_test)
print(f"Logistic regression accuracy: {accuracy_score(y_test,y_pred_log_reg)} and f-score of: {f1_score(y_test, y_pred_log_reg)}")


Logistic regression accuracy: 0.7984317596095849 and f-score of: 0.8879199951216538


### Random Forest Regression

In [10]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

# Evaluate the model
y_pred_rf = rf_model.predict(X_test)

print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred_rf)} and f-score of: {f1_score(y_test, y_pred_rf)}")

 Random forest predictive accuracy: 0.786203871250754 and f-score of: 0.879123263888889


## Without Age Data

In [11]:
data_df_test_no_age = data_df_test.drop(['Age'],axis=1)
data_df_test_no_age.head()

Unnamed: 0,Gender,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,WeekDay
0,1,0,1,0,0,0,0,1,4
1,0,0,0,0,0,0,0,1,4
2,1,0,0,0,0,0,0,1,4
3,1,0,0,0,0,0,0,1,4
4,1,0,1,1,0,0,0,1,4


In [12]:
X = data_df_test_no_age.drop(['No-show'], axis = 1)
y = data_df_test_no_age["No-show"]
X_train_ng, X_test_ng, y_train_ng, y_test_ng = train_test_split(X, y, test_size=0.33)

## Logistic Regression

In [13]:
classifier = LogisticRegression(solver='lbfgs',
   max_iter=200,
   random_state=1)
classifier.fit(X_train_ng, y_train_ng)
y_pred_log_reg_ng = classifier.predict(X_test_ng)
print(f"Logistic regression accuracy: {accuracy_score(y_test,y_pred_log_reg)} and f-score of: {f1_score(y_test_ng, y_pred_log_reg_ng)}")


Logistic regression accuracy: 0.7984317596095849 and f-score of: 0.8862560688875996


## Random Forest

In [14]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_ng, y_train_ng)

# Evaluate the model
y_pred_rf = rf_model.predict(X_test_ng)

print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred_rf)} and f-score of: {f1_score(y_test_ng, y_pred_rf)}")

 Random forest predictive accuracy: 0.7971705872676427 and f-score of: 0.8860121049092132


### Neural Network

In [15]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
#  YOUR CODE GOES HERE

nn = tf.keras.models.Sequential()

number_input_features = len(X_train['Gender'])
hidden_nodes_layer1 =  80
hidden_nodes_layer2 = 30

# First hidden layer
#  YOUR CODE GOES HERE
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))
# Second hidden layer
#  YOUR CODE GOES HERE
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
# Output layer
#  YOUR CODE GOES HERE
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 80)                5924320   
_________________________________________________________________
dense_1 (Dense)              (None, 30)                2430      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 31        
Total params: 5,926,781
Trainable params: 5,926,781
Non-trainable params: 0
_________________________________________________________________


In [16]:
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [18]:
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

NameError: name 'X_test_scaled' is not defined