### Model Set Up

In [2]:
pip install tensorflow

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import os
import tensorflow as tf
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import metrics
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score




### Model Training 

In [4]:
d = pd.read_csv(r'C:\Users\irm20\Documents\UCONN\Senior_Sem1\STAT_Thesis\NY_NN.csv')

In [5]:
df = d.drop('State', axis = 1)

In [6]:
#Changing class from string to integer 
df['Class'] = df['Class'].replace('North', 0)
df['Class'] = df['Class'].replace('South', 1)
df['Class'] = df['Class'].replace('Midwest', 2)
df['Class'] = df['Class'].replace('West', 3)
df

Unnamed: 0,seconds,Class
0,8613,3
1,8616,0
2,8651,0
3,8672,0
4,8726,0
...,...,...
478486,36798,1
478487,38322,0
478488,38455,0
478489,39048,3


In [7]:
# Train, test, and split
X = df.drop('Class', axis = 1)
y = df['Class']

In [8]:
# convert to numpy arrays
X = X.values
y = y.values

In [9]:
kf = KFold(5, shuffle=True, random_state = 42)

oos_y = []
oos_pred = []

fold = 0
for train, test in kf.split(X):
    fold += 1
    print(f'Fold #{fold}')
    
    X_train = X[train]
    y_train = y[train]
    X_test = X[test]
    y_test = y[test]
    
    model = Sequential()
    model.add(Dense(20, input_dim = X.shape[1], activation = 'softmax'))
    model.add(Dense(10, activation = 'softmax'))
    model.add(Dense(4, activation = 'softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    
    y_train_one_hot = tf.keras.utils.to_categorical(y_train, num_classes=4)
    y_test_one_hot = tf.keras.utils.to_categorical(y_test, num_classes=4)
    
    model.fit(X_train, y_train_one_hot, validation_data=(X_test,y_test_one_hot), verbose=0, epochs = 10)
    
    pred = model.predict(X_test)
    
    oos_y.append(y_test)
    oos_pred.append(pred)
    

    pred_classes = np.argmax(pred, axis=1)
    
    score = accuracy_score(y_test, pred_classes)
    print(f'Fold score (Accuracy): {score}')

oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)

oos_pred_classes = np.argmax(oos_pred, axis=1)


score = accuracy_score(oos_y, oos_pred_classes)
print(f'Final out-of-sample score (Accuracy): {score}')

oos_y = pd.DataFrame(oos_y)
oos_pred = pd.DataFrame(oos_pred)
oos_DF = pd.concat([df, oos_y, oos_pred], axis=1)

Fold #1




Fold score (Accuracy): 0.68605732557289
Fold #2
Fold score (Accuracy): 0.6863779807310497
Fold #3
Fold score (Accuracy): 0.6865660724362056
Fold #4
Fold score (Accuracy): 0.6856256139104265
Fold #5
Fold score (Accuracy): 0.6845493113753683
Final out-of-sample score (Accuracy): 0.685835261269282


In [16]:
# with individual class weights 
from sklearn.utils.class_weight import compute_class_weight

kf = KFold(5, shuffle=True, random_state=1)

oos_y = []
oos_pred = []

fold = 0
for train, test in kf.split(X):
    fold += 1
    print(f'Fold #{fold}')

    X_train = X[train]
    y_train = y[train]
    X_test = X[test]
    y_test = y[test]

    # Calculate class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weights_dict = dict(enumerate(class_weights))

    model = Sequential()
    model.add(Dense(50, input_dim=X.shape[1], activation='softmax'))
    model.add(Dense(4, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    y_train_one_hot = tf.keras.utils.to_categorical(y_train, num_classes=4)
    y_test_one_hot = tf.keras.utils.to_categorical(y_test, num_classes=4)

    # Training the model with class weights
    model.fit(X_train, y_train_one_hot, validation_data=(X_test, y_test_one_hot),
              class_weight=class_weights_dict, verbose=0, epochs=10)

    pred = model.predict(X_test)

    oos_y.append(y_test)
    oos_pred.append(pred)

    pred_classes = np.argmax(pred, axis=1)

    score = accuracy_score(y_test, pred_classes)
    print(f'Fold score (Accuracy): {score}')

oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)

oos_pred_classes = np.argmax(oos_pred, axis=1)

score = accuracy_score(oos_y, oos_pred_classes)
print(f'Final out-of-sample score (Accuracy): {score}')

oos_y = pd.DataFrame(oos_y)
oos_pred = pd.DataFrame(oos_pred)
oos_DF = pd.concat([df, oos_y, oos_pred], axis=1)

Fold #1
Fold score (Accuracy): 0.13518427569776068
Fold #2
Fold score (Accuracy): 0.07383644381282785
Fold #3
Fold score (Accuracy): 0.07279148989529562
Fold #4
Fold score (Accuracy): 0.07305272837467867
Fold #5
Fold score (Accuracy): 0.10728541871303475
Final out-of-sample score (Accuracy): 0.09243016065087954


In [7]:
# Logistic regression as baseline comparision to neural network 
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs')
lr.fit(X_train, y_train)
y_pred = model.predict(X_test)
lr.score(X_test,y_test)



0.68605732557289