In [1]:
# UT-TOR-DATA-PT-01-2020-U-C Group Project 3
# Final project
# Neural Network. Model 1
# (c) Boris Smirnov

In [2]:
# Dependencies and Constants
from sqlalchemy import create_engine
from config import PGPASSWORD
import pandas as pd

# Percent-encode the password before embedding it in the connection URL so that
# characters such as "@", ":" or "/" cannot corrupt the URL.
# NOTE(review): assumes config.PGPASSWORD holds the raw (not already
# URL-encoded) password -- confirm.
from urllib.parse import quote_plus

db_connect_str = f"postgresql://postgres:{quote_plus(PGPASSWORD)}@project-3.c0wevqvgoxbl.us-east-2.rds.amazonaws.com:5432/project3"

In [3]:
# Connect to the database and fetch all the data.
# The connection is opened in a `with` block so it is released as soon as the
# query finishes (or fails) instead of staying open for the kernel's lifetime.
engine = create_engine(db_connect_str)
query = 'select * from everything order by ("Year", "FED Id")'
with engine.connect() as connection:
    df = pd.read_sql(query, connection)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2209 entries, 0 to 2208
Columns: 142 entries, Year to employed...15+ by mode of transportation other methods FC108373
dtypes: float64(61), int64(80), object(1)
memory usage: 2.4+ MB


In [4]:
# Peek at the first ten raw province codes before they are re-encoded.
# The column is selected by name rather than by position 19, so the cell does
# not silently show a different column if the column order ever changes.
df['Prov Id'].head(10)

0    10
1    10
2    10
3    10
4    10
5    10
6    10
7    11
8    11
9    11
Name: Prov Id, dtype: int64

In [5]:
# One-hot encode province Ids
# First translate the numeric province codes into two-letter alpha codes
province_dct = {
    10: "NL",
    11: "PE",
    12: "NS",
    13: "NB",
    24: "QC",
    35: "ON",
    46: "MB",
    47: "SK",
    48: "AB",
    59: "BC",
    60: "YT",
    61: "NT",
    62: "NU"
}
province_codes = df['Prov Id'].map(province_dct)

# `map` yields NaN for any code missing from the dictionary, and get_dummies
# would then encode those rows as all zeros -- fail loudly instead.
unknown_ids = df.loc[province_codes.isna(), 'Prov Id'].unique()
if len(unknown_ids) > 0:
    raise ValueError(f"Unmapped province ids: {sorted(unknown_ids)}")

# Then replace the old column with the new one-hot columns (appended at the end)
one_hot = pd.get_dummies(province_codes)
df = df.drop('Prov Id', axis=1).join(one_hot)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2209 entries, 0 to 2208
Columns: 154 entries, Year to YT
dtypes: float64(61), int64(79), object(1), uint8(13)
memory usage: 2.4+ MB


In [6]:
# Split the frame into a feature matrix and a target vector.
# Columns 0-8 are skipped as non-features: they start with Year, FED Id,
# Winning Pid, LIB result, CPC result (presumably followed by the remaining
# per-party results -- TODO confirm); everything from column 9 on is an input.
# The features are converted to float64 explicitly: `df.values` on the whole
# frame yields an object-dtype array because of the string target column.
X = df.iloc[:, 9:].to_numpy(dtype='float64')
y = df['Winning Pid'].to_numpy()  # winning party only, not popular vote

In [7]:
# Step 1: turn the party labels in `y` into integer class ids
from sklearn.preprocessing import LabelEncoder

# Fit once on the full target vector, then reuse the fitted encoder
# (it is needed again later to map predicted ids back to party names).
label_encoder = LabelEncoder().fit(y)
encoded_y = label_encoder.transform(y)
encoded_y

array([3, 3, 3, ..., 3, 3, 4])

In [8]:
# Step 2: One-hot encoding
# Imported from tensorflow.keras rather than the standalone `keras` package so
# the notebook relies on a single Keras implementation -- the model itself is
# built from tensorflow.keras.
from tensorflow.keras.utils import to_categorical

one_hot_y = to_categorical(encoded_y)
one_hot_y

Using TensorFlow backend.


array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.]], dtype=float32)

In [9]:
# Hold out part of the rows for testing; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    one_hot_y,
    random_state=12,
)

In [10]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only and scale that
# split in the same step.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)

# Reuse the already-fitted scaler for the test split
X_test_scaled = X_scaler.transform(X_test)

In [11]:
# Fully connected network: input features -> 146 -> 30 -> one softmax unit per party
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Take the input and output widths from the data instead of hard-coding them,
# so the model still builds if the feature set or the list of parties changes.
n_features = X_train_scaled.shape[1]
n_classes = y_train.shape[1]

model = Sequential()
model.add(Dense(units=146, activation='relu', input_dim=n_features))
model.add(Dense(units=30, activation='relu'))
model.add(Dense(units=n_classes, activation='softmax'))

In [12]:
# Print the layers with their output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 146)               21316     
_________________________________________________________________
dense_1 (Dense)              (None, 30)                4410      
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 186       
Total params: 25,912
Trainable params: 25,912
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Configure training: Adam optimizer, cross-entropy against the one-hot
# targets, and accuracy reported alongside the loss
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [14]:
# Fit the model to the training data.
# The returned History object is kept so the per-epoch loss/accuracy can be
# inspected or plotted afterwards; previously it was discarded and only its
# repr was echoed as the cell output.
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=40,
    shuffle=True,
    verbose=2
)

Train on 1656 samples
Epoch 1/40
1656/1656 - 0s - loss: 0.9538 - accuracy: 0.6443
Epoch 2/40
1656/1656 - 0s - loss: 0.6160 - accuracy: 0.7579
Epoch 3/40
1656/1656 - 0s - loss: 0.5411 - accuracy: 0.7862
Epoch 4/40
1656/1656 - 0s - loss: 0.4850 - accuracy: 0.8062
Epoch 5/40
1656/1656 - 0s - loss: 0.4512 - accuracy: 0.8152
Epoch 6/40
1656/1656 - 0s - loss: 0.4145 - accuracy: 0.8339
Epoch 7/40
1656/1656 - 0s - loss: 0.3947 - accuracy: 0.8454
Epoch 8/40
1656/1656 - 0s - loss: 0.3636 - accuracy: 0.8641
Epoch 9/40
1656/1656 - 0s - loss: 0.3508 - accuracy: 0.8647
Epoch 10/40
1656/1656 - 0s - loss: 0.3368 - accuracy: 0.8714
Epoch 11/40
1656/1656 - 0s - loss: 0.3200 - accuracy: 0.8756
Epoch 12/40
1656/1656 - 0s - loss: 0.2990 - accuracy: 0.8853
Epoch 13/40
1656/1656 - 0s - loss: 0.2792 - accuracy: 0.8895
Epoch 14/40
1656/1656 - 0s - loss: 0.2699 - accuracy: 0.8979
Epoch 15/40
1656/1656 - 0s - loss: 0.2680 - accuracy: 0.8961
Epoch 16/40
1656/1656 - 0s - loss: 0.2476 - accuracy: 0.9040
Epoch 17/40

<tensorflow.python.keras.callbacks.History at 0x2a69692a7b8>

In [15]:
# Save the trained model to an HDF5 file in the working directory.
# NOTE(review): the file name presumably encodes the hidden-layer sizes
# (146, 30) and the epoch count (40) -- confirm and keep in sync with the model.
model.save("model1-146-30--40_trained.h5")

In [16]:
# Score the trained network on the held-out test split
model_loss, model_accuracy = model.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

553/553 - 0s - loss: 0.8133 - accuracy: 0.7920
Loss: 0.8132654648361758, Accuracy: 0.7920433878898621


In [18]:
import numpy as np

# Predict a party for every row. This covers training AND test rows, so the
# mismatch count below is not a held-out accuracy figure.
X_scaled = X_scaler.transform(X)

# `Sequential.predict_classes` is deprecated and was removed in later
# TensorFlow releases; taking the argmax over the softmax output is the
# equivalent replacement for a multi-class model.
predictions = np.argmax(model.predict(X_scaled), axis=-1)
parties = label_encoder.inverse_transform(predictions)

# Assign the array directly: this is positional and does not depend on the
# frame's index lining up with a freshly built Series.
df['Predicted Pid'] = parties
df[df['Winning Pid'] != df['Predicted Pid']].count()

Year             143
FED Id           143
Winning Pid      143
LIB result       143
CPC result       143
                ... 
PE               143
QC               143
SK               143
YT               143
Predicted Pid    143
Length: 155, dtype: int64

In [19]:
# List the rows where the predicted party differs from the recorded winner
df.loc[df['Winning Pid'].ne(df['Predicted Pid']), ['Winning Pid', 'Predicted Pid']]

Unnamed: 0,Winning Pid,Predicted Pid
13,NDP,LIB
14,NDP,LIB
32,LIB,BQ
48,BQ,LIB
49,LIB,BQ
...,...,...
2143,CPC,NDP
2144,CPC,NDP
2174,LIB,CPC
2181,GRN,NDP


In [20]:
# Number of rows per party in the target column
df['Winning Pid'].value_counts() # <-- Poll based prediction

LIB    862
CPC    830
NDP    269
BQ     238
OTH      5
GRN      5
Name: Winning Pid, dtype: int64

In [21]:
# Number of rows per party as predicted by the network, for comparison with the target counts
df['Predicted Pid'].value_counts() # <-- MLP NN prediction

LIB    861
CPC    829
NDP    268
BQ     246
GRN      3
OTH      2
Name: Predicted Pid, dtype: int64

In [22]:
# Repeat of the earlier test-set evaluation (same call, same arguments);
# the model has not been retrained in between.
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

553/553 - 0s - loss: 0.8133 - accuracy: 0.7920
Loss: 0.8132654648361758, Accuracy: 0.7920433878898621
