In [9]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import torch
import os
from torch import nn
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split
import seaborn as sns
from ray import tune

In [11]:
# import data from processed_data.csv for full set of small_set_processed_data.csv for3 city sample
df = pd.read_csv('stlouis_processed_data.csv')

# split into X and y
X = df.drop('condition', axis=1)
y = df['condition']

# convert from boolean to int
y = y.astype(int)
X = X.astype(float)

# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# convert to torch tensors
X_train = torch.tensor(X_train.values)
X_test = torch.tensor(X_test.values)
y_train = torch.tensor(y_train.values)
y_test = torch.tensor(y_test.values)

In [12]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out

In [13]:
# parameters
input_size = len(X.columns)
num_classes = len(y.unique())
num_epochs = 100
hidden_size = 500
num_epochs = 100
batch_size = 100
learning_rate = 0.001

# create model
model = NeuralNet(input_size, hidden_size, num_classes)

# loss and optimizer
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [15]:
def train_model(model, optimizer, criterion, X_train, y_train, batch_size):
    train_losses = []
    test_losses = []
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    for epoch in range(num_epochs):
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X.float())
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
        # calculate train loss
        train_outputs = model(X_train.float())
        train_loss = criterion(train_outputs, y_train)
        train_losses.append(train_loss.item())
        # calculate test loss
        test_outputs = model(X_test.float())
        test_loss = criterion(test_outputs, y_test)
        test_losses.append(test_loss.item())
    return train_losses, test_losses


In [18]:
# train_losses = []
# test_losses = []
# accuracy_list = []
# test_predict = []

model = NeuralNet(input_size, hidden_size, num_classes)
# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# train model
train_loss, test_loss = train_model(model, optimizer, criterion, X_train, y_train, batch_size)
# keep an array of predictions on test set
# test_predictions = model(X.float()).argmax(axis=1)
# test_predict.append(test_predictions)
# accuracy = (model(X_test.float()).argmax(axis=1) == y_test).sum().item() / len(y_test)
# accuracy_list.append(accuracy)
# train_losses.append(train_loss)
# test_losses.append(test_loss)
# save model
torch.save(model.state_dict(), 'stlouis_model.ckpt')

In [22]:
train_losses = []
test_losses = []
accuracy_list = []
accuracy = (model(X_test.float()).argmax(axis=1) == y_test).sum().item() / len(y_test)
accuracy_list.append(accuracy)
train_losses.append(train_loss)
test_losses.append(test_loss)
print(accuracy_list)

[0.5224594772414618]


In [20]:
# load model
model = NeuralNet(input_size, hidden_size, num_classes)
model.load_state_dict(torch.load('stlouis_model.ckpt'))

# convert to torch tensors
X = torch.tensor(X.values)
y = torch.tensor(y.values)

predict = model(X.float()).argmax(axis=1)
print(predict.shape)

# save predictions to a csv
predict_stl = pd.DataFrame(predict)
predict_stl.to_csv('stl_predictions.csv', index=False)

# count each occurence of unique value in predictions
print(predict_stl[0].value_counts())

torch.Size([83593])
2    81883
3     1710
Name: 0, dtype: int64


In [23]:
# append prediction column to original dataframe
df['prediction'] = predict_stl[0]

df.head()



Unnamed: 0,condition,longitude_coordinate,latitude_coordinate,native_introduced,native_naturally_occurring,prediction
0,2.0,-90.2817,38.653678,0,0,2
1,2.0,-90.281757,38.653682,1,0,2
2,2.0,-90.285143,38.64843,1,0,2
3,3.0,-90.280731,38.648149,1,0,2
4,2.0,-90.238763,38.717476,1,0,2


In [24]:
# save dataframe to csv
df.to_csv('stl_full_with_predictions.csv', index=False)