In [2]:
import sklearn
import torch
from torch import nn
import csv
import string
import math
import matplotlib.pyplot as plt
import numpy as np
import random
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn.model_selection import train_test_split

In [3]:
import sys 
import os

# Import utility functions
sys.path.append(os.path.relpath("../python_files/"))
import util

In [4]:
# Display the installed PyTorch version so the recorded outputs can be tied to it
torch.__version__

'2.1.0'

### Load in Data

In [5]:
# Load the MBTI dataset as a list of [type, posts] rows, dropping the header row.
# The location can be overridden through the MBTI_DATA_PATH environment variable
# so the notebook is not tied to one machine; the default is the original path.
DATA_PATH = os.environ.get(
    "MBTI_DATA_PATH",
    "/Users/youzezheng/Desktop/Team-TBD/input/mbti_1.csv",
)

# `with` closes the file even if parsing raises part-way through.
# newline="" is what the csv module asks for so that line breaks inside quoted
# fields are handed to the reader untouched.
with open(DATA_PATH, newline="") as f:
    data = [row for row in csv.reader(f, delimiter=',') if row != ['type', 'posts']]

In [6]:
# Word-frequency table for the whole corpus, produced by the project helper in util.
# NOTE(review): later cells slice it as counts[:N] and read element [1] of each
# entry as the word, so it is presumably a list of (count, word) pairs sorted by
# descending count - confirm against util.get_wordCounts.
counts = util.get_wordCounts(data) # slow as stemmer is used

In [409]:
# Number of most-frequent words kept as features (the corpus has about 31K).
# NOTE(review): the saved output of `len(X[0])` later in this notebook is 2501,
# i.e. 2500 words + 1 offset column, so the recorded outputs came from a run with
# a different cutoff than the one below - rerun all cells to resync them.
VOCAB_SIZE = 2200
words = [entry[1] for entry in counts[:VOCAB_SIZE]]

In [410]:
# Vocabulary lookups: word -> column index in the feature vector, plus a set of the words
wordId = {word: idx for idx, word in enumerate(words)}
wordSet = set(words)

In [411]:
# Characters removed from a post before it is split into tokens
punctuation = set(string.punctuation)
# Stemmer applied to every token; PorterStemmer comes from the nltk star-import in the first cell
stemmer = PorterStemmer()

In [412]:
# Encode words into wordvec using BoW model with stemmer
# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def feature(datum, vocab=None, stem=None):
    """Return the bag-of-words count vector for one post.

    The text is lower-cased, stripped of punctuation, split on whitespace and
    stemmed; every stem found in the vocabulary increments its column.

    Args:
        datum (str): Raw text of one post.
        vocab (dict | None): Maps a stemmed word to its column index.
            Defaults to the notebook-level ``wordId``.
        stem (callable | None): Function mapping a token to its stem.
            Defaults to ``stemmer.stem``.

    Returns:
        list[int]: One count per vocabulary column followed by a constant 1
        (the offset term).
    """
    if vocab is None:
        vocab = wordId
    if stem is None:
        stem = stemmer.stem

    # Width = highest column index + 1; for the notebook's wordId (built from
    # zip(words, range(len(words)))) this is exactly len(words).
    feat = [0] * (max(vocab.values(), default=-1) + 1)
    for token in datum.lower().translate(_PUNCT_TABLE).split():
        # Dict lookup instead of scanning the `words` list for every token.
        col = vocab.get(stem(token))
        if col is not None:
            feat[col] += 1
    feat.append(1) # offset
    return feat

In [413]:
# Feature matrix as a list of per-post count vectors: one feature() call per row of data
X_raw = [feature(post) for _,post in data] # slow as stemmer is used, plus the posts are long - I think

In [414]:
# Target labels: the first column of every row, in the same order as X_raw
y_raw = [label for label, _ in data]

In [415]:
# Encode the string labels in y as integer class ids so they can be stored in a
# torch tensor (this notebook uses PyTorch, not TensorFlow)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y_raw)
# Display the learned classes; a label's position in this array is its integer id
le.classes_

array(['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
       'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP'],
      dtype='<U4')

In [416]:
# Replace every string label by its integer class id.
# One vectorised call instead of one le.transform() call per label; the result is
# a numpy integer array, which the np.array(...) conversion in the next cell accepts.
y_raw = le.transform(y_raw)

In [417]:
# Convert features and labels to numpy arrays so torch.from_numpy can wrap them below
X_raw, y_raw = np.array(X_raw), np.array(y_raw)

In [418]:
# Wrap the numpy arrays as tensors: float features, int64 class ids
X = torch.from_numpy(X_raw).float()
y = torch.from_numpy(y_raw).long()

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
len(X_train), len(X_test), len(y_train), len(y_test)

(6940, 1735, 6940, 1735)

In [419]:
# Peek at the first five feature rows and their encoded labels
X[:5], y[:5]

(tensor([[ 9., 10., 19.,  ...,  0.,  0.,  1.],
         [55., 30., 40.,  ...,  0.,  0.,  1.],
         [20., 23., 18.,  ...,  0.,  0.,  1.],
         [37., 29., 35.,  ...,  0.,  0.,  1.],
         [23., 30., 24.,  ...,  0.,  0.,  1.]]),
 tensor([ 8,  3, 11, 10,  2]))

In [420]:
# Use the GPU when torch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

In [421]:
# Length of one feature vector: the vocabulary columns plus the trailing offset column
len(X[0])

2501

In [432]:
# Model dimensions, read from the fitted encoder and the data instead of being hard-coded
NUM_CLASSES = len(le.classes_) # number of MBTI types in our dataset
NUM_FEATURES = len(X[0]) # length of one feature vector
class MBTIModel(nn.Module):
    """Feed-forward classifier that maps a feature vector to per-class logits.

    The network is three fully connected layers applied back to back. There is
    no activation between them, so the stack as a whole is still an affine
    function of its input.
    """

    def __init__(self, input_features, output_features, hidden_units=8):
        """Build the layer stack.

        Args:
            input_features (int): Size of each input feature vector.
            output_features (int): Number of values produced per sample
              (one logit per class).
            hidden_units (int): Width of the two hidden layers, default 8.
        """
        super().__init__()
        # (in, out) sizes of the layers, in the order they are applied.
        layer_sizes = [
            (input_features, hidden_units),
            (hidden_units, hidden_units),
            (hidden_units, output_features),
        ]
        self.linear_layer_stack = nn.Sequential(
            *(nn.Linear(in_features=n_in, out_features=n_out)
              for n_in, n_out in layer_sizes)
        )

    def forward(self, x):
        """Run ``x`` through the layer stack and return the raw logits."""
        return self.linear_layer_stack(x)

# Create an instance of MBTIModel sized to the data (35 hidden units per layer)
# and send it to the target device
model = MBTIModel(input_features=NUM_FEATURES,
                  output_features=NUM_CLASSES,
                  hidden_units=35).to(device)
# Display the layer structure
model

MBTIModel(
  (linear_layer_stack): Sequential(
    (0): Linear(in_features=2501, out_features=35, bias=True)
    (1): Linear(in_features=35, out_features=35, bias=True)
    (2): Linear(in_features=35, out_features=16, bias=True)
  )
)

In [433]:
# Create a loss function and optimizer for a multi-class model.
# CrossEntropyLoss takes the raw logits and integer class ids directly.
# NOTE(review): lr=0.1 is 100x Adam's default of 1e-3, and the training log
# further down starts with losses in the thousands - a smaller rate is probably
# worth trying; confirm by experiment.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),
                             lr=0.1) # could try SGD too

In [434]:
# Perform a single forward pass on the training data and show the logits of
# the first five rows (sanity check that the model runs end to end)
model(X_train.to(device))[:5]

tensor([[-0.0287, -0.2690, -0.4137, -0.6817, -0.1007, -0.4945, -0.3480,  0.0864,
         -0.3105, -0.2135, -0.1478, -0.1605, -0.0917, -0.7096,  0.6299,  0.0422],
        [ 0.0079, -0.1958, -0.5976, -0.9262,  0.2049, -0.6128, -0.1823, -0.1436,
         -0.2891, -0.4590, -0.4136, -0.2575,  0.0351, -0.8511,  0.6886,  0.1412],
        [-0.1356, -0.1923, -0.2864, -0.6656, -0.3008, -0.3839, -0.1643,  0.2958,
         -0.4466, -0.1851,  0.0525, -0.1569, -0.1953, -0.7008,  0.3427,  0.0913],
        [ 0.0020, -0.4670, -0.6625, -1.1860, -0.0717, -0.5425, -0.4724,  0.0564,
         -0.7502, -0.4842, -0.3550,  0.1717, -0.2793, -1.0026,  0.9846,  0.0066],
        [ 0.0264, -0.2010, -0.1864, -0.3728, -0.3569, -0.4630,  0.0425,  0.0987,
         -0.0767, -0.3411, -0.1415, -0.2618, -0.0334, -0.4176,  0.3338,  0.1463]],
       grad_fn=<SliceBackward0>)

In [435]:
# One sample's output should have exactly one logit per class
model(X_train.to(device))[0].shape, NUM_CLASSES # confirm shapes match

(torch.Size([16]), 16)

In [436]:
# Raw scores (logits) for the held-out rows using the model's current weights
y_logits = model(X_test.to(device))

# Normalise every row of logits into class probabilities (softmax over dimension 1)
y_pred_probs = y_logits.softmax(dim=1)
print(y_logits[:5], y_pred_probs[:5], sep="\n")

tensor([[-0.0300, -0.0225, -0.6366, -0.7175, -0.1303, -0.3949, -0.3059,  0.0781,
         -0.3546, -0.3503, -0.1587,  0.0295, -0.1413, -0.5800,  0.4380,  0.1991],
        [-0.3721, -0.1220, -0.6799, -1.2105,  0.3457, -0.4200, -0.0728,  0.0071,
         -0.4094, -0.4222, -0.2629, -0.3122,  0.1463, -1.2966,  0.3303, -0.1823],
        [ 0.1758, -0.4490, -0.2295, -0.4478, -0.2209, -0.4565, -0.2272,  0.2732,
          0.0018, -0.4225, -0.0256,  0.2636, -0.1290, -0.3790,  0.3513, -0.0247],
        [ 0.6173, -0.2009, -0.8345, -0.6599, -0.4933, -0.6949, -0.2842, -0.2516,
         -0.0709, -0.7576, -0.8118,  0.1931,  0.1286, -0.1322,  0.7799,  0.0592],
        [-0.0092, -0.4654, -0.4514, -0.8745,  0.1315, -0.7668, -0.1422,  0.0162,
          0.0057, -0.4463, -0.3161, -0.1427,  0.0524, -0.7696,  0.6787, -0.2171]],
       grad_fn=<SliceBackward0>)
tensor([[0.0702, 0.0707, 0.0383, 0.0353, 0.0635, 0.0487, 0.0533, 0.0782, 0.0507,
         0.0510, 0.0617, 0.0745, 0.0628, 0.0405, 0.1121, 0.0883],
    

In [437]:
# Sum the first sample's softmax output; a valid probability distribution sums to 1
torch.sum(y_pred_probs[0])

tensor(1., grad_fn=<SumBackward0>)

In [438]:
# Which class does the model predict for the first test sample?
# The index of the largest probability is the predicted class id.
print(y_pred_probs[0])
print(torch.argmax(y_pred_probs[0]))

tensor([0.0702, 0.0707, 0.0383, 0.0353, 0.0635, 0.0487, 0.0533, 0.0782, 0.0507,
        0.0510, 0.0617, 0.0745, 0.0628, 0.0405, 0.1121, 0.0883],
       grad_fn=<SelectBackward0>)
tensor(14)


In [439]:
# Map the predicted class id back to its MBTI string with the fitted encoder
le.inverse_transform([torch.argmax(y_pred_probs[0])]) # predict actual label

array(['ISTJ'], dtype='<U4')

In [440]:
def accuracy_fn(y_true, y_pred):
    """Return the percentage of predictions that equal the true labels.

    Args:
        y_true (torch.Tensor): Ground-truth class ids.
        y_pred (torch.Tensor): Predicted class ids, compared element-wise
            with ``y_true``.

    Returns:
        float: Share of matching entries as a percentage; 0.0 when there are
        no predictions to score.
    """
    total = len(y_pred)
    if total == 0:
        # Nothing to score - avoid a ZeroDivisionError on empty input.
        return 0.0
    correct = torch.eq(y_true, y_pred).sum().item()
    return (correct / total) * 100

### Create a training and testing loop

In [441]:
# Fit the model: every epoch is one full-batch optimisation step on the training
# split followed by an evaluation on the test split.
# NOTE(review): this seed is set after `model` was constructed, so it does not
# make the initial weights reproducible - seed before building the model for that.
torch.manual_seed(42)

# Set the number of epochs
epochs = 120

# Put data on the target device
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

# TODO: early stopping on the test loss was prototyped here but is disabled;
# this variable is kept for it and is not read by the loop.
prev_test_loss = float('inf')
for epoch in range(epochs):
    ### Training
    model.train()

    # Forward pass
    y_logits = model(X_train) # model outputs raw logits
    # softmax is monotonic, so the argmax of the logits is already the predicted class
    y_pred = y_logits.argmax(dim=1)

    # Calculate the loss/accuracy
    loss = loss_fn(y_logits, y_train)
    acc = accuracy_fn(y_true=y_train,
                      y_pred=y_pred)

    # Clear old gradients, backpropagate, then update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    ### Testing
    model.eval()
    with torch.inference_mode():
        # Forward pass
        test_logits = model(X_test)
        test_pred = test_logits.argmax(dim=1)

        # Calculate the loss/accuracy
        test_loss = loss_fn(test_logits,
                            y_test)
        test_acc = accuracy_fn(y_true=y_test,
                               y_pred=test_pred)

    # Report progress every 10th epoch (format spec fixed from `2f` to `.2f`)
    if epoch % 10 == 9:
        print(f"Epoch: {epoch+1} | Loss: {loss:.5f}, Acc: {acc:.2f}% | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")

Epoch: 10 | Loss: 1420.18225, Acc: 21.11% | Test loss: 1923.36633, Test acc: 2.593660%
Epoch: 20 | Loss: 1184.35339, Acc: 2.32% | Test loss: 787.04443, Test acc: 12.276657%
Epoch: 30 | Loss: 586.90234, Acc: 2.09% | Test loss: 392.07782, Test acc: 4.322767%
Epoch: 40 | Loss: 357.87659, Acc: 18.62% | Test loss: 170.29587, Test acc: 20.000000%
Epoch: 50 | Loss: 183.95209, Acc: 20.98% | Test loss: 120.40254, Test acc: 25.706052%
Epoch: 60 | Loss: 61.55637, Acc: 40.32% | Test loss: 75.43159, Test acc: 26.282421%
Epoch: 70 | Loss: 23.27946, Acc: 55.55% | Test loss: 48.19202, Test acc: 29.106628%
Epoch: 80 | Loss: 13.36421, Acc: 58.86% | Test loss: 22.38182, Test acc: 43.746398%
Epoch: 90 | Loss: 7.13268, Acc: 68.30% | Test loss: 15.07937, Test acc: 53.429395%
Epoch: 100 | Loss: 4.07195, Acc: 75.59% | Test loss: 12.85288, Test acc: 54.293948%
Epoch: 110 | Loss: 2.51904, Acc: 80.06% | Test loss: 11.17986, Test acc: 54.985591%
Epoch: 120 | Loss: 1.53566, Acc: 84.55% | Test loss: 10.41117, Test 