In [1]:
from model import Classifier
from sklearn.model_selection import train_test_split
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [2]:
# Pick the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(f'Using device: {device}')

Using device: cpu


In [3]:
# Load the topic names; later cells turn a topic string into a class id via
# its position in this collection.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('topics.json', encoding='utf-8') as f:
    topics = json.load(f)

In [4]:
# Read the pre-processed dataset and preview its first rows. Each row holds an
# article embedding (stored as text, parsed in a later cell) and a topic label.
df = pd.read_json(path_or_buf='../datasets/combined/combined_processed_spacy.json')
df.head(n=5)

Unnamed: 0,article_embedding,topic
0,[-1.77190983e+00 7.77235508e-01 -1.44897866e+...,U.S. NEWS
1,[-2.6128807 0.3425455 -1.8239442 0.828039...,U.S. NEWS
2,[-1.2731742e+00 1.0092950e+00 -1.7676051e+00 ...,COMEDY
3,[-9.87586260e-01 5.54175138e-01 -1.56290960e+...,PARENTING
4,[-2.36455750e+00 -3.52101356e-01 -1.48058534e+...,U.S. NEWS


In [5]:
def vector_to_tensor(vector):
    """Parse a stringified numeric vector into a 1-D float32 tensor.

    The input is expected to look like the text form of a NumPy array, e.g.
    ``'[-1.77e+00 7.77e-01\\n -1.44e+00]'``: numbers separated by whitespace
    (spaces and/or line breaks) and wrapped in square brackets.

    Args:
        vector: String holding the bracketed, whitespace-separated numbers.

    Returns:
        A ``torch.Tensor`` of dtype ``torch.float32`` with one element per
        number found in ``vector``.
    """
    # Brackets become spaces (not empty strings) so they can never glue two
    # neighbouring tokens together.
    without_brackets = vector.replace('[', ' ').replace(']', ' ')

    # split() with no argument breaks on any run of whitespace, newlines
    # included, so line-wrapped values stay separate numbers and repeated
    # spaces collapse no matter how long the run is.
    normalized = ' '.join(without_brackets.split())

    return torch.tensor(np.fromstring(normalized, dtype=np.float32, sep=' '))


In [6]:
# Convert each stored embedding into a tensor.
# NOTE(review): both assignments overwrite their source column, so this cell
# is meant to run once per freshly loaded `df`.
df['article_embedding'] = df['article_embedding'].apply(vector_to_tensor)

# Replace every topic name with its position in `topics`.
df['topic'] = df['topic'].apply(topics.index)

In [7]:
# Stack the per-article tensors into a single tensor and insert a singleton
# axis at position 1; labels become a tensor built from the topic column.
X = torch.stack(list(df['article_embedding'])).unsqueeze(dim=1)
y = torch.tensor(df['topic'].to_numpy())

# First hold out 10% of the data as the test split, then carve 10% of what is
# left into a validation split. Both splits use the same fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

In [8]:
# Run 1: SGD with lr=0.1 for 20 epochs.
batch_size = 64
num_epochs = 20

model1 = Classifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model1.parameters(), lr=0.1)

model1.train_model(X_train, y_train, batch_size, device, criterion, optimizer, num_epochs)

# Validation accuracy: percentage of predictions that equal the labels.
predictions = model1.predict(X_val, batch_size)
accuracy = 100 * np.mean(predictions == np.asarray(y_val))

print(f'Validation accuracy: {accuracy:.2f}')

Epoch 1/20 | Loss: 2.6637 | Accuracy: 30.55%
Epoch 2/20 | Loss: 1.8554 | Accuracy: 37.72%
Epoch 3/20 | Loss: 2.2456 | Accuracy: 39.87%
Epoch 4/20 | Loss: 2.4231 | Accuracy: 40.52%
Epoch 5/20 | Loss: 2.4138 | Accuracy: 41.14%
Epoch 6/20 | Loss: 2.3086 | Accuracy: 41.77%
Epoch 7/20 | Loss: 1.9040 | Accuracy: 41.99%
Epoch 8/20 | Loss: 2.2864 | Accuracy: 42.09%
Epoch 9/20 | Loss: 2.4905 | Accuracy: 42.48%
Epoch 10/20 | Loss: 1.9006 | Accuracy: 42.59%
Epoch 11/20 | Loss: 1.9659 | Accuracy: 42.92%
Epoch 12/20 | Loss: 1.8754 | Accuracy: 43.26%
Epoch 13/20 | Loss: 1.7696 | Accuracy: 43.61%
Epoch 14/20 | Loss: 1.5424 | Accuracy: 43.95%
Epoch 15/20 | Loss: 1.8606 | Accuracy: 44.06%
Epoch 16/20 | Loss: 1.8058 | Accuracy: 44.21%
Epoch 17/20 | Loss: 1.8261 | Accuracy: 44.42%
Epoch 18/20 | Loss: 2.1716 | Accuracy: 44.46%
Epoch 19/20 | Loss: 1.9330 | Accuracy: 44.76%
Epoch 20/20 | Loss: 1.7624 | Accuracy: 44.69%
Validation accuracy: 46.81


In [9]:
# Run 2: SGD with lr=0.05 for 20 epochs.
batch_size = 64
num_epochs = 20

model2 = Classifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model2.parameters(), lr=0.05)

model2.train_model(X_train, y_train, batch_size, device, criterion, optimizer, num_epochs)

# Validation accuracy: percentage of predictions that equal the labels.
predictions = model2.predict(X_val, batch_size)
accuracy = 100 * np.mean(predictions == np.asarray(y_val))

print(f'Validation accuracy: {accuracy:.2f}')

Epoch 1/20 | Loss: 1.9619 | Accuracy: 38.29%
Epoch 2/20 | Loss: 2.1761 | Accuracy: 44.44%
Epoch 3/20 | Loss: 1.8516 | Accuracy: 46.08%
Epoch 4/20 | Loss: 1.8255 | Accuracy: 47.32%
Epoch 5/20 | Loss: 2.3309 | Accuracy: 47.80%
Epoch 6/20 | Loss: 2.0937 | Accuracy: 48.39%
Epoch 7/20 | Loss: 1.7838 | Accuracy: 48.80%
Epoch 8/20 | Loss: 2.0517 | Accuracy: 49.18%
Epoch 9/20 | Loss: 2.1222 | Accuracy: 49.39%
Epoch 10/20 | Loss: 1.6655 | Accuracy: 49.60%
Epoch 11/20 | Loss: 1.4790 | Accuracy: 49.92%
Epoch 12/20 | Loss: 1.8687 | Accuracy: 50.25%
Epoch 13/20 | Loss: 1.9030 | Accuracy: 50.41%
Epoch 14/20 | Loss: 1.5378 | Accuracy: 50.61%
Epoch 15/20 | Loss: 2.1077 | Accuracy: 50.64%
Epoch 16/20 | Loss: 1.7338 | Accuracy: 50.79%
Epoch 17/20 | Loss: 1.9461 | Accuracy: 50.97%
Epoch 18/20 | Loss: 1.9397 | Accuracy: 51.02%
Epoch 19/20 | Loss: 2.0615 | Accuracy: 50.99%
Epoch 20/20 | Loss: 1.6196 | Accuracy: 51.22%
Validation accuracy: 51.79


In [10]:
# Run 3: SGD with lr=0.01 for 50 epochs.
batch_size = 64
num_epochs = 50

model3 = Classifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model3.parameters(), lr=0.01)

model3.train_model(X_train, y_train, batch_size, device, criterion, optimizer, num_epochs)

# Validation accuracy: percentage of predictions that equal the labels.
predictions = model3.predict(X_val, batch_size)
accuracy = 100 * np.mean(predictions == np.asarray(y_val))

print(f'Validation accuracy: {accuracy:.2f}')

Epoch 1/50 | Loss: 2.1552 | Accuracy: 34.99%
Epoch 2/50 | Loss: 1.8917 | Accuracy: 43.30%
Epoch 3/50 | Loss: 1.6838 | Accuracy: 46.05%
Epoch 4/50 | Loss: 2.0227 | Accuracy: 47.34%
Epoch 5/50 | Loss: 1.8167 | Accuracy: 48.10%
Epoch 6/50 | Loss: 1.8681 | Accuracy: 48.74%
Epoch 7/50 | Loss: 1.6747 | Accuracy: 49.17%
Epoch 8/50 | Loss: 1.6301 | Accuracy: 49.60%
Epoch 9/50 | Loss: 2.2906 | Accuracy: 49.89%
Epoch 10/50 | Loss: 2.0510 | Accuracy: 50.12%
Epoch 11/50 | Loss: 1.8762 | Accuracy: 50.51%
Epoch 12/50 | Loss: 2.1187 | Accuracy: 50.56%
Epoch 13/50 | Loss: 1.7168 | Accuracy: 50.85%
Epoch 14/50 | Loss: 1.5079 | Accuracy: 51.01%
Epoch 15/50 | Loss: 1.8556 | Accuracy: 51.06%
Epoch 16/50 | Loss: 1.7787 | Accuracy: 51.27%
Epoch 17/50 | Loss: 1.7735 | Accuracy: 51.46%
Epoch 18/50 | Loss: 1.5691 | Accuracy: 51.43%
Epoch 19/50 | Loss: 1.7224 | Accuracy: 51.58%
Epoch 20/50 | Loss: 1.4488 | Accuracy: 51.52%
Epoch 21/50 | Loss: 1.8425 | Accuracy: 51.78%
Epoch 22/50 | Loss: 1.7218 | Accuracy: 51.7

In [11]:
# Evaluate model2 on the held-out test split. `batch_size` is whatever value
# the most recently executed training cell assigned.
predictions = model2.predict(X_test, batch_size)
accuracy = 100 * np.mean(predictions == np.asarray(y_test))

print(f'Test accuracy: {accuracy:.2f}')

Test accuracy: 52.20


In [12]:
# Persist model2's state dict (not the whole module object) to disk.
# NOTE(review): the 'twitter_dash/' directory must already exist — confirm.
torch.save(obj=model2.state_dict(), f='twitter_dash/model.pt')