In [20]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lzh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [17]:
# Load the dataset
data_path = '../datasets/learning-agency-lab-automated-essay-scoring-2/train.csv'
test_path = '../datasets/learning-agency-lab-automated-essay-scoring-2/test.csv'
data = pd.read_csv(data_path)
test = pd.read_csv(test_path)
data.head(5)

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [18]:
# text preprocessing
import re
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text) # remove punctuation
    text = text.lower() # lowercase text
    text = text.replace('\n', ' ') # remove new line
    text = text.replace('&nbsp;', ' ') # remove html space
    return text

data['full_text'] = data['full_text'].apply(clean_text)
test['full_text'] = test['full_text'].apply(clean_text)

data.head(5)

Unnamed: 0,essay_id,full_text,score
0,000d118,many people have car where they live the thing...,3
1,000fe60,i am a scientist at nasa that is discussing th...,3
2,001ab80,people always wish they had the same technolog...,4
3,001bdc0,we all heard about venus the planet without al...,4
4,002ba53,dear state senator this is a letter to argue ...,3


In [24]:
from nltk.corpus import stopwords
# Tokenization
stop_words = set(stopwords.words('english'))

def tokenize(text):
    tokens = text.split()
    # filter out the stopwords
    tokens = [word for word in tokens if word not in stop_words]
    processed_tokens = ' '.join(tokens)
    return processed_tokens

data['full_text'] = data['full_text'].apply(tokenize)
test['full_text'] = test['full_text'].apply(tokenize)

data.head(5)

Unnamed: 0,essay_id,full_text,score
0,000d118,many people car live thing dont know use car a...,3
1,000fe60,scientist nasa discussing face mars explaining...,3
2,001ab80,people always wish technology seen movies best...,4
3,001bdc0,heard venus planet without almost oxygen earth...,4
4,002ba53,dear state senator letter argue favor keeping ...,3


In [25]:
# train test split
from sklearn.model_selection import train_test_split
X = data['full_text']
y = data['score']

# vectorize the text
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# transform to tensor
X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

X_train_tensor.shape, y_train_tensor.shape

(torch.Size([13845, 76499]), torch.Size([13845, 1]))

In [39]:
# Build the model
class NNModel (nn.Module):
    def __init__(self):
        super(NNModel, self).__init__()
        self.fc1 = nn.Linear(X_train_tensor.shape[1], 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)
        return x

In [40]:
# Init the model, loss function and optimizer
model = NNModel()

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# train epochs

epochs = 20
batch_size = 32

for epoch in range (epochs):
    for i in range(0, len(X_train_tensor), batch_size):
        X_batch = X_train_tensor[i:i+batch_size]
        y_batch = y_train_tensor[i:i+batch_size]
        
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        
        
    print(f'Epoch: {epoch + 1} Loss: {loss.item()}')

Epoch: 1 Loss: 0.40416234731674194
Epoch: 2 Loss: 0.2436501383781433


In [None]:
# evaluate the model
with torch.no_grad():
    model.eval()
    y_pred = model(X_test_tensor)
    loss = criterion(y_pred, y_test_tensor)
    print(f'Loss: {loss.item()}')