In [1]:
import pandas as pd

In [2]:
# Training split: read at most the first 5000 rows of the inputs and of the expected values.
df1 = pd.read_csv('./train/in.tsv', sep='\t', nrows=5000, header=None)
df2 = pd.read_csv('./train/expected.tsv', sep='\t', nrows=5000, header=None)

# Name the first input column 'text', then attach the first column of the expected
# file as 'score' (pandas aligns the two on their default integer index).
df1 = df1.rename(columns={0: 'text'})
df1['score'] = df2[0]

In [3]:
# dev-0 split: read at most the first 2000 rows of the inputs and of the expected values.
dev_df1 = pd.read_csv('./dev-0/in.tsv', sep='\t', nrows=2000, header=None)
dev_df2 = pd.read_csv('./dev-0/expected.tsv', sep='\t', nrows=2000, header=None)

# Name the first input column 'text', then attach the first column of the expected
# file as 'score' (pandas aligns the two on their default integer index).
dev_df1 = dev_df1.rename(columns={0: 'text'})
dev_df1['score'] = dev_df2[0]

In [4]:
import nltk
import re
# Download the 'punkt' resource before nltk.word_tokenize is called below.
nltk.download('punkt')

# Lower-case each document and keep only runs of word characters, re-joined with single
# spaces. The pattern is a raw string: '\w' inside a plain literal is an invalid escape
# sequence that newer Python versions warn about.
df1["text"] = df1["text"].apply(lambda text: " ".join(re.findall(r'\w+', text.lower())))

# Tokenize column-wise; Series.apply hands each cleaned string straight to the tokenizer
# and, unlike a row-wise DataFrame.apply, still yields a Series when the frame is empty.
df1["tokenized_text"] = df1["text"].apply(nltk.word_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\miki-\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Lower-case each dev-0 document and keep only runs of word characters, re-joined with
# single spaces. The pattern is a raw string: '\w' inside a plain literal is an invalid
# escape sequence that newer Python versions warn about.
dev_df1["text"] = dev_df1["text"].apply(lambda text: " ".join(re.findall(r'\w+', text.lower())))

# Tokenize column-wise; Series.apply hands each cleaned string straight to the tokenizer
# and, unlike a row-wise DataFrame.apply, still yields a Series when the frame is empty.
dev_df1["tokenized_text"] = dev_df1["text"].apply(nltk.word_tokenize)

In [6]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torch

# Use CUDA when torch reports it as available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
from gensim import corpora
# Build the token dictionary from the tokenized training documents only.
my_dict = corpora.Dictionary(documents=df1['tokenized_text'])

In [8]:
# Width of every bag-of-words vector: one slot per entry of the training dictionary.
VOCAB_SIZE = len(my_dict)
# Number of classes the classifier scores.
NUM_LABELS = 2

def make_bow_vector(review_dict, sentence):
    """Turn a tokenized sentence into a (1, VOCAB_SIZE) float32 count vector.

    Args:
        review_dict: object exposing a ``token2id`` mapping from token to column index.
        sentence: iterable of tokens.

    Returns:
        A ``torch.float32`` tensor of shape ``(1, VOCAB_SIZE)`` on ``device`` in which
        position ``j`` holds the number of occurrences of the token whose id is ``j``.
        Tokens missing from ``review_dict.token2id`` are ignored.
    """
    vec = torch.zeros(VOCAB_SIZE, dtype=torch.float64, device=device)
    for word in sentence:
        token_id = review_dict.token2id.get(word)
        # The dictionary is built from training text only, so dev/test documents can
        # contain unseen tokens. They have no column; skip them rather than raise KeyError.
        if token_id is None:
            continue
        vec[token_id] += 1
    # Add a leading batch dimension and cast to the float32 the linear layer expects.
    return vec.view(1, -1).float()

In [9]:
def make_target(label):
    """Wrap a binary label as a one-element ``torch.long`` tensor on ``device``.

    Args:
        label: a value equal to 0 or 1.

    Returns:
        ``tensor([0])`` or ``tensor([1])`` with dtype ``torch.long``.

    Raises:
        ValueError: if ``label`` equals neither 0 nor 1 (this includes NaN). Previously
            the function fell through and returned ``None``, which only surfaced later
            as an unrelated-looking error inside the loss function.
    """
    if label == 0:
        class_index = 0
    elif label == 1:
        class_index = 1
    else:
        raise ValueError(f"label must be 0 or 1, got {label!r}")
    return torch.tensor([class_index], dtype=torch.long, device=device)

In [10]:
class BoWClassifier(nn.Module):
    """A single linear layer followed by log-softmax over dimension 1."""

    def __init__(self, num_labels, vocab_size):
        """Create the linear layer mapping ``vocab_size`` inputs to ``num_labels`` outputs."""
        super().__init__()
        self.linear = nn.Linear(in_features=vocab_size, out_features=num_labels)

    def forward(self, bow_vec):
        """Apply the linear layer to ``bow_vec`` and return log-softmax along dim 1."""
        scores = self.linear(bow_vec)
        return F.log_softmax(scores, dim=1)

In [11]:
# Instantiate the classifier and move its parameters to the selected device
# (Module.to returns the module itself, so the assignment keeps the same object).
bow_nn_model = BoWClassifier(NUM_LABELS, VOCAB_SIZE).to(device)

# Negative log-likelihood loss, applied to the log-softmax output of the model.
loss_function = nn.NLLLoss()

# Plain SGD over every model parameter with a fixed learning rate of 0.01.
optimizer = optim.SGD(params=bow_nn_model.parameters(), lr=0.01)

In [12]:
# Keep all four pieces as one-column DataFrames. The evaluation loop calls
# X_test.iterrows() and Y_test['score'][index], which only work on a DataFrame;
# the test halves used to be plain Series, unlike the training halves.
X_train, Y_train = df1.loc[:, ['tokenized_text']], df1.loc[:, ['score']]
X_test, Y_test = dev_df1.loc[:, ['tokenized_text']], dev_df1.loc[:, ['score']]

In [None]:
import time
start_time = time.time()

# 100 passes over the training frame; one forward/backward/SGD step per document.
for epoch in range(100):
    for index in X_train.index:
        # Clear gradients left over from the previous step.
        bow_nn_model.zero_grad()

        # Features and label are looked up by the same index label.
        bow_vec = make_bow_vector(my_dict, X_train['tokenized_text'][index])
        target = make_target(Y_train['score'][index])

        probs = bow_nn_model(bow_vec)

        loss = loss_function(probs, target)
        loss.backward()
        optimizer.step()

elapsed = time.time() - start_time
print("Total time: " + str(elapsed))

In [None]:
from sklearn.metrics import classification_report
bow_nn_predictions = []
original_lables = []
start_time = time.time()

# X_test / Y_test may be a Series or a one-column DataFrame. Wrapping them in
# pd.DataFrame and selecting the column by name gives a plain Series either way,
# so this cell no longer depends on .iterrows() / ['score'] being available.
test_tokens = pd.DataFrame(X_test)['tokenized_text']
test_labels = pd.DataFrame(Y_test)['score']

with torch.no_grad():
    # Both Series are read in the same order, so pairing them positionally matches
    # each document with its own label.
    for tokens, label in zip(test_tokens, test_labels):
        bow_vec = make_bow_vector(my_dict, tokens)
        probs = bow_nn_model(bow_vec)
        # Index of the larger log-probability is the predicted class.
        bow_nn_predictions.append(torch.argmax(probs, dim=1).cpu().numpy()[0])
        original_lables.append(make_target(label).cpu().numpy()[0])

print(classification_report(original_lables, bow_nn_predictions))
print("Total time predict: " + str(time.time() - start_time))

In [None]:
# dev-1 split: inputs only, first column named 'text'.
dev2_df = pd.read_csv('./dev-1/in.tsv', sep='\t', header=None)
dev2_df = dev2_df.rename(columns={0: 'text'})

# Clean and tokenize dev2_df's OWN text. The previous version read from dev_df1 on every
# right-hand side, which overwrote the dev-1 documents with dev-0 ones.
# Raw-string pattern: '\w' in a plain literal is an invalid escape sequence.
dev2_df["text"] = dev2_df["text"].apply(lambda text: " ".join(re.findall(r'\w+', text.lower())))
dev2_df["tokenized_text"] = dev2_df["text"].apply(nltk.word_tokenize)

In [None]:
# Column selection on a DataFrame needs .loc; dev2_df[:, [...]] is not valid indexing.
# dev2_df is built from in.tsv only and has no 'score' column, so no Y_test2 is created.
X_test2 = dev2_df.loc[:, ['tokenized_text']]

# The context manager closes the file even if prediction fails part-way through.
with open('dev-1/out.tsv', 'w') as f:
    with torch.no_grad():
        for index, row in X_test2.iterrows():
            bow_vec = make_bow_vector(my_dict, row['tokenized_text'])
            probs = bow_nn_model(bow_vec)
            # Index of the larger log-probability is the predicted class.
            y_pred = torch.argmax(probs, dim=1).cpu().numpy()[0]
            # One predicted class per line, in input order.
            f.write(str(int(y_pred)) + '\n')

In [None]:
# test-A split: inputs only, first column named 'text'.
# NOTE(review): this used to read './dev-1/in.tsv' although the predictions made from
# test_df are written to 'test-A/out.tsv'; the path now points at the test-A inputs.
# Confirm the file exists under that name.
test_df = pd.read_csv('./test-A/in.tsv', sep='\t', header=None)
test_df = test_df.rename(columns={0: 'text'})

# Clean and tokenize test_df itself (the previous code referenced an undefined name `test`).
# Raw-string pattern: '\w' in a plain literal is an invalid escape sequence.
test_df["text"] = test_df["text"].apply(lambda text: " ".join(re.findall(r'\w+', text.lower())))
test_df["tokenized_text"] = test_df["text"].apply(nltk.word_tokenize)

In [None]:
# Column selection on a DataFrame needs .loc; test_df[:, [...]] is not valid indexing.
# test_df is built from in.tsv only and has no 'score' column, so no Y_testA is created.
X_testA = test_df.loc[:, ['tokenized_text']]

# The context manager closes the file even if prediction fails part-way through.
with open('test-A/out.tsv', 'w') as f:
    with torch.no_grad():
        for index, row in X_testA.iterrows():
            bow_vec = make_bow_vector(my_dict, row['tokenized_text'])
            probs = bow_nn_model(bow_vec)
            # Index of the larger log-probability is the predicted class.
            y_pred = torch.argmax(probs, dim=1).cpu().numpy()[0]
            # One predicted class per line, in input order.
            f.write(str(int(y_pred)) + '\n')