In [1]:
import os
import random
import time
import pickle

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

import torch
from torch import nn
from torch import nn, optim
from torch.utils.data import DataLoader
import torchtext

#df=pd.DataFrame(columns = ['text', 'rating'])

In [4]:
# for directory in ['./train/pos/', './train/neg/']:
#     for dirname, _, filenames in os.walk(directory):
#         for filename in filenames:
#             file = open(dirname + '/' + filename, encoding="utf8")
#             text = file.read()
#             s = file.name.replace(directory, '').replace('.txt', '')
#             ID = s[0:s.find('_')]
#             rating = s[s.find('_')+1:]
#             newRow = {'text': text, 'rating': rating}
#             df = pd.concat([df, pd.DataFrame([newRow])])

In [5]:
# for directory in ['./test/pos/', './test/neg/']:
#     for dirname, _, filenames in os.walk(directory):
#         for filename in filenames:
#             file = open(dirname + '/' + filename, encoding="utf8")
#             text = file.read()
#             s = file.name.replace(directory, '').replace('.txt', '')
#             ID = s[0:s.find('_')]
#             rating = s[s.find('_')+1:]
#             newRow = {'text': text, 'rating': rating}
#             df = pd.concat([df, pd.DataFrame([newRow])])

In [6]:
# df.to_csv ('train_data.csv', index= False )
# df.to_csv ('test_data.csv', index= False )

In [7]:
# df_test = pd.read_csv('test_data.csv')
# df_train = pd.read_csv('train_data.csv')

## Prepocessing of dataset

In [7]:
import gensim.downloader as api
from collections import Counter
import re
from torchtext.data import get_tokenizer

In [3]:
train_path = 'train_data.csv'
test_path = 'test_data.csv'

In [4]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, path):
        self.df = pd.read_csv(path)
        self.df = self.df.sample(frac=1).reset_index(drop=True)
    def encoder(self, word):
        if word in word2idx.keys():
            return word2idx[word]
        return word2idx['lurum']
    def array_word2vec(self, arr):
        result = []
        for elem in arr:
            if elem not in word2vec:
                result.append(word2vec['unc'])
            else:
                result.append(word2vec[elem])
        return result
    def __len__(self):
        return len(self.df)
    def __getitem__(self, index):
        review = re.sub(r'\W', ' ', str(self.df.iloc[index]['text']))
        review = review.lower()
        review = re.sub(r'^br$', ' ', review)
        review = re.sub(r'\s+br\s+',' ',review)
        review = re.sub(r'\s+[a-z]\s+', ' ',review)
        review = re.sub(r'^b\s+', '', review)
        review = re.sub(r'\s+', ' ', review)
        
        rating = self.df.iloc[index]['rating']
        rating = rating - 1 if rating <= 4 else rating - 3
        return review, rating

In [5]:
from torchtext.vocab import GloVe

global_vectors = GloVe(name='840B', dim=300)

In [8]:
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset

max_words = 500
embed_len = 300

tokenizer = get_tokenizer("basic_english")
from torchtext.data import get_tokenizer

def vectorize_batch(batch):
    X, Y = list(zip(*batch))
    X = [tokenizer(x) for x in X]
    X = [tokens+[""] * (max_words-len(tokens))  if len(tokens) < max_words else tokens[:max_words] for tokens in X]
    X_tensor = torch.zeros(len(batch), max_words, embed_len)
    for i, tokens in enumerate(X):
        X_tensor[i] = global_vectors.get_vecs_by_tokens(tokens)
    return X_tensor.mean(dim=1), torch.tensor(Y)

train_dataset, test_dataset  = Dataset(train_path), Dataset(test_path)
train_dataset, test_dataset = to_map_style_dataset(train_dataset), to_map_style_dataset(test_dataset)

train_loader = DataLoader(train_dataset, batch_size=1024, collate_fn=vectorize_batch)
test_loader  = DataLoader(test_dataset, batch_size=1024, collate_fn=vectorize_batch)

In [86]:
for X, Y in train_loader:
    print(X, Y)
    break

tensor([[-0.0050,  0.0219, -0.0453,  ..., -0.0281,  0.0181,  0.0277],
        [-0.0302,  0.0471, -0.1138,  ..., -0.0422,  0.0133,  0.0714],
        [-0.0120,  0.0416, -0.0291,  ..., -0.0155,  0.0098,  0.0400],
        ...,
        [-0.0146,  0.0382, -0.0310,  ..., -0.0171, -0.0011,  0.0206],
        [-0.0112,  0.0569, -0.0353,  ..., -0.0317,  0.0001,  0.0343],
        [-0.0169,  0.0348, -0.0211,  ..., -0.0133,  0.0104,  0.0102]]) tensor([ 6,  2, -1,  ..., -1,  6,  3])


In [9]:
from torch import nn
from torch.nn import functional as F

target_classes = 8

class EmbeddingClassifier(nn.Module):
    def __init__(self):
        super(EmbeddingClassifier, self).__init__()
        self.seq = nn.Sequential(
            nn.Linear(embed_len, 256),
            nn.ReLU(),

            nn.Linear(256,target_classes),
        )

    def forward(self, X_batch):
        x = self.seq(X_batch)
        return torch.log_softmax(x, dim=1)

In [10]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import gc

def CalcValLossAndAccuracy(model, loss_fn, val_loader):
    with torch.no_grad():
        Y_shuffled, Y_preds, losses = [],[],[]
        for X, Y in val_loader:
            preds = model(X)
            loss = loss_fn(preds, Y)
            losses.append(loss.item())

            Y_shuffled.append(Y)
            Y_preds.append(preds.argmax(dim=-1))

        Y_shuffled = torch.cat(Y_shuffled)
        Y_preds = torch.cat(Y_preds)

        print("Valid Loss : {:.3f}".format(torch.tensor(losses).mean()))
        print("Valid Acc  : {:.3f}".format(accuracy_score(Y_shuffled.detach().numpy(), Y_preds.detach().numpy())))

def TrainModel(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
    for i in range(1, epochs+1):
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds = model(X)

            loss = loss_fn(Y_preds, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

#         if i%5==0:
        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        CalcValLossAndAccuracy(model, loss_fn, val_loader)

In [11]:
from torch.optim import Adam

epochs = 1
learning_rate = 1e-3

loss_fn = nn.CrossEntropyLoss()
embed_classifier = EmbeddingClassifier()
optimizer = Adam(embed_classifier.parameters(), lr=learning_rate)

TrainModel(embed_classifier, loss_fn, optimizer, train_loader, test_loader, epochs)

100%|██████████████████████████████████████████████████████████████████████████████████| 25/25 [01:45<00:00,  4.24s/it]


Train Loss : 2.046
Valid Loss : 2.024
Valid Acc  : 0.234


In [12]:
def MakePredictions(model, loader):
    Y_shuffled, Y_preds = [], []
    for X, Y in loader:
        preds = model(X)
        Y_preds.append(preds)
        Y_shuffled.append(Y)
    gc.collect()
    Y_preds, Y_shuffled = torch.cat(Y_preds), torch.cat(Y_shuffled)

    return Y_shuffled.detach().numpy(), F.softmax(Y_preds, dim=-1).argmax(dim=-1).detach().numpy()

Y_actual, Y_preds = MakePredictions(embed_classifier, test_loader)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Test Accuracy : {}".format(accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds))

## Logistic Regression

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
import re

In [35]:
from sklearn import metrics
from sklearn.metrics import accuracy_score,roc_auc_score
def modelEvaluation(predictions):
    '''
    Print model evaluation to predicted result 
    '''
    print ("\nAccuracy on validation set: {:.4f}".format(accuracy_score(y_test, predictions)))
    print("\nClassification report : \n", metrics.classification_report(y_test, predictions))
    print("\nConfusion Matrix : \n", metrics.confusion_matrix(y_test, predictions))

In [36]:
df_test = pd.read_csv('test_data.csv')
df_train = pd.read_csv('train_data.csv')

In [37]:
for i, review in enumerate(df_train['text']):
    review = re.sub(r'\W', ' ', review)
    review = review.lower()
    review = re.sub(r'^br$', ' ', review)
    review = re.sub(r'\s+br\s+',' ',review)
    review = re.sub(r'\s+[a-z]\s+', ' ',review)
    review = re.sub(r'^b\s+', '', review)
    review = re.sub(r'\s+', ' ', review)
    df_train.loc[0, 'text'] = review

for i, review in enumerate(df_test['text']):
    review = re.sub(r'\W', ' ', review)
    review = review.lower()
    review = re.sub(r'^br$', ' ', review)
    review = re.sub(r'\s+br\s+',' ',review)
    review = re.sub(r'\s+[a-z]\s+', ' ',review)
    review = re.sub(r'^b\s+', '', review)
    review = re.sub(r'\s+', ' ', review)
    df_test.loc[0, 'text'] = review

In [38]:
X_train = df_train['text']
y_train = df_train['rating']

X_test = df_test['text']
y_test = df_test['rating']

In [39]:
from sklearn.linear_model import LogisticRegression
tfidf = TfidfVectorizer(min_df=5) #minimum document frequency of 5
X_train_tfidf = tfidf.fit_transform(X_train)

Number of features : 27272 

Show some feature names : 
 ['00' 'alternatively' 'baked' 'bothersome' 'centers' 'complicit' 'cuties'
 'disgraced' 'elsewhere' 'fat' 'gainey' 'hamburger' 'ideally' 'ive' 'leer'
 'martians' 'mower' 'opponents' 'picaresque' 'prude' 'repairs' 'saruman'
 'silverman' 'stands' 'talk' 'trenches' 'verify' 'wreak']


In [52]:
logreg = LogisticRegression(max_iter=1000, class_weight = 'balanced')

nb = BernoulliNB()


models = [logreg, nb]

i = 0
for model in models:
    model.fit(X_train_tfidf, y_train)
    predictions = model.predict(tfidf.transform(X_test))
    modelEvaluation(predictions)


Accuracy on validation set: 0.5653

Classification report : 
               precision    recall  f1-score   support

           1       0.74      0.68      0.71     10122
           2       0.45      0.50      0.47      4586
           3       0.47      0.48      0.48      4961
           4       0.50      0.55      0.53      5331
           7       0.49      0.55      0.52      4803
           8       0.51      0.47      0.49      5859
           9       0.45      0.49      0.47      4608
          10       0.68      0.63      0.65      9731

    accuracy                           0.57     50001
   macro avg       0.54      0.54      0.54     50001
weighted avg       0.57      0.57      0.57     50001


Confusion Matrix : 
 [[6905 1236  795  592  135  120  100  239]
 [ 965 2275  533  491  103   53   57  109]
 [ 602  619 2369  805  232  115   73  146]
 [ 403  467  676 2951  392  193  122  127]
 [  66  110  189  398 2658  580  379  423]
 [  93  101  169  267  895 2745  707  882]
 [  70

In [53]:
feature_names = np.array(tfidf.get_feature_names_out())
sorted_coef_index = logreg.coef_[0].argsort()
print('\nWith smallest coefficients :\n{}\n'.format(feature_names[sorted_coef_index[:34]]))
print('With largest coefficients : \n{}'.format(feature_names[sorted_coef_index[:-34:-1]]))


With smallest coefficients :
['great' 'but' 'best' 'excellent' 'also' 'good' 'love' 'very' 'pretty'
 'and' 'well' 'enjoyed' 'fun' 'little' 'perfect' 'story' 'quite'
 'definitely' 'today' 'it' 'nice' 'his' 'overall' 'played' 'although'
 'young' 'much' 'always' 'recommend' 'amazing' 'though' 'favorite'
 'wonderful' 'fantastic']

With largest coefficients : 
['worst' 'awful' 'bad' 'terrible' 'waste' 'avoid' 'money' 'even' 'ever'
 'horrible' 'this' 'no' 'boring' 'crap' 'stupid' 'garbage' 'nothing'
 'worse' 'minutes' 'ridiculous' 'acting' 'they' 'poor' 'pathetic' 'badly'
 'any' 'pointless' 'rubbish' 'would' 'trash' 'piece' 'should' 'dreadful']


In [55]:
pickle.dump(tfidf,open('tfidf.pkl','wb'))
pickle.dump(logreg,open('logisticRegression.pkl','wb'))