<h3>Imports</h3>

In [1]:
from bs4 import BeautifulSoup
import xgboost as xgb
import re
import nltk
from nltk.tokenize import ToktokTokenizer
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
import unicodedata
import csv
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Paths to the training and test CSV files. Place both files in the same folder
# as this notebook, or replace these names with full paths to the two files.
train_filename = "Trainset.csv"
test_filename = "Testset without answer.csv"

<h3>Preprocessing Methods</h3>

In [2]:
def preprocess_all_texts(texts):
    """Apply the full preprocessing pipeline to every text.

    Parameters
    ----------
    texts : object exposing ``.tolist()`` (e.g. a NumPy array of strings)
        The raw texts to clean.

    Returns
    -------
    list of str
        One preprocessed string per input text, in the original order.
    """
    # Joining a one-element list simply yields that element back; the step is
    # kept so each result is produced exactly as before.
    return [' '.join([preprocess_one_text(raw)]) for raw in texts.tolist()]

def preprocess_one_text(text):
    """Clean a single text by running it through every preprocessing step.

    The steps run in a fixed order: accent removal, special-character
    removal, whitespace normalisation, number removal, lower-casing,
    stemming and finally stop-word removal.

    Parameters
    ----------
    text : str
        The raw text.

    Returns
    -------
    str
        The fully preprocessed text.
    """
    # NOTE(review): stop words are filtered after stemming, so a stemmed token
    # that no longer matches an entry of the stop-word list would be kept.
    pipeline = (
        remove_accented_chars,
        remove_special_characters,
        remove_extra_whitespace_tabs,
        remove_numbers,
        to_lowercase,
        get_stem,
        remove_stopwords,
    )
    for step in pipeline:
        text = step(text)
    return text
    
def remove_extra_whitespace_tabs(text):
    """Collapse every run of whitespace to one space and trim both ends.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` with whitespace runs replaced by a single space and no
        leading or trailing whitespace.
    """
    whitespace_runs = re.compile(r'^\s*|\s\s*')
    collapsed = whitespace_runs.sub(' ', text)
    # The pattern also matches at the very start, which inserts a leading
    # space; strip() removes it again together with any trailing space.
    return collapsed.strip()

def to_lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_accented_chars(text):
    """Strip accents by decomposing characters and dropping non-ASCII parts.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` with only the ASCII characters that remain after NFKD
        normalisation.
    """
    # NFKD splits an accented letter into a base letter plus combining marks.
    decomposed = unicodedata.normalize('NFKD', text)
    # Anything that cannot be encoded as ASCII (the marks) is silently dropped.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def remove_special_characters(text):
    """Delete every character that is not a letter, digit, listed punctuation or whitespace.

    The characters kept are ASCII letters, digits, ``. , ! ? / : ; " '`` and
    whitespace.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` without the disallowed characters.
    """
    # The upper-case range must be A-Z: the range A-z also spans the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, '', text)

def remove_numbers(text):
    """Delete digits (and any other character outside the allowed set).

    The characters kept are ASCII letters, ``. , ! ? / : ; " '`` and
    whitespace; everything else, digits included, is removed.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` without digits or other disallowed characters.
    """
    # The upper-case range must be A-Z: the range A-z also spans the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
    pattern = r'[^a-zA-Z.,!?/:;\"\'\s]'
    return re.sub(pattern, '', text)

def get_stem(text):
    """Stem every whitespace-separated word of ``text`` with a Porter stemmer.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    porter = nltk.porter.PorterStemmer()
    stemmed_words = map(porter.stem, text.split())
    return ' '.join(stemmed_words)

def remove_stopwords(text):
    """Drop every token whose lower-cased form is in ``stopword_list``.

    The text is split with the module-level ``tokenizer``; surviving tokens
    are joined back together with single spaces.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` without stop words.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        if token.lower() not in stopword_list:
            kept.append(token)
    return ' '.join(kept)

<h3>Helper Methods to make code cleaner</h3>

In [3]:
# Get the texts and labels from the training data as a tuple of 2 arrays
def get_train_data(filename):
    """Read the training CSV and return its reviews and ratings.

    Parameters
    ----------
    filename : str
        Path to a CSV file with a header row containing the columns
        ``rating`` and ``review``. The file is decoded as ISO-8859-1.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(texts, labels)``: the ``review`` column and the ``rating`` column,
        in file order.

    Raises
    ------
    KeyError
        If a row lacks the ``rating`` or ``review`` column.
    """
    # newline='' hands line handling to the csv module, as its documentation
    # requires; otherwise line breaks embedded in quoted reviews are
    # translated before the reader sees them.
    with open(filename, 'r', encoding="ISO-8859-1", newline='') as file:
        labels, texts = [], []
        for row in csv.DictReader(file):
            labels.append(row["rating"])
            texts.append(row["review"])
    return (np.array(texts), np.array(labels))
    
# Get the texts and ids from the test data as a tuple of 2 arrays
def get_test_data(filename):
    """Read the test CSV and return its reviews and ids.

    Parameters
    ----------
    filename : str
        Path to a CSV file with a header row containing the columns ``id``
        and ``review``. The file is decoded as ISO-8859-1.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(texts, ids)``: the ``review`` column and the ``id`` column, in
        file order.

    Raises
    ------
    KeyError
        If a row lacks the ``id`` or ``review`` column.
    """
    # newline='' hands line handling to the csv module, as its documentation
    # requires; otherwise line breaks embedded in quoted reviews are
    # translated before the reader sees them.
    with open(filename, 'r', encoding="ISO-8859-1", newline='') as file:
        texts, ids = [], []
        for row in csv.DictReader(file):
            ids.append(row["id"])
            texts.append(row["review"])
    return (np.array(texts), np.array(ids))

# Write the given ids array and predictions array to the given filename
def write_csv(ids, pred, filename):
    """Write ``id``/``rating`` pairs to a CSV file with a header row.

    Parameters
    ----------
    ids : array-like
        One identifier per prediction.
    pred : array-like
        Predicted ratings, the same length as ``ids``.
    filename : str
        Destination path; an existing file is overwritten.
    """
    with open(filename, 'w', newline='') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(["id", "rating"])
        # Stack ids and predictions as two rows, then transpose so that each
        # row of the result is one (id, rating) pair.
        stacked = np.vstack((ids, pred))
        csv_out.writerows(np.transpose(stacked))
        print("Wrote to a CSV file!")

# Encode the labels (poor, average, good) to (0, 1, 2)
def encode(labels):
    """Map rating names to integer class ids.

    Parameters
    ----------
    labels : numpy.ndarray
        Array of rating names: ``"poor"``, ``"average"`` or ``"good"``.

    Returns
    -------
    numpy.ndarray
        ``int64`` array with 0 for poor, 1 for average and 2 for good.
    """
    for rating_name, class_id in (("poor", 0), ("average", 1), ("good", 2)):
        labels = np.where(labels == rating_name, class_id, labels)
    # The replacements leave the codes in place of the names; the final cast
    # converts every entry to a 64-bit integer.
    return labels.astype(np.int64)

# Decode the labels (0, 1, 2) to (poor, average, good)
def decode(labels):
    """Map integer class ids back to rating names.

    Parameters
    ----------
    labels : numpy.ndarray
        Array of class ids 0, 1 or 2.

    Returns
    -------
    numpy.ndarray
        String array with ``"poor"`` for 0, ``"average"`` for 1 and
        ``"good"`` for 2.
    """
    # Work on the string form so ids and names can share one array.
    labels = labels.astype(str)
    for class_id, rating_name in (("0", "poor"), ("1", "average"), ("2", "good")):
        labels = np.where(labels == class_id, rating_name, labels)
    return labels


# Get the feature vector depending on various parameters
def get_features(texts, vocab, vec_type, features, min_gram, max_gram):
    """Vectorize ``texts`` with a vectorizer fitted on ``vocab``.

    Parameters
    ----------
    texts : iterable of str
        Documents to transform into feature vectors.
    vocab : iterable of str
        Documents the vectorizer is fitted on.
    vec_type : str
        ``'c'`` for ``CountVectorizer`` or ``'t'`` for ``TfidfVectorizer``.
    features : int
        Passed to the vectorizer as ``max_features``.
    min_gram, max_gram : int
        Lower and upper bound of the vectorizer's ``ngram_range``.

    Returns
    -------
    numpy.ndarray
        Dense feature matrix with one row per entry of ``texts``.

    Raises
    ------
    ValueError
        If ``vec_type`` is neither ``'c'`` nor ``'t'``.
    """
    if vec_type == 'c':
        print("using COUNT")
        vectorizer = CountVectorizer(ngram_range=(min_gram, max_gram), max_features=features)
    elif vec_type == 't':
        print("using TFFF")
        vectorizer = TfidfVectorizer(ngram_range=(min_gram, max_gram), max_features=features)
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError("vec_type must be 'c' (count) or 't' (tf-idf), got %r" % (vec_type,))
    vectorizer.fit(vocab)
    # transform() returns a sparse matrix; callers expect a dense array.
    return vectorizer.transform(texts).toarray()

# Get the vectorizer (useful for analyzing the type of words in the features)
def get_vectorizer(texts, vocab, vec_type, features, min_gram, max_gram):
    """Build a vectorizer of the requested type and fit it on ``vocab``.

    Parameters
    ----------
    texts : iterable of str
        Not used by this function.
    vocab : iterable of str
        Documents the vectorizer is fitted on.
    vec_type : str
        ``'c'`` for ``CountVectorizer`` or ``'t'`` for ``TfidfVectorizer``.
    features : int
        Passed to the vectorizer as ``max_features``.
    min_gram, max_gram : int
        Lower and upper bound of the vectorizer's ``ngram_range``.

    Returns
    -------
    CountVectorizer or TfidfVectorizer
        The fitted vectorizer.

    Raises
    ------
    ValueError
        If ``vec_type`` is neither ``'c'`` nor ``'t'``.
    """
    # Compare the vec_type argument; the builtin ``type`` never equals 'c' or
    # 't', so no vectorizer would ever be created.
    if vec_type == 'c':
        print("using COUNT")
        vectorizer = CountVectorizer(ngram_range=(min_gram, max_gram), max_features=features)
    elif vec_type == 't':
        print("using TFFF")
        vectorizer = TfidfVectorizer(ngram_range=(min_gram, max_gram), max_features=features)
    else:
        raise ValueError("vec_type must be 'c' (count) or 't' (tf-idf), got %r" % (vec_type,))
    vectorizer.fit(vocab)
    return vectorizer
    

    
# Sample classifiers for Random Forest and Gradient Boosting
# clf = RandomForestClassifier(n_estimators=200, criterion="gini", max_depth=1000, random_state=0)

# clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100,  criterion='friedman_mse', max_depth=3, random_state=0)

# clf = GradientBoostingClassifier(n_estimators=10, learning_rate=0.1, criterion='mse', max_depth=3, random_state=0)



<h3>Method for trying out various configurations of the vectorizer</h3>
<p>Each configuration's cross-validated accuracy on the training set is written to a CSV file alongside the number of features, the n-gram range, and the vectorizer type, which makes it easy to search for the best-performing settings. Those settings were then used to train the models that predict labels for the test set.</p>

In [4]:
def try_everything(clf=None, results_filename="CompNBModels.csv"):
    """Grid-search vectorizer settings and log cross-validated accuracy to CSV.

    For every combination of vectorizer type, ``max_features`` value and
    n-gram range, the training texts are vectorized, ``clf`` is evaluated
    with 4-fold cross validation, and one row
    ``[vectorizer, features, "min|max", mean accuracy]`` is appended to
    ``results_filename``.

    Parameters
    ----------
    clf : estimator, optional
        Classifier handed to ``cross_val_score``. Defaults to a new
        ``ComplementNB()``.
    results_filename : str, optional
        CSV file the results are appended to.
    """
    if clf is None:
        clf = ComplementNB()  # Can use any algorithm here
    texts, labels = get_train_data(train_filename)
    # The encoded labels do not depend on the vectorizer settings, so they are
    # computed once instead of once per configuration.
    train_Y = encode(labels)
    vectorizers = ['c', 't']
    min_g = [0, 1]
    with open(results_filename, 'a', newline='') as file:
        writer = csv.writer(file)
        # Append mode starts at the end of the file, so offset 0 means the
        # file is new or empty. Writing the header only then keeps repeated
        # runs from scattering header rows through the data.
        if file.tell() == 0:
            writer.writerow(["vectorizer", "features", "ngrams", "accuracy"])
        for vec in vectorizers:  # Vectorizer is either Count ('c') or TF-IDF ('t')
            for f in range(2000, 4001, 2000):  # max_features: 2000 and 4000
                for m in min_g:  # lower n-gram bound: 0 or 1
                    for ng in range(1, 11):  # upper n-gram bound: 1 to 10
                        train_X = get_features(texts, texts, vec, f, m, ng)
                        scores = cross_val_score(clf, train_X, train_Y, cv=4)
                        acc = scores.mean()
                        writer.writerow([vec, f, str(m)+'|'+str(ng), acc])

<h3>Data Preparation and Choosing Vectorizer Configurations</h3>

In [5]:
# Vectorizer configuration: 'c' selects CountVectorizer in get_features.
vec_type = 'c'
# Passed to the vectorizer as max_features.
features = 31000
# Lower and upper bound of the n-gram range.
min_n, max_n = 0, 4
texts, labels = get_train_data(train_filename)
# Preprocessing is deliberately skipped because it lowered accuracy.
# texts = preprocess_all_texts(texts)
# The vectorizer is fitted on the training texts and applied to those same texts.
train_X, train_Y = get_features(texts, texts, vec_type, features, min_n, max_n), encode(labels)
# get_test_data returns (texts, ids); only the texts are needed here.
raw_test_X = get_test_data(test_filename)[0]
# Preprocessing is deliberately skipped because it lowered accuracy.
# raw_test_X = preprocess_all_texts(raw_test_X)
# The test texts are transformed by a vectorizer fitted on the training texts.
test_X = get_features(raw_test_X, texts, vec_type, features, min_n, max_n)


using COUNT
using COUNT


<h3>Cross validation on training set</h3>

In [6]:
# 4-fold cross validation of ComplementNB on the training features.
clf = ComplementNB()
scores = cross_val_score(clf, train_X, train_Y, cv=4)
# Show the per-fold scores followed by their mean.
print(scores)
print(scores.mean())

[0.75844156 0.75324675 0.78571429 0.74382315]
0.7603064360866703


<h3>Training + Predicting + Writing predictions to CSV file</h3>

In [7]:
# Fit on the full training set, predict the test set and save the decoded labels.
clf = ComplementNB()
clf.fit(train_X, train_Y)
pred = clf.predict(test_X)
# Convert the integer class ids back to rating names.
d_pred = decode(pred)
# get_test_data returns (texts, ids); only the ids are needed here.
test_ids = get_test_data(test_filename)[1]
write_csv(test_ids, d_pred, "submission.csv")

Wrote to a CSV file!
