In [5]:
import re
import os
from tqdm import tqdm

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt  
import seaborn as sns  
sns.set_style("darkgrid")

import nltk
from nltk.corpus import stopwords                                    # Stopwords corpus
from nltk.stem import PorterStemmer                                  # Stemmer

from sklearn import model_selection, preprocessing, linear_model, metrics, svm
from sklearn import decomposition, ensemble

from sklearn.feature_extraction.text import CountVectorizer          # For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer          # For TF-IDF
from gensim.models import Word2Vec                                   # For Word2Vec

import warnings
warnings.filterwarnings("ignore")

In [6]:
os.listdir('.')

['test01.ipynb', 'test.csv', 'readme.txt', 'train.csv', '.ipynb_checkpoints']

## Preparation

In [7]:
# Input Dataset
test_file = pd.read_csv('test.csv', header=None)
train_file = pd.read_csv('train.csv', header=None)

In [8]:
# Merge title and content
test_file['Comment'] = test_file[1]+' '+test_file[2]
train_file['Comment'] = train_file[1]+' '+train_file[2]
test_file = test_file.drop(columns=[1,2])
train_file = train_file.drop(columns=[1,2])

In [9]:
# Edit label
train_file[0] = train_file[0].map(lambda x: x-1)
test_file[0] = test_file[0].map(lambda x: x-1)

In [10]:
train_file[:5]

Unnamed: 0,0,Comment
0,1,Stuning even for the non-gamer This sound trac...
1,1,The best soundtrack ever to anything. I'm read...
2,1,Amazing! This soundtrack is my favorite music ...
3,1,Excellent Soundtrack I truly like this soundtr...
4,1,"Remember, Pull Your Jaw Off The Floor After He..."


In [11]:
test_file[:5]

Unnamed: 0,0,Comment
0,1,Great CD My lovely Pat has one of the GREAT vo...
1,1,One of the best game music soundtracks - for a...
2,0,Batteries died within a year ... I bought this...
3,1,"works fine, but Maha Energy is better Check ou..."
4,1,Great for the non-audiophile Reviewed quite a ...


In [12]:
stop = set(stopwords.words('english'))
print(stop)

{'couldn', "she's", "wasn't", 'all', 'some', 'can', 'is', 'or', 'will', 'needn', 'him', 'ours', 'didn', 'should', "weren't", 'for', 'ain', 'now', 'under', 's', 'whom', 'during', 'above', 'shouldn', 'themselves', "you've", 'were', "hasn't", 'these', 'me', 'off', 'to', 'not', 'down', 'the', "aren't", 'very', 'her', 'doing', 'yours', 'and', "it's", 'our', 'so', 'most', 'hadn', 'don', 'out', 'up', 'are', 'other', 'your', 'only', 'them', 'as', 'below', 'each', 'just', 'than', "couldn't", 'once', "that'll", 'himself', 'those', 'has', 'y', 'there', 'nor', "doesn't", "didn't", 'where', 'why', 'she', 'same', 'both', 'own', 'have', 'they', "don't", 'wasn', 'against', 'such', 'again', 'from', "wouldn't", 'you', 'being', 'mightn', 'then', 'on', 'over', 'before', 'its', 'yourself', 'do', 'through', 'isn', 'by', 'he', 'shan', 'what', 'be', "shouldn't", "you'd", "shan't", 'was', "isn't", 'does', 'how', 'if', 'no', 'am', 're', 'a', 'having', 'aren', 'his', 'at', "you're", 'herself', 'o', "hadn't", 'wi

In [13]:
# Preprocessing
def prepr(X):
    temp =[]
    snow = nltk.stem.SnowballStemmer('english')
    for sentence in tqdm(X):
        # Converting to lowercase
        sentence = str(sentence).lower()
        # Removing HTML tags
        cleanr = re.compile('<.*?>')
        sentence = re.sub(cleanr, ' ', sentence)
        sentence = re.sub(r'[?|!|\'|"|#]',r'',sentence)
        # Removing Punctuations
        sentence = re.sub(r'[.|,|)|(|\|/]',r' ',sentence)
        # Stemming and removing stopwords
        words = ' '.join([snow.stem(word) for word in sentence.split() if word not in stopwords.words('english')])
        temp.append(words)

    return temp

In [14]:
# 10000 size sample
train_s_X = prepr(train_file['Comment'][:10000])

100%|██████████| 10000/10000 [02:12<00:00, 75.31it/s]


In [15]:
train_s_X[:2]

['stune even non-gam sound track beauti paint seneri mind well would recomend even peopl hate vid game music play game chrono cross game ever play best music back away crude keyboard take fresher step grate guitar soul orchestra would impress anyon care listen ^_^',
 'best soundtrack ever anyth im read lot review say best game soundtrack figur id write review disagre bit opinino yasunori mitsuda ultim masterpiec music timeless im listen year beauti simpli refus fade price tag pretti stagger must say go buy cd much money one feel would worth everi penni']

In [16]:
trainDF = pd.DataFrame()
trainDF['text'] = train_s_X
trainDF['label'] = list(train_file[0][:10000])
trainDF.head(10)

Unnamed: 0,text,label
0,stune even non-gam sound track beauti paint se...,1
1,best soundtrack ever anyth im read lot review ...,1
2,amaz soundtrack favorit music time hand intens...,1
3,excel soundtrack truli like soundtrack enjoy v...,1
4,rememb pull jaw floor hear youv play game know...,1
5,absolut masterpiec quit sure actual take time ...,1
6,buyer bewar self-publish book want know why--r...,0
7,glorious stori love whisper wick saint stori a...,1
8,five star book finish read whisper wick saint ...,1
9,whisper wick saint easi read book made want ke...,1


In [17]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [18]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

## Training (Traditional ML)

In [19]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [20]:
# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors: ", accuracy)

LR, WordLevel TF-IDF:  0.8552
LR, N-Gram Vectors:  0.7864
LR, CharLevel Vectors:  0.8256


In [21]:
# SVM on on Word Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)
print("SVM, WordLevel TF-IDF: ", accuracy)

# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVM, N-Gram Vectors: ", accuracy)

# SVM on Character Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("SVM, CharLevel Vectors: ", accuracy)

SVM, WordLevel TF-IDF:  0.5088
SVM, N-Gram Vectors:  0.5088
SVM, CharLevel Vectors:  0.5088


## Training (NN)

In [22]:
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


In [None]:
# Unigram 
count_vect = CountVectorizer(max_features=10000) # top 10000 most frequently repeated words 
small_sample_1N = count_vect.fit_transform(small_sample)
print(small_sample_1N[0])

In [None]:
# Bigram
count_vect = CountVectorizer(ngram_range=(1,2), max_features=10000)
small_sample_2N = count_vect.fit_transform(small_sample)
print(small_sample_2N[0])

In [None]:
# Trigram
count_vect = CountVectorizer(ngram_range=(1,3), max_features=10000)
small_sample_3N = count_vect.fit_transform(small_sample)
print(small_sample_3N[0])

In [None]:
# TF-IDF
tf_idf = TfidfVectorizer(max_features=10000)
small_sample_TF = tf_idf.fit_transform(small_sample)
print(small_sample_TF[0])

In [None]:
# Word2Vec
small_sample_splitted = []
for row in tqdm(small_sample): 
    small_sample_splitted.append([word for word in row.split()])                    # splitting words

small_sample_w2v = Word2Vec(small_sample_splitted, min_count=5, size=50, workers=8) # 50 hidden layers

In [None]:
# Average Word2Vec
small_sample_avg_w2v = []
for row in tqdm(small_sample_splitted):
    vec = np.zeros(50)
    count = 0
    for word in row:
        try:
            vec += small_sample_w2v[word]
            count += 1
        except:
            pass
    small_sample_avg_w2v.append(vec/count)
    
print(small_sample_avg_w2v[0])

In [None]:
# TF-IDF Word2Vec

small_sample_TF_w2v = []
small_sample_TF_data = small_sample_TF.toarray()
i = 0
for row in tqdm(small_sample_splitted):
    vec = [0 for i in range(50)]
    
    temp_tf = []
    for val in small_sample_TF_data[i]:
        if val != 0:
            temp_tf.append(val)
    
    count = 0
    tf_idf_sum = 0
    for word in row:
        try:
            count += 1
            tf_idf_sum += temp_tf[count-1]
            vec += (temp_tf[count-1] * small_sample_w2v[word])
        except:
            pass
    #print(tf_idf_sum, vec)
    vec = float(1 / tf_idf_sum) * vec
    small_sample_TF_w2v.append(vec)
    i = i + 1

print(small_sample_TF_w2v[0])

In [None]:
BoW1N_DF = pd.DataFrame()
BoW1N_DF['label'] = small_sample_label
BoW1N_DF['text'] = list(small_sample_1N)

BoW2N_DF = pd.DataFrame()
BoW2N_DF['label'] = small_sample_label
BoW2N_DF['text'] = list(small_sample_2N)

BoW3N_DF = pd.DataFrame()
BoW3N_DF['label'] = small_sample_label
BoW3N_DF['text'] = list(small_sample_3N)

tf_DF = pd.DataFrame()
tf_DF['label'] = small_sample_label
tf_DF['text'] = list(small_sample_TF)

w2v_DF = pd.DataFrame()
w2v_DF['label'] = small_sample_label
w2v_DF['text'] = small_sample_avg_w2v

tf_w2v_DF = pd.DataFrame()
tf_w2v_DF['label'] = small_sample_label
tf_w2v_DF['text'] = small_sample_TF_w2v

In [None]:
encoder = preprocessing.LabelEncoder()

BoW1N_train_x, BoW1N_valid_x, BoW1N_train_y, BoW1N_valid_y = model_selection.train_test_split(BoW1N_DF['text'], BoW1N_DF['label'])
BoW1N_train_y = encoder.fit_transform(BoW1N_train_y)
BoW1N_valid_y = encoder.fit_transform(BoW1N_valid_y)

BoW2N_train_x, BoW2N_valid_x, BoW2N_train_y, BoW2N_valid_y = model_selection.train_test_split(BoW2N_DF['text'], BoW2N_DF['label'])
BoW2N_train_y = encoder.fit_transform(BoW2N_train_y)
BoW2N_valid_y = encoder.fit_transform(BoW2N_valid_y)

BoW3N_train_x, BoW3N_valid_x, BoW3N_train_y, BoW3N_valid_y = model_selection.train_test_split(BoW3N_DF['text'], BoW3N_DF['label'])
BoW3N_train_y = encoder.fit_transform(BoW3N_train_y)
BoW3N_valid_y = encoder.fit_transform(BoW3N_valid_y)

tf_train_x, tf_valid_x, tf_train_y, tf_valid_y = model_selection.train_test_split(tf_DF['text'], tf_DF['label'])
tf_train_y = encoder.fit_transform(tf_train_y)
tf_valid_y = encoder.fit_transform(tf_valid_y)

w2v_train_x, w2v_valid_x, w2v_train_y, w2v_valid_y = model_selection.train_test_split(w2v_DF['text'], w2v_DF['label'])
w2v_train_y = encoder.fit_transform(w2v_train_y)
w2v_valid_y = encoder.fit_transform(w2v_valid_y)

tf_w2v_train_x, tf_w2v_valid_x, tf_w2v_train_y, tf_w2v_valid_y = model_selection.train_test_split(tf_w2v_DF['text'], tf_w2v_DF['label'])
tf_w2v_train_y = encoder.fit_transform(tf_w2v_train_y)
tf_w2v_valid_y = encoder.fit_transform(tf_w2v_valid_y)

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # Fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # Predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [None]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), BoW1N_train_x, BoW1N_train_y, BoW1N_valid_x)
print("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors: ", accuracy)