In [9]:
import numpy as np
import json
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize,RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from sklearn.naive_bayes import MultinomialNB
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from nltk.sentiment.util import mark_negation
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
import re

In [10]:
def load_pickle(filename):
    """Return the object stored in the pickle file at ``filename``.

    NOTE: unpickling executes code embedded in the file, so only use this on
    files produced by this notebook or another trusted source.
    """
    with open(filename, mode="rb") as source:
        return pickle.Unpickler(source).load()

def dump_pickle(dump,filename):
    """Pickle ``dump`` into the file at ``filename``, replacing any existing content."""
    with open(filename, mode="wb") as sink:
        pickle.Pickler(sink).dump(dump)

In [11]:
# Toy-sized input paths; both names are reassigned to the full data set in the following cell.
train_file='data/A1_Data/toy.json'
dev_file='data/A1_Data/toy.json'
# Pickle file names for tokenized data (not referenced by any other visible cell).
tokenized_train="tokenized_train.pkl"
tokenized_dev="tokenized_dev.pkl"
# Progress interval: clean_data and lemmatize_tokenize print their counter every `log` items.
log=10**5

In [12]:
# Full train/dev paths; these replace the toy paths assigned in the previous cell.
train_file='data/A1_Data/train.json'
dev_file='data/A1_Data/dev.json'

In [5]:
def read_data(filename):
    """Load reviews and ratings from a JSON-lines file.

    Every non-blank line must hold one JSON object with a 'review' and a
    'ratings' key.

    Args:
        filename: path of the file to read.

    Returns:
        A tuple ``(x, y)``: the list of 'review' values and the list of
        'ratings' values, both in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks 'review' or 'ratings'.
    """
    x=[]
    y=[]

    # The context manager closes the handle even if parsing fails part-way.
    with open(filename,"r") as file_reader:
        for line in file_reader:
            # Blank lines carry no record and would make json.loads raise.
            if not line.strip():
                continue
            mapping=json.loads(line)
            x.append(mapping['review'])
            y.append(mapping['ratings'])

    return x,y

In [6]:
# Read the (reviews, ratings) lists for the train and dev splits and report their sizes.
train_sent,train_y=read_data(train_file)
dev_sent,dev_y=read_data(dev_file)
print(len(train_sent),len(dev_sent))

1000000 200000


## Clean Data

In [29]:
def clean_data(data):
    """Replace emoticons with sentiment words and mark negated tokens.

    For every string in ``data``:
      1. emoticons are rewritten as words (':D', ':-D', ':-))' -> 'very good';
         ':)', ':-)' -> 'good'; ':(' -> 'bad'; ':p' -> 'tricky');
      2. commas and newlines become single spaces;
      3. the text is tokenized with ``nltk.word_tokenize`` and passed through
         ``mark_negation(double_neg_flip=True, shallow=True)``;
      4. the resulting tokens are joined back with single spaces.

    The running item count is printed every ``log`` items (``log`` is a
    notebook-level setting).

    Args:
        data: iterable of raw review strings.

    Returns:
        List of cleaned strings, one per input item, in input order.
    """
    # Order matters: the ':-))' rule must run before the ':-)' rule, otherwise
    # ':-))' is turned into 'good)' and never reaches 'very good'.
    # Raw strings avoid the invalid '\)' / '\(' escapes of plain literals.
    substitutions=(
        (r"(:D)|(:-\)\))|(:-D)","very good"),
        (r"(:\))|(:-\))","good"),
        (r":\(","bad"),
        (r":p","tricky"),
        (r"[,\n]"," "),
    )

    cleaned_data=[]
    for count,line in enumerate(data,start=1):
        if(count%log==0):
            print(count)
        purge=line
        for pattern,replacement in substitutions:
            purge=re.sub(pattern,replacement,purge)
        purge=" ".join(mark_negation(nltk.word_tokenize(purge),double_neg_flip=True,shallow=True))
        cleaned_data.append(purge)
    return cleaned_data

## TF-IDF Vectoriser

In [49]:
# One shared lemmatizer instance and a call counter used for progress output.
lemmatizer=WordNetLemmatizer()
counter=0
def lemmatize_tokenize(text):
    """Tokenize ``text`` and return the list of its lemmatized tokens.

    Side effect: increments the module-level ``counter`` on every call and
    prints it whenever it is a multiple of ``log``.
    """
    global counter
    counter=counter+1
    if not counter%log:
        print(counter)
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]

In [50]:
# vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1,2),stop_words='english',max_df = 0.85,tokenizer = lemmatize_tokenize)

## Processing Data

In [51]:
# cleaned_train_data=clean_data(train_sent)
# cleaned_dev_data=clean_data(dev_sent)

## Saving Cleaned Data

In [52]:
# dump_pickle(cleaned_train_data,"cleaned_train.pkl")
# dump_pickle(cleaned_dev_data,"cleaned_dev.pkl")

## Load Cleaned Data

In [53]:
# Cleaned text is read from disk; the cells that create these pickles are commented out above.
train_x=load_pickle("cleaned_train.pkl")
dev_x=load_pickle("cleaned_dev.pkl")

In [54]:
# Number of cleaned documents in the train and dev splits.
print(len(train_x),len(dev_x))

1000000 200000


## TF-IDF Vectoriser

In [71]:
# vectorizer = TfidfVectorizer(tokenizer = lemmatize_tokenize,analyzer='word',ngram_range=(1,2),stop_words='english',max_df = 0.85)
# vectorizer.fit(train_x)
# train_x_tfidf=vectorizer.transform(train_x)
# dev_x_tfidf=vectorizer.transform(dev_x)
# train_x_tfidf,dev_x_tfidf

## Save TFIDF

In [58]:
# dump_pickle(train_x_tfidf,'train_tfidf.pkl')
# dump_pickle(dev_x_tfidf,'dev_tfidf.pkl')

## Load TF-IDF

In [59]:
# Rebind both names to None before loading (presumably to release earlier matrices first - confirm).
train_x_tfidf=None
dev_x_tfidf=None
# These pickles are written by the commented-out fit/transform and save cells above.
train_x_tfidf=load_pickle('train_tfidf.pkl')
dev_x_tfidf=load_pickle('dev_tfidf.pkl')
print(train_x_tfidf.shape,dev_x_tfidf.shape)

(1000000, 11357601) (200000, 11357601)


In [60]:
# Reference: https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/

In [61]:
def get_accuracy(gold,pred):
    """Return the fraction of positions at which ``gold`` and ``pred`` agree.

    Both inputs are converted to NumPy arrays first, so plain Python lists are
    compared element by element (``list == list`` would give a single bool).

    Args:
        gold: sequence of reference labels.
        pred: sequence of predicted labels, same shape as ``gold``.

    Returns:
        Number of matching positions divided by ``len(gold)``.

    Raises:
        ValueError: if ``gold`` and ``pred`` have different shapes.
    """
    gold_arr=np.asarray(gold)
    pred_arr=np.asarray(pred)
    if gold_arr.shape!=pred_arr.shape:
        raise ValueError("gold and pred must have the same shape, got %s and %s"
                         % (gold_arr.shape,pred_arr.shape))
    length=len(gold_arr)
    return np.sum(gold_arr==pred_arr)*1.0/length
def get_Fscore(gold,pred):
    """Return the ``(macro, micro)`` averaged F1 scores of ``pred`` against ``gold``."""
    macro_f1=f1_score(gold,pred,average='macro')
    micro_f1=f1_score(gold,pred,average='micro')
    return (macro_f1,micro_f1)

## General Fitter

In [62]:
def GeneralFitter(X,Y,obj):
    """Fit ``obj`` on ``(X, Y)``, print how long the fit took, and return ``obj``.

    Timing uses the standard ``time`` module instead of the ``%time`` line
    magic, so the function is plain Python and also works outside IPython.

    Args:
        X: training features, passed to ``obj.fit`` unchanged.
        Y: training targets, passed to ``obj.fit`` unchanged.
        obj: any object exposing ``fit(X, Y)``.

    Returns:
        The same ``obj``, after ``fit`` has been called on it.
    """
    import time  # local import keeps this helper self-contained

    wall_start=time.perf_counter()
    cpu_start=time.process_time()
    obj.fit(X,Y)
    cpu_elapsed=time.process_time()-cpu_start
    wall_elapsed=time.perf_counter()-wall_start
    print("CPU times: total: %.2f s" % cpu_elapsed)
    print("Wall time: %.2f s" % wall_elapsed)
    return obj

## Models

In [63]:
# nb=MultinomialNB()
# clf = LinearSVC(random_state=0, tol=1e-5,verbose = 5,max_iter=1000,class_weight='balanced')
# log_reg = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial', max_iter=100)
# reg = LinearRegression()

In [64]:
def get_stats(X,Y,dev_X,dev_Y,obj):
    """Print classification metrics of a fitted model on the train and dev splits.

    Printed, in order: train accuracy, dev accuracy, train (macro, micro) F1,
    dev (macro, micro) F1, dev mean squared error, train confusion matrix,
    dev confusion matrix.

    Args:
        X: training features.
        Y: training labels.
        dev_X: dev features.
        dev_Y: dev labels.
        obj: fitted model exposing ``predict``.
    """
    pred_X=obj.predict(X)
    pred_dev=obj.predict(dev_X)
    # Dev metrics are computed against the dev_Y argument rather than the
    # notebook-level dev_y variable, so the function honours what it is given.
    print(get_accuracy(Y,pred_X))
    print(get_accuracy(dev_Y,pred_dev))
    print(get_Fscore(Y,pred_X))
    print(get_Fscore(dev_Y,pred_dev))
    print(mean_squared_error(dev_Y,pred_dev))
    print(confusion_matrix(Y, pred_X))
    print(confusion_matrix(dev_Y, pred_dev))

In [65]:
def get_reg_stats(X,Y,dev_X,dev_Y,obj,min_rating=1,max_rating=5):
    """Print the train and dev mean squared error of a fitted regressor.

    Predictions are clipped to ``[min_rating, max_rating]`` before scoring.

    Args:
        X: training features.
        Y: training targets.
        dev_X: dev features.
        dev_Y: dev targets.
        obj: fitted model exposing ``predict``.
        min_rating: lower bound applied to predictions (default 1).
        max_rating: upper bound applied to predictions (default 5).
    """
    pred_X=np.clip(obj.predict(X),min_rating,max_rating)
    pred_dev=np.clip(obj.predict(dev_X),min_rating,max_rating)

    print(mean_squared_error(Y,pred_X))
    # Scored against the dev_Y argument, not the notebook-level dev_y variable.
    print(mean_squared_error(dev_Y,pred_dev))

## Feature filtering

In [85]:
# Keep the 200000 features that score highest on the chi-squared test against the training ratings.
selection_obj_tfidf=SelectKBest(chi2, k=200000)
# The selector is fitted on the training split only; the dev split is reduced to the same columns.
filtered_train_tfidf = selection_obj_tfidf.fit_transform(train_x_tfidf, train_y)
filtered_dev_tfidf=selection_obj_tfidf.transform(dev_x_tfidf)
# Bare expression so the notebook displays both filtered matrices.
filtered_train_tfidf,filtered_dev_tfidf

(<1000000x200000 sparse matrix of type '<class 'numpy.float64'>'
 	with 85817275 stored elements in Compressed Sparse Row format>,
 <200000x200000 sparse matrix of type '<class 'numpy.float64'>'
 	with 17144738 stored elements in Compressed Sparse Row format>)

In [86]:
# Shape of the dev matrix after feature selection.
print(filtered_dev_tfidf.shape)

(200000, 200000)


## Linear Regression

In [81]:
# Linear regression on the chi2-filtered features; n_jobs=-1 requests all available processors.
reg = LinearRegression(n_jobs=-1)
GeneralFitter(filtered_train_tfidf,train_y,reg)

CPU times: user 1h 20min 33s, sys: 2.23 s, total: 1h 20min 35s
Wall time: 23min 55s


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [82]:
# Prints the train MSE, then the dev MSE, of the linear regression.
get_reg_stats(filtered_train_tfidf,train_y,filtered_dev_tfidf,dev_y,reg)

0.43909863945742583
0.5550085894022871


In [90]:
# Ridge regression (alpha=2) fitted and scored on the chi2-filtered features.
ridge=linear_model.Ridge(alpha=2)
GeneralFitter(filtered_train_tfidf,train_y,ridge)
get_reg_stats(filtered_train_tfidf,train_y,filtered_dev_tfidf,dev_y,ridge)

CPU times: user 35.4 s, sys: 452 ms, total: 35.8 s
Wall time: 34.2 s
0.48160461722579856
0.5193662628538688


In [None]:
# Same Ridge configuration on the unfiltered TF-IDF matrices; this cell has no recorded output.
# Note: rebinds `ridge`, replacing the model fitted on the filtered features.
ridge=linear_model.Ridge(alpha=2)
GeneralFitter(train_x_tfidf,train_y,ridge)
get_reg_stats(train_x_tfidf,train_y,dev_x_tfidf,dev_y,ridge)