In [1]:
import numpy as np
import json
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize,RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from sklearn.naive_bayes import MultinomialNB
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from nltk.sentiment.util import mark_negation
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2,f_regression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
import re

In [2]:
def load_pickle(filename):
    with open(filename,"rb") as f:
        dump=pickle.load(f)
    return dump

def dump_pickle(dump,filename):
    with open(filename,"wb") as f:
        pickle.dump(dump,f)

In [3]:
train_file='data/A1_Data/toy.json'
dev_file='data/A1_Data/toy.json'
tokenized_train="tokenized_train.pkl"
tokenized_dev="tokenized_dev.pkl"
log=10**5

In [4]:
train_file='data/A1_Data/train.json'
dev_file='data/A1_Data/dev.json'

In [5]:
def read_data(filename):
    x=[]
    y=[]
    
    file_reader=open(filename,"r")
    for line in file_reader:
        mapping=json.loads(line)
        x.append(mapping['review'])
        y.append(mapping['ratings'])
    
    return x,y

In [6]:
train_sent,train_y=read_data(train_file)
dev_sent,dev_y=read_data(dev_file)
print(len(train_sent),len(dev_sent))

1000000 200000


## Clean Data

In [7]:
def clean_data(data):
    cleaned_data=[]
    count=0
    for line in data:
        count+=1
        if(count%log==0):
            print(count)
        purge=line
        purge=re.sub("((:\))|(:-\)))","good",purge)
        purge=re.sub("((:D)|(:-\)\)|(:-D)))","very good",purge)
        purge=re.sub("((:\())","bad",purge)
        purge=re.sub("((:p))","tricky",purge)
        purge=re.sub("((,)|(\n))"," ",purge)
        purge = mark_negation(nltk.word_tokenize(purge), double_neg_flip=True, shallow=True)
        new_purge=[]
        for x in purge:
            if x not in new_purge:
                new_purge.append(x)
        purge=new_purge
        purge= " ".join(purge)
        cleaned_data.append(purge)
    return cleaned_data

## TF-IDF Vectoriser

In [8]:
lemmatizer=WordNetLemmatizer()
counter=0
def lemmatize_tokenize(text):
    global counter
    counter+=1
    if(counter%log==0):
        print(counter)
    word_arr=[]
    for word in word_tokenize(text):
        word_arr.append(lemmatizer.lemmatize(word))
    return word_arr

In [9]:
# vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1,2),stop_words='english',max_df = 0.85,tokenizer = lemmatize_tokenize)

## Processing Data

In [10]:
# cleaned_train_data=clean_data(train_sent)
# cleaned_dev_data=clean_data(dev_sent)

## Saving Cleaned Data

In [11]:
# dump_pickle(cleaned_train_data,"cleaned_train.pkl")
# dump_pickle(cleaned_dev_data,"cleaned_dev.pkl")

## Loader Cleaned Data

In [12]:
train_x=load_pickle("cleaned_train.pkl")
dev_x=load_pickle("cleaned_dev.pkl")

In [13]:
print(len(train_x),len(dev_x))

1000000 200000


## TF-IDF Vectoriser

In [14]:
vectorizer = TfidfVectorizer(max_features=1000000,tokenizer = lemmatize_tokenize,analyzer='word',ngram_range=(1,2))
temp=train_x+dev_x
%time train_test_x_tfidf=vectorizer.fit_transform(temp)
train_x_tfidf=train_test_x_tfidf[0:len(train_x)]
dev_x_tfidf=train_test_x_tfidf[len(train_x):]
train_x_tfidf,dev_x_tfidf

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
CPU times: user 19min 53s, sys: 8.53 s, total: 20min 2s
Wall time: 20min


(<1000000x1000000 sparse matrix of type '<class 'numpy.float64'>'
 	with 146157012 stored elements in Compressed Sparse Row format>,
 <200000x1000000 sparse matrix of type '<class 'numpy.float64'>'
 	with 29252923 stored elements in Compressed Sparse Row format>)

## COunt Vectoriser

In [16]:
# vectorizer_count = CountVectorizer(max_features=1000000,tokenizer = lemmatize_tokenize,analyzer='word',ngram_range=(1,2),max_df=0.85)
# temp=train_x+dev_x
# %time train_test_x_count=vectorizer_count.fit_transform(temp)
# train_x_count=train_test_x_count[0:len(train_x)]
# dev_x_count=train_test_x_count[len(train_x):]
# train_x_count,dev_x_count

## Save TFIDF

In [17]:
# dump_pickle(train_x_tfidf,'train_tfidf.pkl')
# dump_pickle(dev_x_tfidf,'dev_tfidf.pkl')
# dump_pickle(train_test_x_tfidf,'train_dev_tfidf.pkl')

# Save Count

In [18]:
# dump_pickle(train_x_count,'train_count.pkl')
# dump_pickle(dev_x_count,'dev_count.pkl')
# dump_pickle(train_test_x_count,'train_dev_count.pkl')

## Load TF-IDF

In [19]:
train_x_tfidf=None
dev_x_tfidf=None
train_x_tfidf=load_pickle('train_tfidf.pkl')
dev_x_tfidf=load_pickle('dev_tfidf.pkl')
print(train_x_tfidf.shape,dev_x_tfidf.shape)

(1000000, 1000000) (200000, 1000000)


In [20]:
# train_x_count=None
# dev_x_count=None
# train_x_count=load_pickle('train_tfidf.pkl')
# dev_x_count=load_pickle('dev_tfidf.pkl')
# print(train_x_count.shape,dev_x_count.shape)

In [40]:
# Helper:https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/

In [21]:
def get_accuracy(gold,pred):
    length=len(gold)
    return np.sum(gold==pred)*1.0/length
def get_Fscore(gold,pred):
    return (f1_score(gold, pred, average='macro'),f1_score(gold, pred, average='micro'))      

## General Fitter

In [22]:
def GeneralFitter(X,Y,obj):
    %time obj.fit(X,Y)
    return obj

## Models

In [23]:
# nb=MultinomialNB()
# clf = LinearSVC(random_state=0, tol=1e-5,verbose = 5,max_iter=1000,class_weight='balanced')
# log_reg = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial', max_iter=100)
# reg = LinearRegression()

In [24]:
def get_stats(X,Y,dev_X,dev_Y,obj):
    pred_X=obj.predict(X)
    pred_dev=obj.predict(dev_X)
    print(get_accuracy(Y,pred_X))
    print(get_accuracy(dev_y,pred_dev))
    print(get_Fscore(Y,pred_X))
    print(get_Fscore(dev_y,pred_dev))
    print(mean_squared_error(dev_y,pred_dev))
    print(confusion_matrix(Y, pred_X))
    print(confusion_matrix(dev_y, pred_dev))

In [25]:
def get_reg_stats(X,Y,dev_X,dev_Y,obj):
    pred_X=obj.predict(X)
    pred_dev=obj.predict(dev_X)
    pred_X[pred_X[:]<1]=1
    pred_X[pred_X[:]>5]=5
    pred_dev[pred_dev[:]<1]=1
    pred_dev[pred_dev[:]>5]=5
    
    print(mean_squared_error(Y,pred_X))    
    print(mean_squared_error(dev_y,pred_dev))

## Feature filtering

In [26]:
# selection_obj_tfidf=SelectKBest(f_regression, k=100000)
# filtered_train_tfidf = selection_obj_tfidf.fit_transform(train_x_tfidf, train_y)
# filtered_dev_tfidf=selection_obj_tfidf.transform(dev_x_tfidf)
# filtered_train_tfidf,filtered_dev_tfidf

In [27]:
print(filtered_dev_tfidf.shape)

NameError: name 'filtered_dev_tfidf' is not defined

## Linear Reg

In [48]:
# reg = LinearRegression(n_jobs=-1)
# GeneralFitter(filtered_train_tfidf,train_y,reg)

In [49]:
# get_reg_stats(filtered_train_tfidf,train_y,filtered_dev_tfidf,dev_y,reg)

In [59]:
ridge=linear_model.Ridge(alpha=2)
GeneralFitter(train_x_tfidf,train_y,ridge)
get_reg_stats(train_x_tfidf,train_y,dev_tfidf,dev_y,ridge)

CPU times: user 28.3 s, sys: 392 ms, total: 28.7 s
Wall time: 27.1 s
0.508253193025252
0.55289006341852


In [60]:
ridge=linear_model.Ridge(alpha=4)
GeneralFitter(filtered_train_tfidf,train_y,ridge)
get_reg_stats(filtered_train_tfidf,train_y,filtered_dev_tfidf,dev_y,ridge)

CPU times: user 25.2 s, sys: 296 ms, total: 25.5 s
Wall time: 23.8 s
0.533518284326126
0.5661441872430603


## Logistic

In [21]:
log_reg_tf=LogisticRegression(random_state=0, solver='sag',multi_class='multinomial', max_iter=100,n_jobs=-1)
GeneralFitter(train_x_tfidf,train_y,log_reg_tf)
get_stats(train_x_tfidf,train_y,dev_x_tfidf,dev_y,log_reg_tf)

CPU times: user 4min 40s, sys: 1.27 s, total: 4min 41s
Wall time: 4min 37s
0.84246
0.701565
(0.8030076933116147, 0.84246)
(0.6132428734442106, 0.701565)
0.563635
[[140570   4706   2030    862   1992]
 [ 16092  51477   7590   3371   2740]
 [  6165   4151  74912  17180   8173]
 [  1985   1029   5803 159573  51238]
 [  1607    265   1426  19135 415928]]
[[25430  2567   689   313   790]
 [ 5385  5698  3504   881   671]
 [ 1714  2431  9421  6587  1963]
 [  558   364  3144 22678 17262]
 [  647   108   484  9625 77086]]


In [None]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit


In [None]:
model=linear_model.Ridge(alpha=2)
cv = ShuffleSplit(n_splits=2, test_size=0.25, random_state=0)
cv_results = cross_validate(model, X, y, cv=cv,return_train_score=True,return_estimator=True,n_jobs=-1,scoring='neg_mean_squared_error')