### Import required libraries

In [1]:
import pandas as pd
import numpy as np
from math import floor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from time import time
from sklearn.metrics import accuracy_score, f1_score, classification_report
import re
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
import gensim
import time

unable to import 'smart_open.gcs', disabling that module


You are given the reviews dataset. These are 194439 amazon reviews for cell phones and accessories taken from https://jmcauley.ucsd.edu/data/amazon/ Use the “reviewText” and “overall” fields from this file. The goal is to predict the rating given the review by modeling it as a multi-class classification problem.

In [2]:
data = pd.read_json("Cell_Phones_and_Accessories_5.json", lines=True)

In [3]:
data.shape

(194439, 9)

The dataset has 194439 rows and 9 columns

In [4]:
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


### Use the “reviewText” and “overall” fields from this file

In [5]:
data['reviewText'] = data['reviewText'] + data['summary'] 

In [6]:
data = data[['reviewText', 'overall']]

In [7]:
data.head()

Unnamed: 0,reviewText,overall
0,They look good and stick good! I just don't li...,4
1,These stickers work like the review says they ...,5
2,These are awesome and make my phone look so st...,5
3,Item arrived in great time and was in perfect ...,4
4,"awesome! stays on, and looks great. can be use...",5


In [8]:
#from nltk.stem import WordNetLemmatizer
  
stop_words = set(nltk.corpus.stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()
#stemmer = nltk.stem.porter.PorterStemmer()

def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and len(token) > 2]
    return tokens

data['processed_text'] = data['reviewText'].apply(preprocess_text)

In [9]:
data.head()

Unnamed: 0,reviewText,overall,processed_text
0,They look good and stick good! I just don't li...,4,"[look, good, stick, good, n't, like, rounded, ..."
1,These stickers work like the review says they ...,5,"[sticker, work, like, review, say, stick, grea..."
2,These are awesome and make my phone look so st...,5,"[awesome, make, phone, look, stylish, used, on..."
3,Item arrived in great time and was in perfect ...,4,"[item, arrived, great, time, perfect, conditio..."
4,"awesome! stays on, and looks great. can be use...",5,"[awesome, stay, look, great, used, multiple, a..."


In [10]:
#data.overall = data.overall.astype(object)

In [11]:
#data.describe(include='all')

In [12]:
#data.overall.value_counts()

### Check for Null reviews

Looks like there are None

In [21]:
data.isna().sum()

reviewText        0
overall           0
processed_text    0
dtype: int64

In [22]:
tokens =[]
for text in data.processed_text:
    tokens.append(text)

In [23]:
data.overall.value_counts()

5    108664
4     39993
3     21439
1     13279
2     11064
Name: overall, dtype: int64

## 1.	Take the first 70% dataset for train, next 10% for validation/development, and remaining 20% for test. 

In [24]:
train = data[0:floor(0.7*len(data))]
validate = data[floor(0.7*len(data)):floor(0.8*len(data))]
test = data[floor(0.8*len(data)):]

print(data.shape, train.shape, validate.shape, test.shape)

del data

(194439, 3) (136107, 3) (19444, 3) (38888, 3)


In [25]:
print(train.overall.value_counts())
print(validate.overall.value_counts())
print(test.overall.value_counts())

5    73942
4    27955
3    15843
1    10141
2     8226
Name: overall, dtype: int64
5    11035
4     4072
3     2123
1     1175
2     1039
Name: overall, dtype: int64
5    23687
4     7966
3     3473
1     1963
2     1799
Name: overall, dtype: int64


In [26]:
train1 = train[train.overall == 5]
train1 = train1.sample(frac=0.45, replace=True, random_state=1)

In [27]:
train = train[train.overall != 5]
train = pd.concat([train, train1], ignore_index=True)
train.shape

(95439, 3)

In [28]:
#print(train)
del train1

                                              reviewText  overall  \
0      They look good and stick good! I just don't li...        4   
1      Item arrived in great time and was in perfect ...        4   
2      These make using the home button easy. My daug...        3   
3      it worked for the first week then it only char...        1   
4      It worked great for the first couple of weeks ...        1   
...                                                  ...      ...   
95434  I have a long commute to work and wish not to ...        5   
95435  I am super happy with this product.  I bought ...        5   
95436  It fits my phone perfectly and It installed fl...        5   
95437  I love my HTC  Evo, but battery life is an iss...        5   
95438  It is not bulky and the way it stays on the ph...        5   

                                          processed_text  
0      [look, good, stick, good, n't, like, rounded, ...  
1      [item, arrived, great, time, perfect, conditio

## 2.	Traditional machine learning methods

a.	Design some good linguistic features. You can start with basic TFIDF features. Use these classifiers: J48 decision trees, SVMs with linear/RBF kernel, logistic regression, xgboost, random forests and report accuracy on test set.

In [21]:
print("Extracting tf-idf features...")
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=1000, max_df=0.95, min_df=2, stop_words='english') #USE HELP TO SEE WHAT EACH DOES
t0 = time.time()
tfidf_train = tfidf_vectorizer.fit_transform(train['reviewText'])
tfidf_train = pd.DataFrame(tfidf_train.todense(), columns=tfidf_vectorizer.get_feature_names())
print("done in %0.3fs." % (time.time() - t0))

Extracting tf-idf features...
done in 30.619s.


In [22]:
tfidf_validate = pd.DataFrame(tfidf_vectorizer.transform(validate['reviewText']).todense(), columns=tfidf_vectorizer.get_feature_names())
tfidf_test = pd.DataFrame(tfidf_vectorizer.transform(test['reviewText']).todense(), columns=tfidf_vectorizer.get_feature_names())

Decision Tree Classifier

In [23]:
del train1

In [28]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0)
clf.fit(tfidf_train, train.overall)

print("TRAIN: \n", classification_report(train.overall, clf.predict(tfidf_train)))
print("VALIDATE: \n", classification_report(validate.overall, clf.predict(tfidf_validate)))
print("TEST: \n", classification_report(test.overall, clf.predict(tfidf_test)))

# print("TRAIN F1_Score: ", f1_score(train.overall, clf.predict(tfidf_train,), average='macro'))
# print("VALIDATE F1_Score:", f1_score(validate.overall, clf.predict(tfidf_validate), average='macro'))
# print("TEST F1_Score: ", f1_score(test.overall, clf.predict(tfidf_test), average='macro'))

TRAIN: 
               precision    recall  f1-score   support

           1       1.00      1.00      1.00     10141
           2       1.00      1.00      1.00      8226
           3       1.00      1.00      1.00     15843
           4       1.00      1.00      1.00     27955
           5       1.00      1.00      1.00     33274

    accuracy                           1.00     95439
   macro avg       1.00      1.00      1.00     95439
weighted avg       1.00      1.00      1.00     95439

VALIDATE: 
               precision    recall  f1-score   support

           1       0.31      0.38      0.34      1175
           2       0.11      0.14      0.12      1039
           3       0.21      0.28      0.24      2123
           4       0.27      0.44      0.34      4072
           5       0.75      0.50      0.60     11035

    accuracy                           0.44     19444
   macro avg       0.33      0.35      0.33     19444
weighted avg       0.53      0.44      0.47     19444

T

SVM Classifier

In [29]:
from sklearn.svm import SVC, LinearSVC

clf = LinearSVC()
clf.fit(tfidf_train, train.overall)

print("TRAIN: \n", classification_report(train.overall, clf.predict(tfidf_train)))
print("VALIDATE: \n", classification_report(validate.overall, clf.predict(tfidf_validate)))
print("TEST: \n", classification_report(test.overall, clf.predict(tfidf_test)))

TRAIN: 
               precision    recall  f1-score   support

           1       0.57      0.71      0.63     10141
           2       0.44      0.07      0.12      8226
           3       0.51      0.32      0.39     15843
           4       0.50      0.52      0.51     27955
           5       0.62      0.80      0.70     33274

    accuracy                           0.57     95439
   macro avg       0.53      0.48      0.47     95439
weighted avg       0.55      0.57      0.54     95439

VALIDATE: 
               precision    recall  f1-score   support

           1       0.50      0.66      0.57      1175
           2       0.36      0.06      0.11      1039
           3       0.45      0.28      0.34      2123
           4       0.40      0.53      0.45      4072
           5       0.79      0.79      0.79     11035

    accuracy                           0.63     19444
   macro avg       0.50      0.46      0.45     19444
weighted avg       0.63      0.63      0.62     19444

T

Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(tfidf_train, train.overall)

print("TRAIN: \n", classification_report(train.overall, clf.predict(tfidf_train)))
print("VALIDATE: \n", classification_report(validate.overall, clf.predict(tfidf_validate)))
print("TEST: \n", classification_report(test.overall, clf.predict(tfidf_test)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.60      0.67      0.64     10141
           2       0.42      0.16      0.23      8226
           3       0.48      0.39      0.43     15843
           4       0.51      0.54      0.53     27955
           5       0.65      0.76      0.70     33274

    accuracy                           0.57     95439
   macro avg       0.53      0.50      0.50     95439
weighted avg       0.56      0.57      0.56     95439

VALIDATE: 
               precision    recall  f1-score   support

           1       0.54      0.62      0.57      1175
           2       0.33      0.13      0.19      1039
           3       0.41      0.35      0.38      2123
           4       0.39      0.54      0.46      4072
           5       0.81      0.75      0.78     11035

    accuracy                           0.62     19444
   macro avg       0.50      0.48      0.48     19444
weighted avg       0.64      0.62      0.63     19444

T

In [31]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(tfidf_train, train.overall)

print("TRAIN: \n", classification_report(train.overall, clf.predict(tfidf_train)))
print("VALIDATE: \n", classification_report(validate.overall, clf.predict(tfidf_validate)))
print("TEST: \n", classification_report(test.overall, clf.predict(tfidf_test)))

TRAIN: 
               precision    recall  f1-score   support

           1       1.00      1.00      1.00     10141
           2       1.00      1.00      1.00      8226
           3       1.00      1.00      1.00     15843
           4       1.00      1.00      1.00     27955
           5       1.00      1.00      1.00     33274

    accuracy                           1.00     95439
   macro avg       1.00      1.00      1.00     95439
weighted avg       1.00      1.00      1.00     95439

VALIDATE: 
               precision    recall  f1-score   support

           1       0.50      0.58      0.54      1175
           2       0.39      0.04      0.07      1039
           3       0.42      0.24      0.30      2123
           4       0.34      0.63      0.44      4072
           5       0.81      0.68      0.74     11035

    accuracy                           0.58     19444
   macro avg       0.49      0.43      0.42     19444
weighted avg       0.63      0.58      0.58     19444

T

In [32]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(train.overall)

In [33]:
import xgboost as xgb

clf = xgb.XGBClassifier(objective="multi:softmax", random_state=42)
clf.fit(tfidf_train, y_train)

train_pred = clf.predict(tfidf_train)
train_pred = le.inverse_transform(train_pred)

test_pred = clf.predict(tfidf_test)
test_pred = le.inverse_transform(test_pred)

validate_pred = clf.predict(tfidf_validate)
validate_pred = le.inverse_transform(validate_pred)

print("TRAIN: \n", classification_report(train.overall, train_pred))
print("VALIDATE: \n", classification_report(validate.overall, validate_pred))
print("TEST: \n", classification_report(test.overall, test_pred))

TRAIN: 
               precision    recall  f1-score   support

           1       0.70      0.72      0.71     10141
           2       0.81      0.31      0.44      8226
           3       0.68      0.45      0.54     15843
           4       0.59      0.64      0.62     27955
           5       0.68      0.84      0.75     33274

    accuracy                           0.66     95439
   macro avg       0.69      0.59      0.61     95439
weighted avg       0.67      0.66      0.65     95439

VALIDATE: 
               precision    recall  f1-score   support

           1       0.52      0.56      0.54      1175
           2       0.34      0.10      0.15      1039
           3       0.43      0.27      0.33      2123
           4       0.37      0.57      0.45      4072
           5       0.79      0.74      0.77     11035

    accuracy                           0.61     19444
   macro avg       0.49      0.45      0.45     19444
weighted avg       0.63      0.61      0.61     19444

T

In [None]:
clf = SVC(kernel = 'rbf')
clf.fit(tfidf_train, train.overall)

print("TRAIN: \n", classification_report(train.overall, clf.predict(tfidf_train)))
print("VALIDATE: \n", classification_report(validate.overall, clf.predict(tfidf_validate)))
print("TEST: \n", classification_report(test.overall, clf.predict(tfidf_test)))

### 3.	Average of word embeddings
a.	Learn word2vec models using gensim on this dataset with the following settings: (a) Size=100, 200, 300, (b) Window=3,7, (c) Min_count=2, 5. Use skipgram.

i.	This will give 12 word2vec models. For each of these models, for each review take average word embeddings and train a logistic regression. Report accuracy on test set. 


In [29]:
%%time
vector_size = [100, 200, 300]
window_size = [3, 7]
min_count = [2, 5]
#model = gensim.models.Word2Vec(data['processed_text'], size=vector_size, window=window_size, min_count=min_count, workers=4,sg=1)

Wall time: 0 ns


In [30]:
def get_embeddings(text):
    embeddings = []
    for word in text:
        try:
            embeddings.append(model.wv[word])
        except KeyError:
            pass
    embeddings = np.mean(embeddings, axis = 0)
    embeddings = embeddings.tolist()
    return embeddings

In [31]:
#data['embeddings'] = data['processed_text'].apply(get_embeddings)

In [32]:
for vector in vector_size:
    for window in window_size:
        for count in min_count:
            start = time.time()
            print("WORD EMBEDDING ----- vector_size - ", vector, " window_size - ", window, " min_count - ", count)
            model = gensim.models.Word2Vec(data['processed_text'], size=vector, window=window, min_count=count, workers=4,sg=1)
            data['embeddings'] = data['processed_text'].apply(get_embeddings)
            df3 = data.embeddings.apply(pd.Series)
            df3 = df3.fillna(0)
            df3['overall'] = data.overall
            
            train = df3[0:floor(0.7*len(df3))]
            validate = df3[floor(0.7*len(df3)):floor(0.8*len(df3))]
            test = df3[floor(0.8*len(df3)):]
            
            clf = LogisticRegression()
            clf.fit(train.iloc[:, :vector], train.overall)

            print("TRAIN: \n", classification_report(train.overall, clf.predict(train.iloc[:, :vector])))
            print("VALIDATE: \n", classification_report(validate.overall, clf.predict(validate.iloc[:, :vector])))
            print("TEST: \n", classification_report(test.overall, clf.predict(test.iloc[:, :vector])))
            
            stop = time.time()
            duration = stop - start
            
            print("Took %f seconds for execution\n ============================================================="%duration)

WORD EMBEDDING ----- vector_size -  100  window_size -  3  min_count -  2


  out=out, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.55      0.55      0.55     10141
           2       0.32      0.05      0.08      8226
           3       0.36      0.24      0.29     15843
           4       0.41      0.19      0.26     27955
           5       0.67      0.93      0.78     73942

    accuracy                           0.61    136107
   macro avg       0.46      0.39      0.39    136107
weighted avg       0.55      0.61      0.56    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.52      0.54      0.53      1175
           2       0.39      0.06      0.10      1039
           3       0.38      0.25      0.30      2123
           4       0.43      0.20      0.27      4072
           5       0.69      0.93      0.79     11035

    accuracy                           0.63     19444
   macro avg       0.48      0.39      0.40     19444
weighted avg       0.58      0.63      0.58     19444

T

  out=out, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.54      0.55      0.55     10141
           2       0.33      0.05      0.09      8226
           3       0.36      0.24      0.29     15843
           4       0.41      0.19      0.26     27955
           5       0.68      0.92      0.78     73942

    accuracy                           0.61    136107
   macro avg       0.46      0.39      0.39    136107
weighted avg       0.55      0.61      0.56    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.52      0.54      0.53      1175
           2       0.40      0.07      0.11      1039
           3       0.38      0.23      0.29      2123
           4       0.42      0.21      0.28      4072
           5       0.70      0.93      0.80     11035

    accuracy                           0.63     19444
   macro avg       0.48      0.40      0.40     19444
weighted avg       0.58      0.63      0.58     19444

T

  out=out, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.56      0.57      0.56     10141
           2       0.35      0.06      0.10      8226
           3       0.37      0.26      0.31     15843
           4       0.42      0.20      0.27     27955
           5       0.68      0.93      0.79     73942

    accuracy                           0.62    136107
   macro avg       0.47      0.40      0.40    136107
weighted avg       0.56      0.62      0.56    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.53      0.54      0.53      1175
           2       0.35      0.06      0.10      1039
           3       0.39      0.26      0.31      2123
           4       0.43      0.21      0.28      4072
           5       0.70      0.93      0.80     11035

    accuracy                           0.64     19444
   macro avg       0.48      0.40      0.40     19444
weighted avg       0.58      0.64      0.58     19444

T

  out=out, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.55      0.56      0.56     10141
           2       0.35      0.06      0.10      8226
           3       0.37      0.26      0.30     15843
           4       0.42      0.20      0.27     27955
           5       0.68      0.92      0.79     73942

    accuracy                           0.62    136107
   macro avg       0.47      0.40      0.40    136107
weighted avg       0.56      0.62      0.57    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.53      0.54      0.54      1175
           2       0.40      0.07      0.12      1039
           3       0.39      0.25      0.30      2123
           4       0.43      0.22      0.29      4072
           5       0.70      0.93      0.80     11035

    accuracy                           0.64     19444
   macro avg       0.49      0.40      0.41     19444
weighted avg       0.58      0.64      0.59     19444

T

  out=out, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.56      0.57      0.57     10141
           2       0.35      0.08      0.12      8226
           3       0.39      0.26      0.31     15843
           4       0.43      0.22      0.29     27955
           5       0.69      0.92      0.79     73942

    accuracy                           0.62    136107
   macro avg       0.48      0.41      0.41    136107
weighted avg       0.57      0.62      0.57    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.53      0.55      0.54      1175
           2       0.38      0.07      0.12      1039
           3       0.38      0.24      0.30      2123
           4       0.43      0.22      0.29      4072
           5       0.70      0.93      0.80     11035

    accuracy                           0.64     19444
   macro avg       0.48      0.40      0.41     19444
weighted avg       0.58      0.64      0.59     19444

T

  out=out, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.56      0.57      0.57     10141
           2       0.35      0.07      0.12      8226
           3       0.39      0.26      0.31     15843
           4       0.43      0.21      0.28     27955
           5       0.69      0.92      0.79     73942

    accuracy                           0.62    136107
   macro avg       0.48      0.41      0.41    136107
weighted avg       0.57      0.62      0.57    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.54      0.55      0.54      1175
           2       0.34      0.06      0.10      1039
           3       0.39      0.24      0.30      2123
           4       0.42      0.23      0.30      4072
           5       0.70      0.92      0.80     11035

    accuracy                           0.64     19444
   macro avg       0.48      0.40      0.41     19444
weighted avg       0.58      0.64      0.59     19444

T

  out=out, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.57      0.57      0.57     10141
           2       0.36      0.07      0.12      8226
           3       0.39      0.27      0.32     15843
           4       0.43      0.22      0.29     27955
           5       0.69      0.92      0.79     73942

    accuracy                           0.63    136107
   macro avg       0.49      0.41      0.42    136107
weighted avg       0.57      0.63      0.58    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.55      0.54      0.55      1175
           2       0.37      0.07      0.12      1039
           3       0.39      0.26      0.31      2123
           4       0.43      0.24      0.31      4072
           5       0.71      0.93      0.80     11035

    accuracy                           0.64     19444
   macro avg       0.49      0.41      0.42     19444
weighted avg       0.59      0.64      0.59     19444

T

  out=out, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.57      0.58      0.57     10141
           2       0.35      0.07      0.12      8226
           3       0.39      0.27      0.32     15843
           4       0.43      0.22      0.29     27955
           5       0.69      0.92      0.79     73942

    accuracy                           0.63    136107
   macro avg       0.49      0.41      0.42    136107
weighted avg       0.57      0.63      0.58    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.55      0.55      0.55      1175
           2       0.37      0.07      0.12      1039
           3       0.40      0.26      0.31      2123
           4       0.43      0.23      0.30      4072
           5       0.70      0.93      0.80     11035

    accuracy                           0.64     19444
   macro avg       0.49      0.41      0.42     19444
weighted avg       0.59      0.64      0.59     19444

T

  out=out, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.57      0.58      0.57     10141
           2       0.35      0.08      0.13      8226
           3       0.39      0.28      0.33     15843
           4       0.43      0.22      0.29     27955
           5       0.69      0.92      0.79     73942

    accuracy                           0.63    136107
   macro avg       0.49      0.42      0.42    136107
weighted avg       0.57      0.63      0.58    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.54      0.55      0.55      1175
           2       0.36      0.07      0.12      1039
           3       0.39      0.27      0.32      2123
           4       0.44      0.23      0.30      4072
           5       0.71      0.92      0.80     11035

    accuracy                           0.64     19444
   macro avg       0.49      0.41      0.42     19444
weighted avg       0.59      0.64      0.59     19444

T

  out=out, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.57      0.57      0.57     10141
           2       0.36      0.09      0.14      8226
           3       0.40      0.27      0.32     15843
           4       0.43      0.21      0.29     27955
           5       0.69      0.93      0.79     73942

    accuracy                           0.63    136107
   macro avg       0.49      0.41      0.42    136107
weighted avg       0.57      0.63      0.58    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.55      0.55      0.55      1175
           2       0.37      0.08      0.13      1039
           3       0.40      0.26      0.32      2123
           4       0.44      0.23      0.30      4072
           5       0.71      0.93      0.80     11035

    accuracy                           0.64     19444
   macro avg       0.49      0.41      0.42     19444
weighted avg       0.59      0.64      0.59     19444

T

  out=out, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.58      0.58      0.58     10141
           2       0.36      0.08      0.13      8226
           3       0.40      0.29      0.33     15843
           4       0.44      0.23      0.30     27955
           5       0.70      0.92      0.79     73942

    accuracy                           0.63    136107
   macro avg       0.49      0.42      0.43    136107
weighted avg       0.58      0.63      0.58    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.56      0.56      0.56      1175
           2       0.37      0.08      0.13      1039
           3       0.39      0.26      0.31      2123
           4       0.44      0.24      0.31      4072
           5       0.71      0.93      0.80     11035

    accuracy                           0.64     19444
   macro avg       0.49      0.41      0.42     19444
weighted avg       0.59      0.64      0.60     19444

T

  out=out, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.58      0.58      0.58     10141
           2       0.36      0.10      0.15      8226
           3       0.40      0.28      0.33     15843
           4       0.43      0.24      0.31     27955
           5       0.70      0.92      0.79     73942

    accuracy                           0.63    136107
   macro avg       0.49      0.42      0.43    136107
weighted avg       0.58      0.63      0.58    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.56      0.57      0.56      1175
           2       0.37      0.09      0.14      1039
           3       0.40      0.25      0.31      2123
           4       0.43      0.25      0.32      4072
           5       0.71      0.92      0.80     11035

    accuracy                           0.64     19444
   macro avg       0.49      0.42      0.43     19444
weighted avg       0.59      0.64      0.60     19444

T

b.	Use the already available google word2vec model. For each review take average word embeddings and train a logistic regression. Report accuracy on test set. 

c.	Use the already available glove models – 50D, 100D and 200D. For each review take average word embeddings and train a logistic regression. Report accuracy on test set for each of the three sized embeddings. 

In [11]:
def get_glove_embeddings(text):
    embeddings = []
    for word in text:
        #print(word)
        try:
            #print(embeddings_index.get(word))
            embeddings.append(embeddings_index.get(word))
        except KeyError:
            pass
    
    embeddings = [x for x in embeddings if x is not None]
    embeddings = np.mean(embeddings, axis = 0)
    embeddings = embeddings.tolist()
    return embeddings

### Glove 50D

In [12]:
import numpy as np
embeddings_index = {}
f = open('./Glove_embedding/glove.6B/glove.6B.50d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [13]:
data['embeddings'] = data['processed_text'].apply(get_glove_embeddings)

  out=out, **kwargs)


In [17]:
df3 = data.embeddings.apply(pd.Series)
df3 = df3.fillna(0)
df3['overall'] = data.overall

train = df3[0:floor(0.7*len(df3))]
validate = df3[floor(0.7*len(df3)):floor(0.8*len(df3))]
test = df3[floor(0.8*len(df3)):]

clf = LogisticRegression()
clf.fit(train.iloc[:, :50], train.overall)

print("TRAIN: \n", classification_report(train.overall, clf.predict(train.iloc[:, :50])))
print("VALIDATE: \n", classification_report(validate.overall, clf.predict(validate.iloc[:, :50])))
print("TEST: \n", classification_report(test.overall, clf.predict(test.iloc[:, :50])))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.44      0.27      0.33     10141
           2       0.24      0.01      0.01      8226
           3       0.32      0.07      0.12     15843
           4       0.33      0.05      0.08     27955
           5       0.58      0.96      0.72     73942

    accuracy                           0.56    136107
   macro avg       0.38      0.27      0.25    136107
weighted avg       0.47      0.56      0.45    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.42      0.26      0.32      1175
           2       0.17      0.00      0.01      1039
           3       0.30      0.06      0.10      2123
           4       0.34      0.05      0.09      4072
           5       0.60      0.97      0.74     11035

    accuracy                           0.58     19444
   macro avg       0.37      0.27      0.25     19444
weighted avg       0.48      0.58      0.47     19444

T

### Glove 100D

In [18]:
import numpy as np
embeddings_index = {}
f = open('./Glove_embedding/glove.6B/glove.6B.100d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [19]:
data['embeddings'] = data['processed_text'].apply(get_glove_embeddings)

  out=out, **kwargs)


In [21]:
df3 = data.embeddings.apply(pd.Series)
df3 = df3.fillna(0)
df3['overall'] = data.overall

train = df3[0:floor(0.7*len(df3))]
validate = df3[floor(0.7*len(df3)):floor(0.8*len(df3))]
test = df3[floor(0.8*len(df3)):]

clf = LogisticRegression()
clf.fit(train.iloc[:, :100], train.overall)

print("TRAIN: \n", classification_report(train.overall, clf.predict(train.iloc[:, :100])))
print("VALIDATE: \n", classification_report(validate.overall, clf.predict(validate.iloc[:, :100])))
print("TEST: \n", classification_report(test.overall, clf.predict(test.iloc[:, :100])))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.48      0.36      0.41     10141
           2       0.31      0.01      0.02      8226
           3       0.33      0.12      0.18     15843
           4       0.37      0.10      0.15     27955
           5       0.61      0.94      0.74     73942

    accuracy                           0.57    136107
   macro avg       0.42      0.31      0.30    136107
weighted avg       0.50      0.57      0.49    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.47      0.35      0.40      1175
           2       0.29      0.01      0.02      1039
           3       0.32      0.11      0.16      2123
           4       0.39      0.11      0.17      4072
           5       0.63      0.95      0.75     11035

    accuracy                           0.59     19444
   macro avg       0.42      0.31      0.30     19444
weighted avg       0.52      0.59      0.51     19444

T

### Glove 200D

In [23]:
import numpy as np
embeddings_index = {}
f = open('./Glove_embedding/glove.6B/glove.6B.200d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [24]:
data['embeddings'] = data['processed_text'].apply(get_glove_embeddings)

  out=out, **kwargs)


In [25]:
df3 = data.embeddings.apply(pd.Series)
df3 = df3.fillna(0)
df3['overall'] = data.overall

train = df3[0:floor(0.7*len(df3))]
validate = df3[floor(0.7*len(df3)):floor(0.8*len(df3))]
test = df3[floor(0.8*len(df3)):]

clf = LogisticRegression()
clf.fit(train.iloc[:, :200], train.overall)

print("TRAIN: \n", classification_report(train.overall, clf.predict(train.iloc[:, :200])))
print("VALIDATE: \n", classification_report(validate.overall, clf.predict(validate.iloc[:, :200])))
print("TEST: \n", classification_report(test.overall, clf.predict(test.iloc[:, :200])))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.50      0.44      0.47     10141
           2       0.31      0.03      0.05      8226
           3       0.34      0.17      0.22     15843
           4       0.39      0.12      0.18     27955
           5       0.63      0.93      0.75     73942

    accuracy                           0.59    136107
   macro avg       0.43      0.34      0.33    136107
weighted avg       0.52      0.59      0.51    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.50      0.44      0.47      1175
           2       0.28      0.02      0.04      1039
           3       0.33      0.15      0.21      2123
           4       0.41      0.13      0.20      4072
           5       0.65      0.94      0.77     11035

    accuracy                           0.60     19444
   macro avg       0.43      0.34      0.34     19444
weighted avg       0.53      0.60      0.53     19444

T

### Glove 300D

In [26]:
import numpy as np
embeddings_index = {}
f = open('./Glove_embedding/glove.6B/glove.6B.300d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [27]:
data['embeddings'] = data['processed_text'].apply(get_glove_embeddings)

  out=out, **kwargs)


In [28]:
df3 = data.embeddings.apply(pd.Series)
df3 = df3.fillna(0)
df3['overall'] = data.overall

train = df3[0:floor(0.7*len(df3))]
validate = df3[floor(0.7*len(df3)):floor(0.8*len(df3))]
test = df3[floor(0.8*len(df3)):]

clf = LogisticRegression()
clf.fit(train.iloc[:, :300], train.overall)

print("TRAIN: \n", classification_report(train.overall, clf.predict(train.iloc[:, :300])))
print("VALIDATE: \n", classification_report(validate.overall, clf.predict(validate.iloc[:, :300])))
print("TEST: \n", classification_report(test.overall, clf.predict(test.iloc[:, :300])))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: 
               precision    recall  f1-score   support

           1       0.52      0.47      0.49     10141
           2       0.33      0.04      0.08      8226
           3       0.35      0.18      0.24     15843
           4       0.39      0.14      0.21     27955
           5       0.64      0.93      0.76     73942

    accuracy                           0.59    136107
   macro avg       0.45      0.35      0.35    136107
weighted avg       0.53      0.59      0.52    136107

VALIDATE: 
               precision    recall  f1-score   support

           1       0.53      0.46      0.49      1175
           2       0.30      0.04      0.07      1039
           3       0.33      0.16      0.22      2123
           4       0.42      0.15      0.22      4072
           5       0.66      0.94      0.77     11035

    accuracy                           0.61     19444
   macro avg       0.45      0.35      0.36     19444
weighted avg       0.54      0.61      0.54     19444

T

In [35]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,overall
0,0.108498,-0.029802,-0.04644,0.139854,-0.047337,0.167504,-0.079965,-0.084114,0.106931,0.071614,...,0.005329,-0.103794,-0.195137,0.136608,0.012111,-0.023665,-0.020162,0.0577,0.024251,4
1,0.154162,-0.02761,-0.023393,0.169202,-0.025278,0.175935,-0.067277,0.007563,0.175852,0.072314,...,0.054359,-0.105241,-0.150381,0.174147,0.037654,-0.021837,-0.002788,0.08947,-0.039015,5
2,0.175788,0.024813,-0.058356,0.097685,-0.031785,0.131528,-0.0985,-0.018782,0.191673,0.054044,...,0.053009,-0.012124,-0.085647,0.056777,0.027499,-0.03012,-0.055209,0.011686,-0.08959,5
3,0.154362,-0.066595,0.001443,0.069817,-0.056015,0.107302,-0.146141,-0.005836,0.143573,0.144131,...,0.098437,-0.068592,-0.151323,0.130152,0.021528,-0.061732,-0.022044,0.029363,0.025114,4
4,0.17191,-0.028356,-0.006452,0.101967,-0.086106,0.137953,-0.086672,-0.078432,0.083945,0.032172,...,-0.055787,-0.063381,-0.158382,0.121683,0.035825,-0.067687,-0.012062,0.062826,0.023308,5


In [61]:
from sklearn.preprocessing import OneHotEncoder
 
#Create an instance of One-hot-encoder
enc=OneHotEncoder()

#Passing encoded columns
'''
NOTE: we have converted the enc.fit_transform() method to array because the fit_transform method
of OneHotEncoder returns SpiPy sparse matrix this enables us to save space when we
have huge  number of categorical variables
'''
y_train = enc.fit_transform(train['overall'].values.reshape(-1, 1)).toarray()

In [82]:
y_train

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]])

In [44]:
X_train = train.iloc[:, :300].values