In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

## Loading TSV file

In [3]:
df_amazon = pd.read_csv ("/content/amazon_alexa.tsv", sep="\t")# Top 5 records
df_amazon.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


## View data information

In [4]:
df_amazon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3150 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


In [5]:
df_amazon.feedback.value_counts()

1    2893
0     257
Name: feedback, dtype: int64

In [6]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# stopwords
stop_words = STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = nlp(sentence)
    
    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens]
    
    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [7]:
class predictors(TransformerMixin):
    
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self
    
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [8]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

In [9]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [10]:
from sklearn.model_selection import train_test_split

X = df_amazon['verified_reviews'] # the features we want to analyze
ylabels = df_amazon['feedback'] 

# the labels, or answers, we want to test against
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

## 1. Logistic Regression Classifier

In [11]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7fe1a40f8890>),
                ('vectorizer',
                 CountVectorizer(tokenizer=<function spacy_tokenizer at 0x7fe2292c3ef0>)),
                ('classifier', LogisticRegression())])

In [13]:
from sklearn.metrics import accuracy_score,classification_report

In [16]:
pred_train=pipe.predict(X_train)
print(classification_report(y_train,pred_train))

              precision    recall  f1-score   support

           0       1.00      0.71      0.83       182
           1       0.97      1.00      0.99      2023

    accuracy                           0.98      2205
   macro avg       0.99      0.85      0.91      2205
weighted avg       0.98      0.98      0.97      2205



## 2. SGD

In [19]:

from sklearn.linear_model import SGDClassifier

classifier = SGDClassifier()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

pred_train=pipe.predict(X_train)
print(classification_report(y_train,pred_train))



              precision    recall  f1-score   support

           0       1.00      0.91      0.95       182
           1       0.99      1.00      1.00      2023

    accuracy                           0.99      2205
   macro avg       1.00      0.96      0.98      2205
weighted avg       0.99      0.99      0.99      2205



## 3. Decision Tree Classifier

In [20]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

pred_train=pipe.predict(X_train)
print(classification_report(y_train,pred_train))

              precision    recall  f1-score   support

           0       1.00      0.91      0.95       182
           1       0.99      1.00      1.00      2023

    accuracy                           0.99      2205
   macro avg       1.00      0.96      0.98      2205
weighted avg       0.99      0.99      0.99      2205



## 4. KNeighbors Classifier

In [21]:
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

pred_train=pipe.predict(X_train)
print(classification_report(y_train,pred_train))

              precision    recall  f1-score   support

           0       0.65      0.12      0.20       182
           1       0.93      0.99      0.96      2023

    accuracy                           0.92      2205
   macro avg       0.79      0.56      0.58      2205
weighted avg       0.90      0.92      0.90      2205



## 5. Random Forest Classifier

In [22]:

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

pred_train=pipe.predict(X_train)
print(classification_report(y_train,pred_train))

              precision    recall  f1-score   support

           0       1.00      0.91      0.95       182
           1       0.99      1.00      1.00      2023

    accuracy                           0.99      2205
   macro avg       1.00      0.96      0.98      2205
weighted avg       0.99      0.99      0.99      2205



## 6. AdaBoost Classifier

In [23]:

from sklearn.ensemble import AdaBoostClassifier

classifier = AdaBoostClassifier()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

pred_train=pipe.predict(X_train)
print(classification_report(y_train,pred_train))

              precision    recall  f1-score   support

           0       0.86      0.45      0.59       182
           1       0.95      0.99      0.97      2023

    accuracy                           0.95      2205
   macro avg       0.91      0.72      0.78      2205
weighted avg       0.95      0.95      0.94      2205



## 7. ExtraTreesClassifier

In [24]:
from sklearn.ensemble import ExtraTreesClassifier

classifier = ExtraTreesClassifier()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

pred_train=pipe.predict(X_train)
print(classification_report(y_train,pred_train))

              precision    recall  f1-score   support

           0       1.00      0.91      0.95       182
           1       0.99      1.00      1.00      2023

    accuracy                           0.99      2205
   macro avg       1.00      0.96      0.98      2205
weighted avg       0.99      0.99      0.99      2205



## 8. GradientBoosting Classifier

In [25]:
from sklearn.ensemble import GradientBoostingClassifier

classifier = GradientBoostingClassifier()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

pred_train=pipe.predict(X_train)
print(classification_report(y_train,pred_train))

              precision    recall  f1-score   support

           0       1.00      0.44      0.61       182
           1       0.95      1.00      0.98      2023

    accuracy                           0.95      2205
   macro avg       0.98      0.72      0.79      2205
weighted avg       0.96      0.95      0.95      2205



## 9. XGboost

In [26]:
from xgboost import XGBClassifier
classifier = XGBClassifier()
# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

pred_train=pipe.predict(X_train)
print(classification_report(y_train,pred_train))

              precision    recall  f1-score   support

           0       1.00      0.25      0.40       182
           1       0.94      1.00      0.97      2023

    accuracy                           0.94      2205
   macro avg       0.97      0.62      0.68      2205
weighted avg       0.94      0.94      0.92      2205



## 10. SVC

In [27]:
from sklearn.svm import SVC
classifier = SVC()
# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

pred_train=pipe.predict(X_train)
print(classification_report(y_train,pred_train))

              precision    recall  f1-score   support

           0       1.00      0.57      0.73       182
           1       0.96      1.00      0.98      2023

    accuracy                           0.96      2205
   macro avg       0.98      0.79      0.85      2205
weighted avg       0.97      0.96      0.96      2205

