In [2]:
import pandas as pd
import seaborn as sns 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline 


### Linear SVM (Stochastic Gradient Descent)

A stochastic gradient descent classifier is a linear classifier that is optimized by the stochastic gradient descent.

In [7]:
clf_data = pd.read_csv('complete_project_data.csv')
clf_data

Unnamed: 0.1,Unnamed: 0,rating,cleaned_review
0,0,6,helps keep attention class seems make keep gri...
1,1,10,medicine amazing used always last one turn tes...
2,2,4,hyper focused dry mouth straterra put state ma...
3,3,8,adhd taking adderall years take morning someti...
4,4,9,keeps focused however wears sleep till hours l...
...,...,...,...
215026,215026,6,first day took medicine disturbed suddenly not...
215027,215027,5,medication somewhat helpful stiffness pain sti...
215028,215028,1,prescribed mg pain right shoulder collar bone ...
215029,215029,9,helpful frozen shoulder pain side effects


In [8]:
# 
x = clf_data['cleaned_review']
y = clf_data['rating']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.30, random_state=0)

In [9]:
# Classification Pipeline

sgd_clf_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])

In [10]:
# Fitting + Prediction
sgd_clf_pipeline.fit(x_train.values.astype('U'), y_train)
sgd_prediction = sgd_clf_pipeline.predict(x_test.values.astype('U'))

In [13]:
# Accuracy + Classification Report
print('===============Stochastic Gradient Descent===============')
print(classification_report(y_test, sgd_prediction))

              precision    recall  f1-score   support

           1       0.49      0.63      0.55      8779
           2       0.20      0.08      0.12      2741
           3       0.25      0.09      0.13      2558
           4       0.21      0.12      0.15      1996
           5       0.26      0.11      0.16      3257
           6       0.22      0.10      0.14      2513
           7       0.26      0.11      0.15      3838
           8       0.30      0.14      0.19      7683
           9       0.28      0.12      0.17     10911
          10       0.47      0.88      0.61     20234

    accuracy                           0.42     64510
   macro avg       0.30      0.24      0.24     64510
weighted avg       0.36      0.42      0.35     64510



As training set size grows, accuracy increases. Displayed in rating 10.


### Naive Bayes 

In [15]:
# Bernoulli Naive Bayes Classifier
bern_clf = Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', BernoulliNB()),
               ])

# Fitting + Prediction
bern_clf.fit(x_train.values.astype('U'),y_train)
bern_prediction = bern_clf.predict(x_test.values.astype('U'))

# Accuracy + Classification Report
print('================= Bernoulli Naive Bayes =================')
print(classification_report(y_test,bern_prediction))

              precision    recall  f1-score   support

           1       0.47      0.66      0.55      8779
           2       0.31      0.11      0.16      2741
           3       0.28      0.11      0.16      2558
           4       0.42      0.05      0.09      1996
           5       0.24      0.16      0.19      3257
           6       0.48      0.04      0.07      2513
           7       0.33      0.09      0.14      3838
           8       0.31      0.25      0.28      7683
           9       0.34      0.33      0.33     10911
          10       0.50      0.74      0.60     20234

    accuracy                           0.43     64510
   macro avg       0.37      0.25      0.26     64510
weighted avg       0.40      0.43      0.39     64510



In [68]:
# Gaussian Naive Bayes Classifier
gauss_nb = Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', GaussianNB()),
               ])

# Fitting + Prediction
gauss_nb.fit(x_train.values.astype('U'),y_train)
gauss_prediction = gauss_nb.predict(x_test.values.astype('U'))

# Accuracy + Classification Report
print(classification_report(y_test,gauss_prediction))

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [None]:
# Multinomial Naive Bayes Classifier
mn_nb =Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', MultinomialNB()),
               ])

# Fitting + Prediction
mn_nb.fit(x_train.values.astype('U'),y_train)
mn_prediction = mn_nb.predict(x_test.values.astype('U'))

# Accuracy + Classification Report
print(classification_report(y_test,mn_prediction, zero_division=0))


              precision    recall  f1-score   support

           1       0.75      0.12      0.20      2173
           2       0.00      0.00      0.00       718
           3       0.00      0.00      0.00       658
           4       0.00      0.00      0.00       509
           5       0.00      0.00      0.00       831
           6       0.00      0.00      0.00       647
           7       0.00      0.00      0.00       913
           8       0.62      0.00      0.01      1830
           9       0.39      0.01      0.01      2715
          10       0.33      1.00      0.49      5134

    accuracy                           0.33     16128
   macro avg       0.21      0.11      0.07     16128
weighted avg       0.34      0.33      0.19     16128

