# Rating classification using sklearn
In this notebook we implement a review-rating classification model using the scikit-learn library.

## Import

In [12]:
import string

import numpy as np
import pandas as pd
import spacy
from langdetect import DetectorFactory, detect
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

## Loading data

In [2]:
# Only the star rating and the review text are used by the cells below, so skip
# parsing the remaining CSV columns to keep memory use down.
ds = pd.read_csv('yelp_review.csv', usecols=['stars', 'text'])

In [3]:
def positive_negative_reviews(x):
    """Map a star rating to a sentiment label.

    Returns 1 for ratings of 4 or more, -1 for ratings of 2 or less, and
    ``np.nan`` for anything in between (or for a NaN rating, since both
    comparisons are then false).
    """
    if x >= 4:
        return 1
    return -1 if x <= 2 else np.nan

# Keep only the rating and the review text. The explicit .copy() makes this an
# independent frame, so adding a column below does not raise pandas'
# SettingWithCopyWarning.
clean_ds = ds[['stars', 'text']].copy()
del ds  # the raw frame is not used again

# Label each review: 1 = positive (>= 4 stars), -1 = negative (<= 2 stars).
# Reviews in between get NaN and are dropped.
clean_ds['label'] = clean_ds['stars'].apply(positive_negative_reviews)
clean_ds = clean_ds[clean_ds['label'].notna()]

# Balanced subset: the first 2000 positive reviews followed by the first 2000
# negative ones. pd.concat is used because DataFrame.append was removed in
# pandas 2.0; like append, it keeps the original row index.
clean_ds = pd.concat([
    clean_ds[clean_ds['label'] == 1].head(2000),
    clean_ds[clean_ds['label'] == -1].head(2000),
])

# langdetect is non-deterministic unless its factory is seeded before the first
# detection; seed it so the set of kept reviews is reproducible.
DetectorFactory.seed = 0
clean_ds['language'] = clean_ds['text'].apply(detect)
clean_ds = clean_ds[clean_ds['language'] == 'en'] # select only english reviews

np.random.seed(42)
clean_ds = clean_ds.reindex(np.random.permutation(clean_ds.index)) # random shuffle

In [4]:
# Punctuation characters to filter out of the token stream. This is a plain
# string, so an `in` test against it is a substring check.
punctuations = string.punctuation

# spaCy's English stop-word collection (the same object as the imported STOP_WORDS).
stop_words = STOP_WORDS

# Pipeline loaded through the 'en' shortcut name.
# NOTE(review): `nlp` is not referenced by the tokenizer in this section — confirm it is still needed.
nlp = spacy.load('en')

# English language object built with default settings; spacy_tokenizer calls it
# to turn each review into tokens.
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with ``parser`` and return the normalised tokens.

    Each token is replaced by its lower-cased, whitespace-stripped lemma, except
    that tokens whose lemma is the "-PRON-" placeholder keep their lower-cased
    surface form. Tokens found in ``stop_words`` or in ``punctuations`` are
    dropped.

    Note: ``punctuations`` is a string, so the punctuation test is a substring
    check. The empty string is therefore filtered out as well.
    """
    doc = parser(sentence)

    kept = []
    for token in doc:
        # Normalise first, then decide whether the token survives the filters.
        if token.lemma_ != "-PRON-":
            normalized = token.lemma_.lower().strip()
        else:
            normalized = token.lower_

        if normalized in stop_words or normalized in punctuations:
            continue
        kept.append(normalized)

    return kept

In [5]:
class predictors(TransformerMixin):
    """Pipeline step that normalises every input text with ``clean_text``.

    The transformer holds no state and exposes no parameters.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with ``clean_text`` applied to each element of ``X``."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """Report an empty parameter dict (the transformer has nothing to tune)."""
        return {}
def clean_text(text):
    """Return ``text`` with leading/trailing whitespace removed, in lowercase."""
    trimmed = text.strip()
    return trimmed.lower()

In [6]:
# TF-IDF features built on top of the custom spaCy-based tokenizer.
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer)

In [7]:
# Model input: the raw review text.
X = clean_ds['text']
# Target: the sentiment label (1 = positive, -1 = negative).
ylabels = clean_ds['label']

# Hold out one third of the reviews for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    ylabels,
    test_size=1/3,
    random_state=42,
)

In [8]:
class EstimatorSelectionHelper:
    """Run one ``GridSearchCV`` per estimator and tabulate the per-fold scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator instance.
    params : dict
        Maps the same names to the parameter grid passed to ``GridSearchCV``.

    Raises
    ------
    ValueError
        If a key of ``models`` has no matching entry in ``params``.
    """

    def __init__(self, models, params):
        missing_params = set(models.keys()) - set(params.keys())
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % list(missing_params))
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Fitted GridSearchCV objects, keyed by estimator name; filled by fit().
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Fit a grid search for every registered estimator.

        ``cv``, ``n_jobs``, ``verbose``, ``scoring`` and ``refit`` are forwarded
        unchanged to ``GridSearchCV``. Each fitted search is stored in
        ``self.grid_searches`` under the estimator's name.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key], cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter combination) with fold statistics.

        Parameters
        ----------
        sort_by : str
            Column used to sort the result in descending order.

        Returns
        -------
        pandas.DataFrame
            Columns ``estimator``, ``min_score``, ``mean_score``, ``max_score``
            and ``std_score`` (computed over the test-fold scores), followed by
            one column per searched parameter.

        Raises
        ------
        ValueError
            If no grid search has been stored yet.
        """
        if not self.grid_searches:
            raise ValueError("No grid searches to summarise; call fit() first.")

        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k, gs in self.grid_searches.items():
            print(k)
            params = gs.cv_results_['params']
            # Use the fitted fold count: `gs.cv` is whatever the caller passed and
            # may be None or a splitter object, which range() cannot consume.
            scores = [
                gs.cv_results_["split{}_test_score".format(i)].reshape(len(params), 1)
                for i in range(gs.n_splits_)
            ]

            # One row per parameter combination, one column per fold.
            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append(row(k, s, p))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        # Put the summary columns first, then the searched parameters.
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

## Cross validation on SVM

In [9]:
classifier = SVC()
# Three-step pipeline: text cleaning -> TF-IDF features -> support vector classifier.
pipe = Pipeline([
    ("cleaner", predictors()),
    ('vectorizer', tfidf_vector),
    ('classifier', classifier),
])
# 5-fold grid search over the SVC kernel and C, scored by accuracy. refit=True
# so the fitted `grid` can be used directly for prediction afterwards.
grid = GridSearchCV(
    pipe,
    param_grid={
        'classifier__kernel': ['linear', 'rbf', 'poly'],
        'classifier__C': [1, 10, 100, 100000],
    },
    cv=5,
    scoring='accuracy',
    refit=True,
)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cleaner',
                                        <__main__.predictors object at 0x7f02d048f210>),
                                       ('vectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                      

In [11]:
# Score the fitted grid search on the held-out test split.
predicted = grid.predict(X_test)
print(confusion_matrix(y_test, predicted))

svc_accuracy = accuracy_score(y_test, predicted)
print("Support vector classifier Accuracy:", svc_accuracy)

svc_precision = precision_score(y_test, predicted)
print("Support vector classifier Precision:", svc_precision)

svc_recall = recall_score(y_test, predicted)
print("Support vector classifier Recall:", svc_recall)

[[576  83]
 [ 77 560]]
Support vector classifier Accuracy: 0.8765432098765432
Support vector classifier Precision: 0.8709175738724728
Support vector classifier Recall: 0.8791208791208791
