In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
from sklearn.model_selection import train_test_split # function for splitting data to train and test sets

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier

from subprocess import check_output
from sklearn.utils import shuffle

import matplotlib.pyplot as plt

import seaborn as sns

In [None]:
# Load the review dataset and shuffle its rows.
# The shuffled frame must be bound back to `data`; the original assigned it to a
# throwaway name, so every later cell silently used the unshuffled frame.
# A fixed seed keeps the row order (and therefore the seeded train/test split
# performed later) identical across kernel restarts.
data = shuffle(pd.read_excel('Book2.xlsx'), random_state=0)

In [None]:
# Plot how many reviews fall into each rating class.
sns.countplot(x='rating', data=data, palette='hls')
# Save BEFORE showing: once plt.show() has rendered the figure, the current
# figure is released, so a later savefig() would write an empty image.
plt.savefig('count_plot')
plt.show()

In [None]:
# Separate the frame into the model input (review text column) and the target (rating column).
X, Y = data["review_comment"], data["rating"]

In [None]:
# Hold out one third of the samples for testing; the fixed seed makes the
# partition repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,                # predictors
    Y,                # labels
    test_size=1/3,
    random_state=0,
)

# Show the size of each partition, one per line.
print(X_train.shape, X_test.shape, sep="\n")

In [None]:
# Build the modelling pipeline: TF-IDF text features feeding a gradient-boosted classifier.
# The step names ('tfidf', 'clf') are the prefixes used to address each step's
# hyper-parameters as '<step>__<param>'.
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

pipe = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', GradientBoostingClassifier(random_state=42)),
    ]
)

In [None]:
# Hyper-parameter grids for the pipeline. Keys follow scikit-learn's
# '<step name>__<parameter>' convention, so the prefix must match a step name
# of the pipeline ('tfidf' and 'clf').

# Grid over the TF-IDF vectorizer settings.
param1 = dict(
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidf__min_df=[1, 2],
    tfidf__stop_words=[None, "english"],
    tfidf__smooth_idf=[False, True],
    tfidf__use_idf=[False, True],
    tfidf__sublinear_tf=[False, True],
    tfidf__binary=[False, True],
)

# Grid over the classifier settings.
# The previous grid addressed a non-existent 'classifier' step and tuned
# C / penalty, which are not GradientBoostingClassifier parameters, so the
# search failed with an invalid-parameter error. These keys target the 'clf'
# step with parameters that the gradient-boosting estimator actually exposes.
param2 = dict(
    clf__learning_rate=[10**-2, 10**-1, 10**0],
    clf__n_estimators=[50, 100, 200],
    clf__max_depth=[3, 5],
)

In [None]:
# Bundle both grids for the search. A list of dicts is searched as separate
# grids (each dict is explored on its own), not as one combined cross-product.
params = [param1, param2]

In [None]:
# Exhaustive search over `params` with 2-fold cross-validation, fitted on the training split.
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(estimator=pipe, param_grid=params, cv=2)
# fit() returns the fitted search object itself, so rebinding keeps `grid` unchanged
# while avoiding an echoed repr in the cell output.
grid = grid.fit(X_train, y_train)

In [None]:
# Compare the fitted search's score on the data it was trained on with its
# score on the held-out data.
print('Training set score: ', grid.score(X_train, y_train), sep='')
print('Test set score: ', grid.score(X_test, y_test), sep='')

In [None]:
# Winning hyper-parameter combination found by the search.
best_params = grid.best_params_
# The corresponding fitted model, kept for the prediction cells that follow.
best_estimator = grid.best_estimator_

# Display both, parameters first.
print(best_params)
print(best_estimator)

In [None]:
# Predict labels for the held-out test inputs using the best model from the grid search.
y_pred2 = best_estimator.predict(X_test)

In [None]:
# Print the per-class text report comparing test labels with the predictions.
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=y_pred2))

In [None]:
# Classify previously unseen review text with the tuned pipeline.
docs_new = ['Very unhappy with the place']

predict_new = best_estimator.predict(docs_new)

# Print each document next to ITS OWN predicted label. The earlier version
# interpolated the whole `predict_new` array on every line and never used the
# loop variable `category`.
for doc, category in zip(docs_new, predict_new):
    print('%r => %s' % (doc, category))