In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the review dataset; later cells read its 'review' and 'division' columns.
df = pd.read_csv(filepath_or_buffer='EcoPreprocessed.csv')

In [3]:
# Cleaning resources are built once and shared by every row.
punct_table = str.maketrans('', '', string.punctuation)  # deletes all ASCII punctuation
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def clean_review(raw_text):
    """Normalise one review into a space-joined string of lemmatised tokens.

    Steps, in order: lowercase, strip punctuation, tokenize, drop English
    stop words, lemmatise each remaining token.

    Parameters
    ----------
    raw_text : str or missing value
        The review text. Anything that is not a string (e.g. the NaN pandas
        uses for an empty CSV field) is treated as an empty review.

    Returns
    -------
    str
        The cleaned review; '' when the input is missing or nothing survives.
    """
    # A missing review would otherwise crash on `.translate`.
    if not isinstance(raw_text, str):
        return ''
    tokens = word_tokenize(raw_text.lower().translate(punct_table))
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


# Single pass over the column instead of five separate rewrites of df['review'].
df['review'] = df['review'].apply(clean_review)

print(df['review'])

0                                 able play youtube alexa
1       able recognize indian accent really well drop ...
2       absolute smart device amazon connect external ...
3       absolutely amaze new member family control hom...
4       absolutely amaze previously sceptical invest m...
                              ...                        
4079    yo yo yo love go want one smart speaker value ...
4080                                        youtube music
4081    youtube support nahi kartasong recognise achha...
4082    yup proscontrols wipro light amazinglysony bra...
4083    zero integration capability fire tv device use...
Name: review, Length: 4084, dtype: object


In [4]:
# Target column ('division') and cleaned input text ('review') used by the split below.
labels, text = df['division'], df['review']

In [15]:
from sklearn.model_selection import ShuffleSplit

# Ten random 80/20 splits are generated, but the notebook only ever trains on
# ONE of them: the original loop overwrote X_train/X_test on every pass, so
# just the final split survived. That is made explicit here, and the same
# (last) split is kept so downstream numbers are unchanged.
# NOTE(review): the ten splits are not averaged anywhere; if repeated-holdout
# evaluation was intended, the model fitting has to move inside a loop.
shuffle_split = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
train_index, test_index = list(shuffle_split.split(text))[-1]

# split() yields positional indices, so select with .iloc; plain text[...] is
# label-based and only coincides with positions on a default RangeIndex.
X_train, X_test = text.iloc[train_index], text.iloc[test_index]
y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

In [70]:
# Sanity check: one shape per line, in the order X_train, X_test, y_train, y_test.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(3267,)
(817,)
(3267,)
(817,)


In [16]:
# Bag-of-words counts over unigrams and bigrams; terms seen in fewer than two
# training documents are dropped. The vocabulary is learned from the training
# split only and then reused, unchanged, to encode the test split.
# Note: X_train / X_test are rebound from text to count matrices here, so the
# split cell has to be re-run before this cell can be run again.
countvec = CountVectorizer(min_df=2, ngram_range=(1, 2))
X_train, X_test = countvec.fit_transform(X_train), countvec.transform(X_test)

In [72]:
# After vectorising, both feature matrices must share the same number of
# columns; the label shapes are printed alongside for comparison.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(3267, 4669)
(817, 4669)
(3267,)
(817,)


In [11]:
# Hyperparameter search for logistic regression (5-fold CV on the training split).
#
# The grid is split in two because not every solver supports every penalty:
# 'lbfgs' and 'sag' only handle L2, while 'liblinear' and 'saga' handle both
# L1 and L2. The previous single grid also requested L1 with 'lbfgs'/'sag';
# those fits simply failed and were wasted work.
shared_options = {
    'C': np.logspace(-3, 3, 7),          # 1e-3 ... 1e3, one value per decade
    'class_weight': ['balanced', None],
}
grid = [
    {**shared_options, 'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2']},
    {**shared_options, 'solver': ['lbfgs', 'sag'], 'penalty': ['l2']},
]

# max_iter is raised from the default 100, which produced the
# "TOTAL NO. of ITERATIONS REACHED LIMIT" warnings. random_state pins the
# data shuffling done by the 'sag', 'saga' and 'liblinear' solvers.
# LogisticRegression and GridSearchCV come from the notebook's import cell.
logreg = LogisticRegression(max_iter=1000, random_state=0)
logreg_cv = GridSearchCV(logreg, grid, cv=5)
logreg_cv.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [12]:

# Winning parameter combination and its mean cross-validated score
# (a CV estimate on the training split, not the held-out test accuracy).
best_logreg_params = logreg_cv.best_params_
best_logreg_cv_score = logreg_cv.best_score_
print("tuned hpyerparameters :(best parameters) ", best_logreg_params)
print("accuracy :", best_logreg_cv_score)

tuned hpyerparameters :(best parameters)  {'C': 10.0, 'class_weight': None, 'penalty': 'l1', 'solver': 'liblinear'}
accuracy : 0.8809353208667595


In [13]:
# Held-out accuracy of the best logistic-regression candidate; GridSearchCV
# predicts with the estimator it refit using the winning parameters.
y_pred = logreg_cv.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8776009791921665

In [18]:

# Baseline: logistic regression with default settings, for comparison with the
# tuned model. fit() returns the estimator itself, so `model1` and `log_reg`
# refer to the same fitted object.
model1 = log_reg = LogisticRegression().fit(X_train, y_train)

train_accuracy = log_reg.score(X_train, y_train)
print("Training accuracy ", train_accuracy)

y_pred = log_reg.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Testing accuracy", test_accuracy)

Training accuracy  0.982858891949801
Testing accuracy 0.8788249694002448


In [37]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Search space for the decision tree: 3 x 4 x 5 x 2 = 120 candidates,
# each evaluated with 5-fold cross-validation on the training split.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 10, 50, 100, None],
    ccp_alpha=[0.1, .01, .001, 0],        # cost-complexity pruning strength
    max_features=['sqrt', 'log2', None],
)

# A fixed seed keeps the tree construction repeatable between runs.
tree_clas = DecisionTreeClassifier(random_state=1024)
grid_search = GridSearchCV(
    estimator=tree_clas,
    param_grid=param_grid,
    cv=5,
    verbose=True,
)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 120 candidates, totalling 600 fits


In [38]:
# Keep a handle on the decision tree that GridSearchCV refit with the winning
# parameters; the bare name on the last line renders it as the cell output.
final_model = grid_search.best_estimator_
final_model

In [39]:

# Winning decision-tree parameters and their mean cross-validated score
# (a CV estimate on the training split, not the held-out test accuracy).
best_tree_params = grid_search.best_params_
best_tree_cv_score = grid_search.best_score_
print("tuned hpyerparameters :(best parameters) ", best_tree_params)
print("accuracy :", best_tree_cv_score)

tuned hpyerparameters :(best parameters)  {'ccp_alpha': 0.001, 'criterion': 'gini', 'max_depth': 100, 'max_features': None}
accuracy : 0.8833789941507322


In [41]:
# Held-out accuracy of the tuned decision tree (predictions come from the
# estimator GridSearchCV refit with the best parameters).
y_pred = grid_search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8702570379436965

In [40]:
# Rebuild the tuned tree by hand to compare training vs. testing accuracy.
# The values are copied from the grid-search result printed above
# (max_features is left at its default, None, which is what the search chose).
tree_op = DecisionTreeClassifier(
    criterion='gini',
    max_depth=100,
    ccp_alpha=0.001,
    random_state=1024,
)
tree_op.fit(X_train, y_train)

train_accuracy = tree_op.score(X_train, y_train)
print("Training accuracy ", train_accuracy)

y_pred = tree_op.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Testing accuracy", test_accuracy)

Training accuracy  0.9081726354453628
Testing accuracy 0.8702570379436965


In [33]:

# Baseline: an unpruned decision tree with default hyperparameters, to show
# how much the tuned tree reduces over-fitting (compare train vs. test below).
# random_state is pinned to the same seed as the other tree cells; without it
# the tree can differ from run to run, because scikit-learn permutes the
# features at each split, so the reported accuracies would not be reproducible.
tree = DecisionTreeClassifier(random_state=1024)
tree.fit(X_train, y_train)
print("Training accuracy ", tree.score(X_train, y_train))

y_pred = tree.predict(X_test)
print("Testing accuracy", accuracy_score(y_test, y_pred))

Training accuracy  0.9923477196204469
Testing accuracy 0.8384332925336597


In [81]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier; Minkowski distance with p=2 is the
# Euclidean distance. fit() returns the estimator, so `model3` is `knn`.
knn = KNeighborsClassifier(p=2, metric='minkowski', n_neighbors=5)
model3 = knn.fit(X_train, y_train)

# The value displayed is accuracy on the TRAINING split only.
knn.score(X_train, y_train)

0.7949188858279768

In [82]:
# Ad-hoc prediction for a new, hand-written review.
raw_reviews = ['This is a bad product']

# New text must go through the same cleaning as the training reviews
# (lowercase, strip punctuation, tokenize, drop stop words, lemmatise) before
# it is vectorised; otherwise the model sees tokens and bigrams that never
# occurred in the vocabulary it was trained on. `stop_words` and `lemmatizer`
# are the objects created in the cleaning cell earlier in the notebook.
strip_punctuation = str.maketrans('', '', string.punctuation)
cleaned_reviews = [
    ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(raw.lower().translate(strip_punctuation))
        if token not in stop_words
    )
    for raw in raw_reviews
]

# Reuse the fitted vectoriser (transform only; never refit on new text).
review = countvec.transform(cleaned_reviews)

# NOTE(review): `tree` is the default, unpruned decision tree fitted earlier,
# not the tuned model. Confirm that this is the model intended here.
print(tree.predict(review))

['negative']
