In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB

In [2]:
# Read the preprocessed corpus from disk and keep a handle on the text column
# that the vectorizers below consume.
dataset = pd.read_csv("preprocessed_dataset.csv")
preprocessed_reviews = dataset.loc[:, "review"]

# Text vectorization

## Bag of words

In [8]:
# Bag of words: learn a vocabulary from the reviews and encode each review as
# a sparse vector of token counts.
# NOTE: the vectorizer is fitted on every review, i.e. before the train/test
# split made later in this notebook, so held-out reviews contribute to the
# vocabulary.
count_vect = CountVectorizer() #in scikit-learn
count_vect.fit(preprocessed_reviews)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(count_vect, "get_feature_names_out"):
    bow_feature_sample = count_vect.get_feature_names_out()[:10].tolist()
else:
    bow_feature_sample = count_vect.get_feature_names()[:10]
print("some feature names ", bow_feature_sample)
print('='*50)

Bow_vectors = count_vect.transform(preprocessed_reviews)
print("the type of count vectorizer ",type(Bow_vectors))
# .shape is (n_reviews, vocabulary_size); the second entry is the number of features.
print("the shape of out text BOW vectorizer ",Bow_vectors.shape)
print("the number of unique words ",Bow_vectors.shape[1])

some feature names  ['aa', 'aaa', 'aaaaaaaaaaaahhhhhhhhhhhhhh', 'aaaaaaaargh', 'aaaaaaah', 'aaaaaaahhhhhhggg', 'aaaaagh', 'aaaaah', 'aaaaahhhh', 'aaaaargh']
the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer  (50000, 101871)
the number of unique words  101871


## Tfidf vectorizer

In [3]:
# TF-IDF over unigrams and bigrams, dropping terms that occur in fewer than
# 10 reviews (min_df=10).
# NOTE: the vectorizer is fitted on every review, i.e. before the train/test
# split made later in this notebook, so held-out reviews contribute to the
# vocabulary and the IDF weights.
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2), min_df=10)
tf_idf_vect.fit(preprocessed_reviews)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(tf_idf_vect, "get_feature_names_out"):
    tfidf_feature_sample = tf_idf_vect.get_feature_names_out()[0:10].tolist()
else:
    tfidf_feature_sample = tf_idf_vect.get_feature_names()[0:10]
print("some sample features(unique words in the corpus)",tfidf_feature_sample)
print('='*50)

final_tf_idf = tf_idf_vect.transform(preprocessed_reviews)
print("the type of count vectorizer ",type(final_tf_idf))
# .shape is (n_reviews, n_features); features here are unigrams plus bigrams.
print("the shape of out text TFIDF vectorizer ",final_tf_idf.shape)
print("the number of unique words including both unigrams and bigrams ", final_tf_idf.shape[1])

some sample features(unique words in the corpus) ['aa', 'aaa', 'aag', 'aaliyah', 'aames', 'aamir', 'aamir khan', 'aardman', 'aaron', 'aaron carter']
the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text TFIDF vectorizer  (50000, 80839)
the number of unique words including both unigrams and bigrams  80839


# Concept 1 - Naive Bayes

## Algo 1 - MultinomialNB

### Bag of words

In [17]:
# Hold out 25% of the bag-of-words rows (and their sentiment labels) for
# testing; the fixed random_state keeps the split repeatable.
X_train_br, X_test_br, Y_train_br, Y_test_br = train_test_split(
    Bow_vectors,
    dataset['sentiment'],
    test_size=0.25,
    random_state=5,
)

In [18]:
# Baseline: MultinomialNB with default settings on the bag-of-words training split.
# fit() returns the estimator itself, so the binding and the fit can be chained.
MNB_BOW = MultinomialNB().fit(X_train_br, Y_train_br)
MNB_BOW  # last expression: display the fitted estimator

MultinomialNB()

In [19]:
# Score the baseline MultinomialNB on the held-out bag-of-words split.
predicted = MNB_BOW.predict(X_test_br)
# accuracy_score is documented as (y_true, y_pred): ground-truth labels go first.
accuracy_score = metrics.accuracy_score(y_true=Y_test_br, y_pred=predicted)

In [20]:
# Report the accuracy computed in the previous cell as a percentage with two decimals.
print('{:04.2f}'.format(accuracy_score * 100) + '%')

86.34%


In [21]:
# 3-fold cross-validated grid search over the MultinomialNB smoothing
# parameter alpha on the bag-of-words training split, scored by accuracy.
nb = MultinomialNB()
parameters = {
    'alpha': [0.00001, 0.0005, 0.0001, 0.005, 0.001, 0.05, 0.01, 0.1, 0.5, 1, 5, 10, 50, 100],
}
clf = GridSearchCV(
    nb,
    parameters,
    cv=3,
    scoring='accuracy',
    return_train_score=True,  # keep train scores so over-fitting is visible
)
clf.fit(X_train_br, Y_train_br)

# One row per candidate alpha; show the four best by mean CV test score.
results = pd.DataFrame(clf.cv_results_)
results.sort_values(by="rank_test_score").head(4)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
8,0.162306,0.002113,0.070021,0.000774,0.5,{'alpha': 0.5},0.85536,0.85984,0.85512,0.856773,0.002171,1,0.9232,0.92128,0.92188,0.92212,0.000802
9,0.179413,0.003457,0.069601,0.006317,1.0,{'alpha': 1},0.85536,0.85944,0.85488,0.85656,0.002046,2,0.91496,0.91464,0.91492,0.91484,0.000142
10,0.209964,0.042985,0.083813,0.013644,5.0,{'alpha': 5},0.85368,0.85784,0.85328,0.854933,0.002062,3,0.89444,0.8944,0.89492,0.894587,0.000236
7,0.17605,0.015154,0.065146,0.001995,0.1,{'alpha': 0.1},0.85256,0.8572,0.85432,0.854693,0.001913,4,0.93712,0.9374,0.93624,0.93692,0.000494


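The grid search ranks alpha = 0.5 first (mean 3-fold CV accuracy ≈ 0.857), so we use alpha = 0.5 for the final MultinomialNB model on bag-of-words features.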

In [22]:
# Refit MultinomialNB on the bag-of-words training split with alpha=0.5 and
# report accuracy on the held-out split.
MNB_BOW = MultinomialNB(alpha=0.5)
MNB_BOW.fit(X_train_br, Y_train_br)
predicted = MNB_BOW.predict(X_test_br)
# accuracy_score is documented as (y_true, y_pred): ground-truth labels go first.
accuracy_score = metrics.accuracy_score(y_true=Y_test_br, y_pred=predicted)
print('{:04.2f}'.format(accuracy_score*100) + '%')

86.38%


### Tfidf vector

In [4]:
# Hold out 25% of the TF-IDF rows (and their sentiment labels) for testing;
# the fixed random_state keeps the split repeatable.
X_train_tfidf, X_test_tfidf, Y_train_tfidf, Y_test_tfidf = train_test_split(
    final_tf_idf,
    dataset['sentiment'],
    test_size=0.25,
    random_state=5,
)

In [24]:
# Baseline: MultinomialNB with default settings on the TF-IDF training split.
# fit() returns the estimator itself, so the binding and the fit can be chained.
MNB_TFIDF = MultinomialNB().fit(X_train_tfidf, Y_train_tfidf)
MNB_TFIDF  # last expression: display the fitted estimator

MultinomialNB()

In [25]:
# Score the baseline MultinomialNB on the held-out TF-IDF split.
predicted = MNB_TFIDF.predict(X_test_tfidf)
# accuracy_score is documented as (y_true, y_pred): ground-truth labels go first.
accuracy_score = metrics.accuracy_score(y_true=Y_test_tfidf, y_pred=predicted)

In [26]:
# Report the accuracy computed in the previous cell as a percentage with two decimals.
print('{:04.2f}'.format(accuracy_score * 100) + '%')

89.10%


In [27]:
# 3-fold cross-validated grid search over the MultinomialNB smoothing
# parameter alpha on the TF-IDF training split, scored by accuracy.
nb = MultinomialNB()
parameters = {
    'alpha': [0.00001, 0.0005, 0.0001, 0.005, 0.001, 0.05, 0.01, 0.1, 0.5, 1, 5, 10, 50, 100],
}
clf = GridSearchCV(
    nb,
    parameters,
    cv=3,
    scoring='accuracy',
    return_train_score=True,  # keep train scores so over-fitting is visible
)
clf.fit(X_train_tfidf, Y_train_tfidf)

# One row per candidate alpha; show the four best by mean CV test score.
results = pd.DataFrame(clf.cv_results_)
results.sort_values(by="rank_test_score").head(4)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
7,0.198195,0.002413,0.062212,0.002013,0.1,{'alpha': 0.1},0.88368,0.88608,0.88256,0.884107,0.001468,1,0.95424,0.9536,0.95348,0.953773,0.000334
5,0.205386,0.019352,0.066733,0.004912,0.05,{'alpha': 0.05},0.88232,0.88568,0.8816,0.8832,0.001778,2,0.95744,0.95696,0.95672,0.95704,0.000299
8,0.194891,0.021734,0.064944,0.003883,0.5,{'alpha': 0.5},0.88144,0.88488,0.88288,0.883067,0.001411,3,0.9416,0.9416,0.94112,0.94144,0.000226
9,0.189951,0.005836,0.064339,0.003978,1.0,{'alpha': 1},0.88096,0.88472,0.88296,0.88288,0.001536,4,0.93368,0.93316,0.934,0.933613,0.000346


The grid search ranks alpha = 0.1 first (mean 3-fold CV accuracy ≈ 0.884), so we use alpha = 0.1 for the final MultinomialNB model on TF-IDF features.

In [28]:
# Refit MultinomialNB on the TF-IDF training split with alpha=0.1 and report
# accuracy on the held-out split.
MNB_TFIDF = MultinomialNB(alpha=0.1)
MNB_TFIDF.fit(X_train_tfidf, Y_train_tfidf)
predicted = MNB_TFIDF.predict(X_test_tfidf)
# accuracy_score is documented as (y_true, y_pred): ground-truth labels go first.
accuracy_score = metrics.accuracy_score(y_true=Y_test_tfidf, y_pred=predicted)
print('{:04.2f}'.format(accuracy_score*100) + '%')

89.10%


## Algo 2 - BernoulliNB

### Bag of words

In [30]:
# Baseline: BernoulliNB with default settings on the bag-of-words training split.
# fit() returns the estimator itself, so the binding and the fit can be chained.
BER_BOW = BernoulliNB().fit(X_train_br, Y_train_br)
BER_BOW  # last expression: display the fitted estimator

BernoulliNB()

In [31]:
# Score the baseline BernoulliNB on the held-out bag-of-words split.
# accuracy_score is documented as (y_true, y_pred): ground-truth labels go first.
accuracy_score = metrics.accuracy_score(
    y_true=Y_test_br,
    y_pred=BER_BOW.predict(X_test_br),
)
print('{:4.2f}'.format(accuracy_score*100) + '%')

85.78%


In [32]:
# 3-fold cross-validated grid search over the BernoulliNB smoothing
# parameter alpha on the bag-of-words training split, scored by accuracy.
nb = BernoulliNB()
parameters = {
    'alpha': [0.00001, 0.0005, 0.0001, 0.005, 0.001, 0.05, 0.01, 0.1, 0.5, 1, 5, 10, 50, 100],
}
clf = GridSearchCV(
    nb,
    parameters,
    cv=3,
    scoring='accuracy',
    return_train_score=True,  # keep train scores so over-fitting is visible
)
clf.fit(X_train_br, Y_train_br)

# One row per candidate alpha; show the four best by mean CV test score.
results = pd.DataFrame(clf.cv_results_)
results.sort_values(by="rank_test_score").head(4)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
8,0.306204,0.106303,0.13291,0.026773,0.5,{'alpha': 0.5},0.84872,0.8576,0.84952,0.851947,0.004011,1,0.92248,0.92084,0.91936,0.920893,0.001274
9,0.237554,0.0194,0.116294,0.009524,1.0,{'alpha': 1},0.84928,0.85656,0.84792,0.851253,0.003793,2,0.91328,0.9118,0.91012,0.911733,0.001291
7,0.325969,0.049943,0.19637,0.03026,0.1,{'alpha': 0.1},0.84776,0.85424,0.84936,0.850453,0.002756,3,0.93932,0.93908,0.93656,0.93832,0.001248
5,0.208242,0.02584,0.103955,0.013311,0.05,{'alpha': 0.05},0.84648,0.85216,0.8476,0.848747,0.002457,4,0.94392,0.94444,0.94176,0.943373,0.00116


In [33]:
# Refit BernoulliNB on the bag-of-words training split with alpha=0.5 and
# report accuracy on the held-out split.
BN_BOW = BernoulliNB(alpha=0.5)
BN_BOW.fit(X_train_br, Y_train_br)
predicted = BN_BOW.predict(X_test_br)
# accuracy_score is documented as (y_true, y_pred): ground-truth labels go first.
accuracy_score = metrics.accuracy_score(y_true=Y_test_br, y_pred=predicted)
print('{:04.2f}'.format(accuracy_score*100) + '%')

85.68%


### Tfidf vector

In [34]:
# 75/25 split of the TF-IDF matrix for the BernoulliNB baseline. The inputs,
# test_size and random_state are the same as in the earlier split that
# produced X_train_tfidf / X_test_tfidf / Y_train_tfidf / Y_test_tfidf.
X_train, X_test, Y_train, Y_test = train_test_split(
    final_tf_idf,
    dataset['sentiment'],
    test_size=0.25,
    random_state=5,
)

In [35]:
# Baseline: BernoulliNB with default settings on the TF-IDF training split.
# fit() returns the estimator itself, so the binding and the fit can be chained.
BER_tfidf = BernoulliNB().fit(X_train, Y_train)
BER_tfidf  # last expression: display the fitted estimator

BernoulliNB()

In [36]:
# Score the baseline BernoulliNB on the held-out TF-IDF split.
# accuracy_score is documented as (y_true, y_pred): ground-truth labels go first.
accuracy_score = metrics.accuracy_score(
    y_true=Y_test,
    y_pred=BER_tfidf.predict(X_test),
)
print('{:4.2f}'.format(accuracy_score*100) + '%')

88.62%


In [37]:
# 3-fold cross-validated grid search over the BernoulliNB smoothing
# parameter alpha on the TF-IDF training split, scored by accuracy.
nb = BernoulliNB()
parameters = {
    'alpha': [0.00001, 0.0005, 0.0001, 0.005, 0.001, 0.05, 0.01, 0.1, 0.5, 1, 5, 10, 50, 100],
}
clf = GridSearchCV(
    nb,
    parameters,
    cv=3,
    scoring='accuracy',
    return_train_score=True,  # keep train scores so over-fitting is visible
)
clf.fit(X_train_tfidf, Y_train_tfidf)

# One row per candidate alpha; show the four best by mean CV test score.
results = pd.DataFrame(clf.cv_results_)
results.sort_values(by="rank_test_score").head(4)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
8,0.372793,0.033801,0.158162,0.027643,0.5,{'alpha': 0.5},0.87992,0.88472,0.88104,0.881893,0.00205,1,0.93928,0.93848,0.93844,0.938733,0.000387
9,0.364385,0.055367,0.168409,0.081162,1.0,{'alpha': 1},0.87856,0.88408,0.88096,0.8812,0.00226,2,0.93588,0.93436,0.93376,0.934667,0.000892
7,0.283094,0.019319,0.163249,0.041871,0.1,{'alpha': 0.1},0.87832,0.88376,0.88088,0.880987,0.002222,3,0.9454,0.94484,0.94496,0.945067,0.000241
5,0.337699,0.015751,0.124636,0.011726,0.05,{'alpha': 0.05},0.87768,0.88304,0.87976,0.88016,0.002206,4,0.94664,0.94644,0.94696,0.94668,0.000214


In [38]:
# Refit BernoulliNB on the TF-IDF training split with alpha=0.5 and report
# accuracy on the held-out split.
BN_TFIDF = BernoulliNB(alpha=0.5)
BN_TFIDF.fit(X_train_tfidf, Y_train_tfidf)
predicted = BN_TFIDF.predict(X_test_tfidf)
# accuracy_score is documented as (y_true, y_pred): ground-truth labels go first.
accuracy_score = metrics.accuracy_score(y_true=Y_test_tfidf, y_pred=predicted)
print('{:04.2f}'.format(accuracy_score*100) + '%')

88.62%


# Concept 2 - Support Vector Machine (SVC with its default RBF kernel, not a linear SVM)

### Bag of words

In [40]:
# Baseline: SVC on the bag-of-words training split. No kernel argument is
# passed, so scikit-learn's default kernel is used.
# fit() returns the estimator itself, so the binding and the fit can be chained.
SVM_br = SVC(random_state=0).fit(X_train_br, Y_train_br)
SVM_br  # last expression: display the fitted estimator

SVC(random_state=0)

In [41]:
# Score the baseline SVC on the held-out bag-of-words split.
predicted = SVM_br.predict(X_test_br)
# accuracy_score is documented as (y_true, y_pred): ground-truth labels go first.
accuracy_score = metrics.accuracy_score(y_true=Y_test_br, y_pred=predicted)
print('{:04.2f}'.format(accuracy_score*100) + '%')

88.28%


In [18]:
# 3-fold cross-validated grid search over the SVC regularisation parameter C
# on the bag-of-words training split, scored by accuracy.
# (The estimator variable keeps the name `nb` used by the earlier grid-search
# cells even though it holds an SVC here.)
nb = SVC()
param_grid = {
    'C': [0.1, 1, 10],
}
clf = GridSearchCV(
    nb,
    param_grid,
    cv=3,
    scoring='accuracy',
    return_train_score=True,  # keep train scores so over-fitting is visible
)
clf.fit(X_train_br, Y_train_br)

# One row per candidate C; head(4) shows all three, best mean CV test score first.
results = pd.DataFrame(clf.cv_results_)
results.sort_values(by="rank_test_score").head(4)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
2,5091.270832,3587.157068,303.814949,22.576396,10.0,{'C': 10},0.87536,0.87888,0.87968,0.877973,0.001877,1,0.99908,0.9992,0.99936,0.999213,0.000115
1,1721.834738,171.07588,299.660156,31.879121,1.0,{'C': 1},0.87296,0.87496,0.87664,0.874853,0.001504,2,0.9628,0.9624,0.9626,0.9626,0.000163
0,779.521436,24.487984,397.687937,44.327019,0.1,{'C': 0.1},0.79952,0.806,0.80576,0.80376,0.003,3,0.8288,0.82592,0.8246,0.82644,0.001754


In [19]:
# Refit SVC on the bag-of-words training split with C=10 and report accuracy
# on the held-out split.
SVM_BOW = SVC(C=10)
SVM_BOW.fit(X_train_br, Y_train_br)
predicted = SVM_BOW.predict(X_test_br)
# accuracy_score is documented as (y_true, y_pred): ground-truth labels go first.
accuracy_score = metrics.accuracy_score(y_true=Y_test_br, y_pred=predicted)
print('{:04.2f}'.format(accuracy_score*100) + '%')

88.60%


### TFIDF method

In [13]:
# Baseline: SVC on the TF-IDF training split. No kernel argument is passed,
# so scikit-learn's default kernel is used.
# fit() returns the estimator itself, so the binding and the fit can be chained.
SVM_tfidf = SVC(random_state=0).fit(X_train_tfidf, Y_train_tfidf)
SVM_tfidf  # last expression: display the fitted estimator

SVC(random_state=0)

In [14]:
# Score the baseline SVC on the held-out TF-IDF split.
predicted = SVM_tfidf.predict(X_test_tfidf)
# accuracy_score is documented as (y_true, y_pred): ground-truth labels go first.
accuracy_score = metrics.accuracy_score(y_true=Y_test_tfidf, y_pred=predicted)
print('{:04.2f}'.format(accuracy_score*100) + '%')

90.84%


In [5]:
# Fit SVC on the TF-IDF training split with C=10 and report accuracy on the
# held-out split.
SVM_TFIDF = SVC(C=10)
SVM_TFIDF.fit(X_train_tfidf, Y_train_tfidf)
predicted = SVM_TFIDF.predict(X_test_tfidf)
# accuracy_score is documented as (y_true, y_pred): ground-truth labels go first.
accuracy_score = metrics.accuracy_score(y_true=Y_test_tfidf, y_pred=predicted)
print('{:04.2f}'.format(accuracy_score*100) + '%')

91.26%
