In [63]:
import pandas as pd 
import numpy as np
import re
import warnings

import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from mlxtend.classifier import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn import metrics

import math
import statistics #stdev

pd.set_option("max_colwidth", 130)
pd.set_option('display.max_columns', None)
pd.set_option("display.width", 300)
warnings.filterwarnings('ignore')

### <font color = darkblue>${\textbf{Dataset}}$</font>

In [51]:
df = pd.read_csv('C:/ahamid/git_wwsssuuup/IMDB Dataset.csv')
df = df.drop_duplicates()
df.rename(columns={'sentiment' : 'label'}, inplace = True) # rename column
df = df.sample(10000) # Take a subset for faster code test
df = df.replace({'negative': 1, 'positive': 0})
df.head(1)

Unnamed: 0,review,label
28734,"it's been awhile since i've seen Cold Mountain,bit i do knew that i enjoyed it immensely.though it does take place during the ...",0


### <font color = darkblue>${\textbf{Preprocessing}}$</font>

In [52]:
#Digit Count
def digit_count(review):
    count=0
    for c in review:
        if c.isnumeric():
            count+=1
    return count


#Letter Count
def letter_count(review):
    count=0
    for c in review:
        if c.isalpha(): # The isalpha() method returns True if all the characters are alphabet letters (a-z).
            count+=1
    return count


# Check special charcters
'''Non exhaustive list'''
def checkSpecial(string): 
    regex = re.compile('[-@_!#$%^&*()<>?|}{~]') 
    i=0
    for char in string:   
        if regex.search(char):
            i+=1
    return i


# Compute entropy
def entropy(review):
        string = review.strip()
        prob = [float(string.count(c)) / len(string) for c in dict.fromkeys(list(string))]
        entropy = sum([(p * math.log(p) / math.log(2.0)) for p in prob])
        return entropy


# Average word length
def avg_word_length(review):
    words = list(filter(None, re.split(r"\d*\W+", review)))
    average = sum(len(word) for word in words) / len(words)
    return average

In [53]:
def feature_transform(df):
    switch = {'words_avg' : avg_word_length, 'digits_count' : digit_count, 'letter_count' : letter_count,
              'specialChar' : checkSpecial, 'entropy' : entropy}
    for key in switch:
        df.insert(1, key, [switch[key](review) for review in df['review']])

# Transform Dataset(feature_transform): add computed features
feature_transform(df)

df['num.'] = df['review'].apply(lambda i: i.count('.'))

In [54]:
df = df.drop(['review'], axis = 1)

**Train test split**

In [55]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(df[df.columns.difference(['label'])],  
                                                    df["label"], test_size = 0.1, random_state = 26)

### <font color = darkblue>${\textbf{1. multiple GridSearches loop on multiple pipelines}}$</font>

In [56]:
# Construct some pipelines
pipe_lr = Pipeline([('lr', LogisticRegression(random_state = 26))])


mnb = MultinomialNB()
pipe_nb = Pipeline([('minmax', MinMaxScaler()),
                    ('clf', mnb)])


pipe_nb_pca = Pipeline([('pca', PCA(n_components = 'mle', svd_solver = 'full')), 
                        ('minmax', MinMaxScaler()),
                        ('clf', mnb)])

pipe_nb_scl = Pipeline([('scl', StandardScaler()), 
                        ('minmax', MinMaxScaler()),                      
                        ('clf', mnb)])


pipe_nb_scl_pca = Pipeline([('scl', StandardScaler()),
                            ('pca', PCA(n_components = 'mle', svd_solver = 'full')), 
                            ('minmax', MinMaxScaler()),
                            ('clf', mnb)])

In [57]:
# Set grid search params
param_range = [9, 10]
param_range_fl = [1.0, 0.5, 0.1]

grid_params_lr = [{'lr__penalty': ['l1', 'l2'],
                   'lr__C': param_range_fl,
                   'lr__solver': ['liblinear']}] 

grid_params_nb = [{'clf__alpha': [0.0, 1.0],
                   'clf__fit_prior': ['True', 'False']}]

In [58]:
# Construct grid searches
jobs = -1

gs_lr = GridSearchCV(estimator = pipe_lr, param_grid = grid_params_lr, scoring='accuracy', n_jobs=jobs)
gs_nb = GridSearchCV(estimator = pipe_nb, param_grid = grid_params_nb, scoring='accuracy', n_jobs=jobs)
gs_nb_pca = GridSearchCV(estimator = pipe_nb_pca, param_grid = grid_params_nb, scoring='accuracy', n_jobs=jobs)
gs_nb_scl = GridSearchCV(estimator = pipe_nb_scl, param_grid = grid_params_nb, scoring='accuracy', n_jobs=jobs)
gs_nb_scl_pca = GridSearchCV(estimator = pipe_nb_scl_pca, param_grid = grid_params_nb, scoring='accuracy', n_jobs=jobs)

# List of pipelines for ease of iteration
grids = [gs_lr, gs_nb, gs_nb_pca, gs_nb_scl, gs_nb_scl_pca]

# Dictionary of pipelines and classifier types for ease of reference
grid_dict = {0: 'Logistic regression', 1: 'Naive Bayes', 2: 'Naive Bayes PCA', 3: 'Naive Bayes SCL', 4: 'Naive Bayes PCA SCL'}

In [60]:
# Fit the grid search objects
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = ''
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
        # Fit grid search
    gs.fit(X_train, y_train)
    # Best params
    print('Best params: %s' % gs.best_params_)
    # Best training data accuracy
    print('Best training accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(X_test)
    # Test data accuracy of model with best params
    print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
    # Track best (highest test accuracy) model
    if accuracy_score(y_test, y_pred) > best_acc:
        best_acc = accuracy_score(y_test, y_pred)
        best_gs = gs
        best_clf = idx

print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])

Performing model optimizations...

Estimator: Logistic regression
Best params: {'lr__C': 1.0, 'lr__penalty': 'l2', 'lr__solver': 'liblinear'}
Best training accuracy: 0.546
Test set accuracy score for best params: 0.551 

Estimator: Naive Bayes
Best params: {'clf__alpha': 0.0, 'clf__fit_prior': 'True'}
Best training accuracy: 0.508
Test set accuracy score for best params: 0.536 

Estimator: Naive Bayes PCA
Best params: {'clf__alpha': 0.0, 'clf__fit_prior': 'True'}
Best training accuracy: 0.504
Test set accuracy score for best params: 0.508 

Estimator: Naive Bayes SCL
Best params: {'clf__alpha': 0.0, 'clf__fit_prior': 'True'}
Best training accuracy: 0.508
Test set accuracy score for best params: 0.536 

Estimator: Naive Bayes PCA SCL
Best params: {'clf__alpha': 1.0, 'clf__fit_prior': 'True'}
Best training accuracy: 0.505
Test set accuracy score for best params: 0.507 

Classifier with best test set accuracy: Logistic regression


### <font color = darkblue>${\textbf{2. Stacking multiple classifiers and a pipeline on a metamodel in addition to Grid Search}}$</font>

In [64]:
# Declare some classifiers and a pipeline
clf1 = RandomForestClassifier()

clf2 = Pipeline([('pca', PCA(n_components = 'mle', svd_solver = 'full')), 
                 ('minmax', MinMaxScaler()),
                 ('clf', mnb)])

clf3 = xgb.XGBClassifier(learning_rate=0.1)

# LR
lr = LogisticRegression()

In [65]:
# Stacking the classifiers
sclf = StackingClassifier(classifiers = [clf1, clf2, clf3], 
                          meta_classifier = lr)

In [66]:
# Grid Search parameters
params = {'randomforestclassifier__n_estimators': [10, 25, 40, 50]}
# Declare GridSearchCV
grid = GridSearchCV(estimator=sclf, 
                    param_grid=params, 
                    verbose = 4,
                    n_jobs = -1,
                    refit=True)
# Fir GS
grid.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


GridSearchCV(estimator=StackingClassifier(classifiers=[RandomForestClassifier(),
                                                       Pipeline(steps=[('pca',
                                                                        PCA(n_components='mle',
                                                                            svd_solver='full')),
                                                                       ('minmax',
                                                                        MinMaxScaler()),
                                                                       ('clf',
                                                                        MultinomialNB())]),
                                                       XGBClassifier(base_score=None,
                                                                     booster=None,
                                                                     callbacks=None,
                                                   

In [68]:
# Metrics
cv_keys = ('mean_test_score', 'std_test_score', 'params')

for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_[cv_keys[0]][r],
             grid.cv_results_[cv_keys[1]][r] / 2.0,
             grid.cv_results_[cv_keys[2]][r]))
    
    
print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)

0.523 +/- 0.00 {'randomforestclassifier__n_estimators': 10}
0.517 +/- 0.01 {'randomforestclassifier__n_estimators': 25}
0.524 +/- 0.00 {'randomforestclassifier__n_estimators': 40}
0.522 +/- 0.00 {'randomforestclassifier__n_estimators': 50}
Best parameters: {'randomforestclassifier__n_estimators': 40}
Accuracy: 0.52


In [69]:
y_pred = grid.predict(X_test)
# Test data accuracy of model with best params
print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
# Track best (highest test accuracy) model
if accuracy_score(y_test, y_pred) > best_acc:
    best_acc = accuracy_score(y_test, y_pred)
    best_gs = gs
    best_clf = idx

Test set accuracy score for best params: 0.538 
