# Import Libraries

In [1]:
import os
import pandas as pd
import re
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_validate, GridSearchCV

# Import files and convert to csv

## Training Data

In [2]:
inpath = "./aclImdb/train/"
outpath = "./"
name="training data"

text = []
rating = []

for filename in os.listdir(inpath+"pos"):
    data = open(inpath+"pos/"+filename,'r', encoding = "ISO-8859-1").read()
    text.append(data)
    rating.append("1")

for filename in os.listdir(inpath+"neg"):
    data = open(inpath+"neg/"+filename,'r', encoding = "ISO-8859-1").read()
    text.append(data)
    rating.append("0")

dataset = list(zip(text,rating))

data_train = pd.DataFrame(data=dataset, columns=['Review',"Rating"])

data_train.to_csv(outpath+name, header=True)

data_train.head()

Unnamed: 0,Review,Rating
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


## Testing Data

In [3]:
inpath = "./aclImdb/test/"
outpath = "./"
name="testing data"

text = []
rating = []

for filename in os.listdir(inpath+"pos"):
    data = open(inpath+"pos/"+filename,'r', encoding = "ISO-8859-1").read()
    text.append(data)
    rating.append("1")

for filename in os.listdir(inpath+"neg"):
    data = open(inpath+"neg/"+filename,'r', encoding = "ISO-8859-1").read()
    text.append(data)
    rating.append("0")

dataset = list(zip(text,rating))

data_test = pd.DataFrame(data=dataset, columns=['Review',"Rating"])

data_test.to_csv(outpath+name, header=True)

data_test.head()

Unnamed: 0,Review,Rating
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


# Data Preprocessing

## Remove HTML format text

In [4]:
def cleanhtml(raw_data):
    cleaner = re.compile('<.*?>')
    clean_text = re.sub(cleaner, '', raw_data)
    return clean_text

data_train["Review_NoHTML"] = data_train["Review"].apply(lambda z:cleanhtml(z))
data_test["Review_NoHTML"] = data_test["Review"].apply(lambda z:cleanhtml(z))

## Remove Punctuation

In [5]:
def remove_punc(text):
    text_nopunc = "".join([char for char in text if char not in string.punctuation])
    return text_nopunc

data_train["Review_NoPunc"] = data_train["Review_NoHTML"].apply(lambda z:remove_punc(z))
data_test["Review_NoPunc"] = data_test["Review_NoHTML"].apply(lambda z:remove_punc(z))

## Word Tokenize

In [6]:
def tokenize(text):
    tokens = re.split("\W+",text)
    return(tokens)

data_train["Review_tokenized"] = data_train["Review_NoPunc"].apply(lambda x:tokenize(x.lower()))
data_test["Review_tokenized"] = data_test["Review_NoPunc"].apply(lambda x:tokenize(x.lower()))

## Remove Stop Words

In [7]:
stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(tokenized_words):
    text = [word for word in tokenized_words if word not in stopwords]
    return text

data_train["Review_NoStop"] = data_train["Review_tokenized"].apply(lambda x:remove_stopwords(x))
data_test["Review_NoStop"] = data_test["Review_tokenized"].apply(lambda x:remove_stopwords(x))

## Stemmatization

In [8]:
ps = nltk.PorterStemmer()
def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return (text)

data_train["Review_Stemmed"] = data_train["Review_NoStop"].apply(lambda x:stemming(x))
data_test["Review_Stemmed"] = data_test["Review_NoStop"].apply(lambda x:stemming(x))

## Lemmatization

In [9]:
wn = nltk.WordNetLemmatizer()
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

data_train["Review_Lemmatized"] = data_train["Review_Stemmed"].apply(lambda x:lemmatizing(x))
data_test["Review_Lemmatized"] = data_test["Review_Stemmed"].apply(lambda x:lemmatizing(x))

# TF-IDF 

In [10]:
#Convert to Array for tfidfVectorizor
data_train['Review_Stemmed']=[" ".join(review) for review in data_train['Review_Stemmed'].values]
data_test['Review_Stemmed']=[" ".join(review) for review in data_test['Review_Stemmed'].values]

#Split Training and Testing Data
X_train = data_train.loc[:, 'Review_Stemmed']
y_train = data_train.loc[:, 'Rating']
X_test = data_test.loc[:, 'Review_Stemmed']
y_test = data_test.loc[:, 'Rating']

In [11]:
vectorizer = TfidfVectorizer()
X_train_idf = vectorizer.fit_transform(X_train)
print("Shape: ", X_train_idf.shape)

X_test_idf = vectorizer.transform(X_test)
print("Shape: ", X_test_idf.shape)

Shape:  (25001, 113484)
Shape:  (25000, 113484)


# Classification Models: Logistic Regression, Random Forest and Gradient Boosting

In [12]:
final_pipeline = Pipeline([
    ('vect', vectorizer),
    ('clf', LogisticRegression())
])
final_pipeline.steps

[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=None, min_df=1,
          ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
          stop_words=None, strip_accents=None, sublinear_tf=False,
          token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None)),
 ('clf',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='warn',
            n_jobs=None, penalty='l2', random_state=None, solver='warn',
            tol=0.0001, verbose=0, warm_start=False))]

In [13]:
clfs = []
clfs.append(LogisticRegression())
clfs.append(RandomForestClassifier())
clfs.append(GradientBoostingClassifier(loss='deviance',warm_start=True))

for classifier in clfs:
    final_pipeline.set_params(clf = classifier)
    scores = cross_validate(final_pipeline, X_train, y_train)
    print('---------------------------------')
    print(str(classifier))
    print('-----------------------------------')
    for key, values in scores.items():
            print(key,' mean ', values.mean())
            print(key,' std ', values.std())



---------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
-----------------------------------
fit_time  mean  2.4235368569691977
fit_time  std  0.09757571344076234
score_time  mean  0.9156997203826904
score_time  std  0.05111293198619878
test_score  mean  0.8508061992746088
test_score  std  0.00489996773907433
train_score  mean  0.9390025230452914
train_score  std  0.0031482003095237828




---------------------------------
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
-----------------------------------
fit_time  mean  5.4339102904001875
fit_time  std  0.18663581086875983
score_time  mean  1.0457820097605388
score_time  std  0.023620345736884395
test_score  mean  0.7295308419008227
test_score  std  0.00047669903814949634
train_score  mean  0.9941202459848091
train_score  std  0.00038865837156440474




---------------------------------
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=True)
-----------------------------------
fit_time  mean  55.53080463409424
fit_time  std  1.5906317322703347
score_time  mean  0.9438680013020834
score_time  std  0.07045452828366189
test_score  mean  0.8051679830572153
test_score  std  0.0034147129528237807
train_score  mean  0.8340266563999174
train_score  std  0.0007063492837913111


# Hyperparameter Tuning using GridSearchCV and Classification Report

## Logistic Regression

In [14]:
final_pipeline.set_params(clf = LogisticRegression())
parameters_lr = {'clf__C':[1, 0.1,0.01], 'clf__penalty':['l1','l2'], 'clf__warm_start':[True,False], 'clf__max_iter':[100,150,200]}
gs_clf_lr = GridSearchCV(final_pipeline, param_grid=parameters_lr, n_jobs=-1,verbose=1)
gs_clf_lr = gs_clf_lr.fit(X_train, y_train)
print("Best Score: ", gs_clf_lr.best_score_)
print("Best Paramters: ", gs_clf_lr.best_params_)

y_pred_lr = gs_clf_lr.predict(X_test)
print(classification_report(y_test,y_pred_lr))

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  5.2min finished


Best Score:  0.8560457581696732
Best Paramters:  {'clf__C': 1, 'clf__max_iter': 100, 'clf__penalty': 'l1', 'clf__warm_start': True}
              precision    recall  f1-score   support

           0       0.88      0.86      0.87     12500
           1       0.86      0.89      0.88     12500

   micro avg       0.87      0.87      0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



## Random Forest Classifier

In [15]:
final_pipeline.set_params(clf = RandomForestClassifier())
parameters_rf = {'clf__criterion':['gini','entropy'], 'clf__n_estimators':[10,20,30], 'clf__warm_start':[True,False]}
gs_clf_rf = GridSearchCV(final_pipeline, param_grid=parameters_rf, n_jobs=-1,verbose=1)
gs_clf_rf = gs_clf_rf.fit(X_train, y_train)
print("Best Score: ", gs_clf_rf.best_score_)
print("\nBest Parameters: ", gs_clf_rf.best_params_)

y_pred_rf = gs_clf_rf.predict(X_test)
print(classification_report(y_test,y_pred_rf))

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  3.6min finished


Best Score:  0.8028878844846206

Best Parameters:  {'clf__criterion': 'entropy', 'clf__n_estimators': 30, 'clf__warm_start': False}
              precision    recall  f1-score   support

           0       0.79      0.84      0.82     12500
           1       0.83      0.78      0.80     12500

   micro avg       0.81      0.81      0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000



## Gradient Boosting Classifier

In [16]:
final_pipeline.set_params(clf = GradientBoostingClassifier())
parameters_gbc = {'clf__n_estimators':[10,20,30], 'clf__learning_rate':[0.1,0.05,1]}
gs_clf_gbc = GridSearchCV(final_pipeline, param_grid=parameters_gbc, n_jobs=-1,verbose=1)
gs_clf_gbc = gs_clf_gbc.fit(X_train, y_train)
print("Best Score: ", gs_clf_gbc.best_score_)
print("\nBest Parameters: ", gs_clf_gbc.best_params_)

y_pred_gbc = gs_clf_gbc.predict(X_test)
print(classification_report(y_test,y_pred_gbc))

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  3.3min finished


Best Score:  0.8088476460941563

Best Parameters:  {'clf__learning_rate': 1, 'clf__n_estimators': 30}
              precision    recall  f1-score   support

           0       0.82      0.80      0.81     12500
           1       0.81      0.83      0.82     12500

   micro avg       0.81      0.81      0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000

