# Import necessary dependencies

In [None]:
# Import required Libraries
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata

import nltk
# Fetch the NLTK stop-word corpus that `stopword_list` is built from below.
nltk.download('stopwords')

# Shared spaCy pipeline used by lemmatize_text().
# NOTE(review): the 'en' shortcut and the parse/tag/entity keywords look like
# legacy spaCy arguments -- confirm they are honoured by the installed version.
nlp = spacy.load('en', parse = False, tag=False, entity=False)
# Shared tokenizer used by remove_stopwords().
tokenizer = ToktokTokenizer()
# English stop words; 'no' and 'not' are taken out of the list so that
# remove_stopwords() keeps these negations in the text.
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the training split from a tab-separated file; error_bad_lines=False
# asks pandas to skip malformed rows instead of raising.
# NOTE(review): error_bad_lines is deprecated in newer pandas releases in
# favour of on_bad_lines='skip' -- confirm against the installed version.
df_tr = pd.read_csv("/content/sentiment.csv", error_bad_lines=False, sep='\t')

# Show the (rows, columns) shape of the training frame.
df_tr.shape

(25000, 3)

In [None]:
# Load the test split (tab-separated), skipping malformed rows in the same way
# as the training split.
df_ts = pd.read_csv("/content/sentiment_tst.csv", error_bad_lines=False, sep='\t')

In [None]:
# Stack the train and test frames into a single frame with a fresh 0..n-1
# index so that the data can be re-split by row position.
df1 = pd.concat([df_tr, df_ts]).reset_index(drop=True)
df1.shape

(50000, 3)

In [None]:
# Re-split the combined data by row position: the first TRAIN_SIZE rows are
# used for training and the remainder for testing.
TRAIN_SIZE = 35000

# .copy() turns each slice into an independent frame. Without it, adding the
# 'cleaned_re' column to these frames later triggers pandas'
# SettingWithCopyWarning ("A value is trying to be set on a copy of a slice").
df = df1[:TRAIN_SIZE].copy()
df_ts = df1[TRAIN_SIZE:].copy()
print(df.shape)
print(df_ts.shape)

(35000, 3)
(15000, 3)


# Cleaning Text - strip HTML

In [None]:
# Function to remove HTML markup from the raw text data
def strip_html_tags(text):
    """Return ``text`` with its HTML markup removed.

    The input is parsed with BeautifulSoup using the "html.parser" backend and
    only the text content of the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

# Removing accented characters

In [None]:
# Function to convert accented characters to their plain ASCII equivalents
def remove_accented_chars(text):
    """Return ``text`` reduced to ASCII, with accents stripped from letters.

    The string is decomposed with Unicode NFKD normalisation so that an
    accented letter becomes a base letter followed by combining marks; every
    code point that has no ASCII encoding is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# Expanding Contractions

In [None]:
# Function to expand contractions such as don't and doesn't into do not and does not
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in ``text`` and drop any remaining apostrophes.

    Args:
        text: The string to process.
        contraction_mapping: Mapping from a contraction to its expansion.
            Matching is case-insensitive.

    Returns:
        ``text`` with every known contraction replaced by its expansion. The
        first character of each expansion is taken from the matched text so
        that its capitalisation is preserved. All apostrophes left over after
        expansion are removed.
    """
    if contraction_mapping:
        # Regex alternation accepts the first alternative that matches, so the
        # longest keys must come first; otherwise a short key would match the
        # prefix of a longer contraction and leave its tail unexpanded.
        ordered_keys = sorted(contraction_mapping, key=len, reverse=True)
        # Keys are literal text, so escape them before building the pattern.
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
            flags=re.IGNORECASE | re.DOTALL)
        # Case-insensitive fallback for matches whose exact spelling is not a key.
        lowercase_mapping = {key.lower(): value
                             for key, value in contraction_mapping.items()}

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or lowercase_mapping.get(match.lower()))
            if not expanded_contraction:
                # No usable expansion: leave the matched text untouched.
                return match
            return match[0] + expanded_contraction[1:]

        text = contractions_pattern.sub(expand_match, text)

    return re.sub("'", "", text)

# Removing Special Characters

In [None]:
# Function to remove special characters other than letters and digits
def remove_special_characters(text):
    """Remove every character that is not an ASCII letter, a digit or whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all other characters deleted (not replaced by spaces).
    """
    # The upper-case range has to be 'A-Z': an 'A-z' range would also span the
    # ASCII punctuation that sits between 'Z' and 'a' ([ \ ] ^ _ `) and leave
    # those characters in the text. The raw string keeps '\s' from being read
    # as a string escape.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

# Lemmatizing text

In [None]:
# Function to reduce inflected words (e.g. plurals) to their root form (lemma)
def lemmatize_text(text):
    """Return ``text`` with each token replaced by its lemma.

    The text is run through the module-level ``nlp`` pipeline. A token whose
    lemma is the placeholder '-PRON-' keeps its original text. The resulting
    tokens are joined with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)

# Removing Stopwords

In [None]:
# Function to remove stopword using NLTK Libraries
def remove_stopwords(text, is_lower_case=False):
    """Return ``text`` without the tokens listed in ``stopword_list``.

    Args:
        text: The string to filter. It is split with the module-level
            ``tokenizer``.
        is_lower_case: When True, tokens are compared with the stop-word list
            as they are; otherwise a lower-cased copy of each token is used
            for the comparison.

    Returns:
        The surviving tokens (in their original casing) joined by single
        spaces.
    """
    kept_tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        lookup_form = token if is_lower_case else token.lower()
        if lookup_form not in stopword_list:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Normalize text corpus - tying it all together

In [None]:
# Combine all of the functions above into a single text-cleaning pipeline
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True):
    """Run the text-cleaning steps over every document in ``corpus``.

    The steps run in this order; each flagged step can be switched off:
    HTML stripping, accent removal, contraction expansion, lower-casing,
    newline replacement (always), punctuation isolation (always),
    lemmatization, special-character removal, whitespace collapsing (always)
    and stop-word removal.

    Args:
        corpus: Iterable of document strings.
        html_stripping: Apply strip_html_tags().
        contraction_expansion: Apply expand_contractions().
        accented_char_removal: Apply remove_accented_chars().
        text_lower_case: Lower-case each document; also forwarded to
            remove_stopwords() as ``is_lower_case``.
        text_lemmatization: Apply lemmatize_text().
        special_char_removal: Apply remove_special_characters().
        stopword_removal: Apply remove_stopwords().

    Returns:
        A list with one normalized string per input document, in input order.
    """
    # Compiled once per call instead of once per document.
    # Only carriage returns and line feeds count as line breaks; a '|' inside
    # the character class would be matched as a literal pipe.
    newline_pattern = re.compile(r'[\r\n]+')
    # The hyphen is escaped so that it is matched literally; written as '(-)'
    # inside a character class it would form the range '(' .. ')' and the
    # hyphen itself would never be isolated.
    special_char_pattern = re.compile(r'([{.()\-!}])')
    extra_space_pattern = re.compile(' +')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # replace each run of line breaks with a single space
        doc = newline_pattern.sub(' ', doc)
        # pad selected punctuation with spaces so that it is tokenized separately
        doc = special_char_pattern.sub(r" \1 ", doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters
        if special_char_removal:
            doc = remove_special_characters(doc)
        # collapse runs of spaces
        doc = extra_space_pattern.sub(' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus


# Model predictions of movie review

In [None]:
# Import Scikit Learn Libraries for model prediction.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.metrics import roc_curve, auc

from sklearn import metrics

In [None]:
# Clean the raw training reviews with normalize_corpus() and store the result
# in a new 'cleaned_re' column.
df['cleaned_re'] = normalize_corpus(df['review'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Clean the raw test reviews with normalize_corpus() and store the result in a
# new 'cleaned_re' column.
df_ts['cleaned_re'] = normalize_corpus(df_ts['review'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Preview the first rows of the test frame to compare 'review' with 'cleaned_re'.
df_ts.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment,cleaned_re
35000,10000,Worthless movie. A complete waste of time and ...,negative,worthless movie complete waste time nothing ex...
35001,10001,This crock of doodoo won a award? They must ha...,negative,crock doodoo win award must desperate give awa...
35002,10002,A traveling couple (Horton and Hamilton)stumbl...,negative,travel couple horton hamilton stumble onto tow...
35003,10003,The scientist Charles and his wife (or assista...,negative,scientist charles wife assistant marissa recei...
35004,10004,Comparisons to the original series are inevita...,negative,comparison original series inevitable shame di...


In [None]:
# Extract the cleaned review text and the sentiment labels of the training
# frame as NumPy arrays.
reviews = np.array(df['cleaned_re'])
sentiments = np.array(df['sentiment'])

# Same for the test frame.
reviews_ts = np.array(df_ts['cleaned_re'])
sentiments_ts = np.array(df_ts['sentiment'])

# Aliases used by the feature-extraction and modelling cells.
norm_train_reviews = reviews
train_sentiments = sentiments
norm_test_reviews = reviews_ts
test_sentiments = sentiments_ts

# No further normalization is applied here: the 'cleaned_re' columns already
# hold the output of normalize_corpus().

In [None]:
# Build bag-of-words count features over unigrams and bigrams. The vocabulary
# is learned from the training reviews only; min_df=0.0 / max_df=1.0 apply no
# document-frequency pruning.
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(norm_train_reviews)

# Build TF-IDF features over the same n-gram range, again fitted on the
# training reviews only; sublinear_tf=True applies logarithmic scaling to the
# term frequencies.
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2),
                     sublinear_tf=True)
tv_train_features = tv.fit_transform(norm_train_reviews)

In [None]:
# Turn the test reviews into features using the vectorizers fitted on the
# training reviews (transform only, so no vocabulary is learned from test data).
cv_test_features = cv.transform(norm_test_reviews)
tv_test_features = tv.transform(norm_test_reviews)

In [None]:
def train_predict_model(classifier,
                        train_features, train_labels,
                        test_features, test_labels):
    """Fit ``classifier`` on the training data and predict the test data.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        train_features: Feature matrix passed to ``fit``.
        train_labels: Labels passed to ``fit``.
        test_features: Feature matrix passed to ``predict``.
        test_labels: Accepted for call-site symmetry; not used here.

    Returns:
        Whatever ``classifier.predict(test_features)`` returns.
    """
    classifier.fit(train_features, train_labels)
    return classifier.predict(test_features)

In [None]:
# Print the per-class classification report (precision, recall, F1, support)
def display_classification_report(true_labels, predicted_labels, classes=[1,0]):
    """Print the scikit-learn classification report for the given labels.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Labels predicted by a model.
        classes: Labels to include in the report, in display order.
    """
    print(metrics.classification_report(y_true=true_labels,
                                        y_pred=predicted_labels,
                                        labels=classes))

In [None]:
# Print the confusion matrix with labelled Actual/Predicted axes
def display_confusion_matrix(true_labels, predicted_labels, classes=[1,0]):
    """Print the confusion matrix as a DataFrame with two-level axis labels.

    Rows are labelled 'Actual:' and columns 'Predicted:', each followed by the
    class labels in the order given by ``classes``.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Labels predicted by a model.
        classes: Labels to include, in row/column order.
    """
    class_count = len(classes)
    # Codes for a two-level index: the outer level always points at its single
    # caption (code 0); the inner level enumerates the classes in order.
    axis_codes = [[0] * class_count, list(range(class_count))]

    matrix = metrics.confusion_matrix(y_true=true_labels,
                                      y_pred=predicted_labels,
                                      labels=classes)
    predicted_axis = pd.MultiIndex(levels=[['Predicted:'], classes],
                                   codes=axis_codes)
    actual_axis = pd.MultiIndex(levels=[['Actual:'], classes],
                                codes=axis_codes)
    print(pd.DataFrame(data=matrix, columns=predicted_axis, index=actual_axis))

In [None]:
# Print the overall metrics, the classification report and the confusion matrix for a model
def display_model_performance_metrics(true_labels, predicted_labels, classes=[1,0]):
    """Print a three-part evaluation summary for one set of predictions.

    The parts are, in order: the overall metrics from get_metrics(), the
    classification report and the confusion matrix. Each part is introduced by
    a heading and a 30-character dashed rule.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Labels predicted by a model.
        classes: Labels passed on to the report and the confusion matrix.
    """
    rule = '-'*30
    sections = (
        ('Model Performance metrics:',
         lambda: get_metrics(true_labels=true_labels,
                             predicted_labels=predicted_labels)),
        ('\nModel Classification report:',
         lambda: display_classification_report(true_labels=true_labels,
                                               predicted_labels=predicted_labels,
                                               classes=classes)),
        ('\nPrediction Confusion Matrix:',
         lambda: display_confusion_matrix(true_labels=true_labels,
                                          predicted_labels=predicted_labels,
                                          classes=classes)),
    )
    for heading, show_section in sections:
        print(heading)
        print(rule)
        show_section()

In [None]:
# Print accuracy and weighted precision, recall and F1 score
def get_metrics(true_labels, predicted_labels):
    """Print accuracy and weighted precision, recall and F1, rounded to 4 places.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Labels predicted by a model.
    """
    weighted = {'average': 'weighted'}
    scoreboard = (
        ('Accuracy:', metrics.accuracy_score, {}),
        ('Precision:', metrics.precision_score, weighted),
        ('Recall:', metrics.recall_score, weighted),
        ('F1 Score:', metrics.f1_score, weighted),
    )
    for caption, scorer, options in scoreboard:
        score = scorer(true_labels, predicted_labels, **options)
        print(caption, np.round(score, 4))

In [None]:
# Sanity check: for each feature type the train and test matrices must have the
# same number of columns, because both come from the same fitted vectorizer.
print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

BOW model:> Train features shape: (35000, 2122607)  Test features shape: (15000, 2122607)
TFIDF model:> Train features shape: (35000, 2122607)  Test features shape: (15000, 2122607)


### Model Training, Prediction and Performance Evaluation

In [None]:
# Predictive modeling
from sklearn.linear_model import SGDClassifier, LogisticRegression

# L2-regularised logistic regression. With max_iter=100 the solver stopped at
# its iteration limit on the bag-of-words features ("TOTAL NO. of ITERATIONS
# REACHED LIMIT"), so the budget is raised to let it converge.
lr = LogisticRegression(penalty='l2', max_iter=1000, C=1)
# Linear SVM trained with stochastic gradient descent (hinge loss). A fixed
# random_state makes the reported scores reproducible across runs.
# NOTE: the name `svm` is rebound to the sklearn.svm module by a later
# `from sklearn import svm` cell, so this estimator is only reachable before
# that cell runs.
svm = SGDClassifier(loss='hinge', max_iter=100, random_state=42)

In [None]:
# Logistic Regression model on BOW features
# (the training and reporting helpers are the functions defined in this notebook)
lr_bow_predictions = train_predict_model(
    classifier=lr,
    train_features=cv_train_features,
    train_labels=train_sentiments,
    test_features=cv_test_features,
    test_labels=test_sentiments,
)
display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=lr_bow_predictions,
    classes=['positive', 'negative'],
)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model Performance metrics:
------------------------------
Accuracy: 0.9003
Precision: 0.9004
Recall: 0.9003
F1 Score: 0.9003

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.89      0.91      0.90      7467
    negative       0.91      0.89      0.90      7533

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       6791      676
        negative        820     6713


In [None]:
# Logistic Regression model on TF-IDF features
# (the training and reporting helpers are the functions defined in this notebook)
lr_tfidf_predictions = train_predict_model(
    classifier=lr,
    train_features=tv_train_features,
    train_labels=train_sentiments,
    test_features=tv_test_features,
    test_labels=test_sentiments,
)
display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=lr_tfidf_predictions,
    classes=['positive', 'negative'],
)

Model Performance metrics:
------------------------------
Accuracy: 0.894
Precision: 0.8942
Recall: 0.894
F1 Score: 0.894

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.89      0.90      0.89      7467
    negative       0.90      0.88      0.89      7533

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       6747      720
        negative        870     6663


In [None]:
# Linear SVM (SGDClassifier with hinge loss) on BOW features
# (the training and reporting helpers are the functions defined in this notebook)
svm_bow_predictions = train_predict_model(
    classifier=svm,
    train_features=cv_train_features,
    train_labels=train_sentiments,
    test_features=cv_test_features,
    test_labels=test_sentiments,
)
display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=svm_bow_predictions,
    classes=['positive', 'negative'],
)

Model Performance metrics:
------------------------------
Accuracy: 0.8923
Precision: 0.8924
Recall: 0.8923
F1 Score: 0.8923

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.89      0.90      0.89      7467
    negative       0.90      0.88      0.89      7533

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       6720      747
        negative        869     6664


In [None]:
# Linear SVM (SGDClassifier with hinge loss) on TF-IDF features
# (the training and reporting helpers are the functions defined in this notebook)
svm_tfidf_predictions = train_predict_model(
    classifier=svm,
    train_features=tv_train_features,
    train_labels=train_sentiments,
    test_features=tv_test_features,
    test_labels=test_sentiments,
)
display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=svm_tfidf_predictions,
    classes=['positive', 'negative'],
)

Model Performance metrics:
------------------------------
Accuracy: 0.8979
Precision: 0.8983
Recall: 0.8979
F1 Score: 0.8978

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.88      0.91      0.90      7467
    negative       0.91      0.88      0.90      7533

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       6822      645
        negative        887     6646


In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Random forest of 300 entropy-criterion trees. random_state fixes the
# bootstrap samples and feature draws so that the reported scores are
# reproducible; n_jobs=-1 builds the trees on all available cores and does not
# affect the fitted model.
randomclassifier = RandomForestClassifier(n_estimators=300, criterion='entropy',
                                          n_jobs=-1, random_state=42)

In [None]:
# Random Forest model on TF-IDF features
# The predictions get their own name so that they do not overwrite the SVM
# results held in svm_tfidf_predictions.
rf_tfidf_predictions = train_predict_model(classifier=randomclassifier,
                                           train_features=tv_train_features, train_labels=train_sentiments,
                                           test_features=tv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments,
                                  predicted_labels=rf_tfidf_predictions,
                                  classes=['positive', 'negative'])

Model Performance metrics:
------------------------------
Accuracy: 0.8765
Precision: 0.8766
Recall: 0.8765
F1 Score: 0.8765

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.88      0.87      0.88      7467
    negative       0.87      0.88      0.88      7533

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       6489      978
        negative        874     6659


In [None]:
# Random Forest model on BOW features
# (the training and reporting helpers are the functions defined in this notebook)
random_bow_predictions = train_predict_model(
    classifier=randomclassifier,
    train_features=cv_train_features,
    train_labels=train_sentiments,
    test_features=cv_test_features,
    test_labels=test_sentiments,
)
display_model_performance_metrics(
    true_labels=test_sentiments,
    predicted_labels=random_bow_predictions,
    classes=['positive', 'negative'],
)

Model Performance metrics:
------------------------------
Accuracy: 0.8767
Precision: 0.877
Recall: 0.8767
F1 Score: 0.8767

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.87      0.89      0.88      7467
    negative       0.89      0.86      0.88      7533

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       6643      824
        negative       1026     6507


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB

In [None]:
# Single decision tree. max_features='sqrt' considers a random subset of
# sqrt(n_features) candidates at each split; it is the explicit spelling of the
# deprecated 'auto' alias for classifiers and selects the same behaviour.
clf_tree = DecisionTreeClassifier(max_features='sqrt', random_state=0)
# Support-vector classifier with default settings. `svm` here is the
# sklearn.svm module imported in the preceding cell.
clf_svm = svm.SVC()
# Gaussian naive Bayes; this estimator needs dense (non-sparse) input.
clf_gnb = GaussianNB()

In [None]:
# Decision Tree model on TF-IDF features
# The predictions get their own name so that they do not overwrite the SVM
# results held in svm_tfidf_predictions.
tree_tfidf_predictions = train_predict_model(classifier=clf_tree,
                                             train_features=tv_train_features, train_labels=train_sentiments,
                                             test_features=tv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments,
                                  predicted_labels=tree_tfidf_predictions,
                                  classes=['positive', 'negative'])

Model Performance metrics:
------------------------------
Accuracy: 0.6334
Precision: 0.6334
Recall: 0.6334
F1 Score: 0.6334

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.63      0.63      0.63      7467
    negative       0.64      0.63      0.63      7533

    accuracy                           0.63     15000
   macro avg       0.63      0.63      0.63     15000
weighted avg       0.63      0.63      0.63     15000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       4735     2732
        negative       2767     4766


In [None]:
# Support-vector classifier (SVC, default settings) on TF-IDF features
# The predictions get their own name so that they do not overwrite the linear
# SVM results held in svm_tfidf_predictions.
svc_tfidf_predictions = train_predict_model(classifier=clf_svm,
                                            train_features=tv_train_features, train_labels=train_sentiments,
                                            test_features=tv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments,
                                  predicted_labels=svc_tfidf_predictions,
                                  classes=['positive', 'negative'])

Model Performance metrics:
------------------------------
Accuracy: 0.9016
Precision: 0.9018
Recall: 0.9016
F1 Score: 0.9016

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.89      0.91      0.90      7467
    negative       0.91      0.89      0.90      7533

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       6817      650
        negative        826     6707


In [None]:
# Naive Bayes model on TF-IDF features
# GaussianNB needs dense input, and converting the TF-IDF matrices with
# .toarray() is not feasible here: the training matrix alone has
# 35,000 x 2,122,607 entries (roughly 594 GB as float64). The multinomial
# variant accepts sparse matrices and non-negative TF-IDF weights, so it is
# trained on the sparse features directly.
from sklearn.naive_bayes import MultinomialNB

clf_mnb = MultinomialNB()
nb_tfidf_predictions = train_predict_model(classifier=clf_mnb,
                                           train_features=tv_train_features, train_labels=train_sentiments,
                                           test_features=tv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments,
                                  predicted_labels=nb_tfidf_predictions,
                                  classes=['positive', 'negative'])

NameError: ignored

In [None]:
# Support-vector classifier with a sigmoid kernel.
# `svm` must refer to the sklearn.svm module here (bound by the
# `from sklearn import svm` cell), not to the SGDClassifier instance that was
# given the same name earlier in the notebook.
clf_svm_p = svm.SVC(kernel='sigmoid', C=1, random_state=42)


NameError: ignored

In [None]:
# Sigmoid-kernel SVC model on TF-IDF features
# The predictions get their own name so that they do not overwrite the linear
# SVM results held in svm_tfidf_predictions.
svc_sigmoid_tfidf_predictions = train_predict_model(classifier=clf_svm_p,
                                                    train_features=tv_train_features, train_labels=train_sentiments,
                                                    test_features=tv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments,
                                  predicted_labels=svc_sigmoid_tfidf_predictions,
                                  classes=['positive', 'negative'])

In [None]:
from sklearn.pipeline import make_pipeline
# Chain the logistic-regression estimator `lr` with an SVC as the final step.
# NOTE(review): make_pipeline expects every step except the last to be a
# transformer (fit + transform), and `lr` is a classifier -- confirm this
# construction works as intended; a voting or stacking ensemble may be what
# was meant.
clf_pipe = make_pipeline(lr, svm.SVC(gamma='auto'))

In [None]:
# Combined model (clf_pipe) on TF-IDF features
# The predictions get their own name so that they do not overwrite the linear
# SVM results held in svm_tfidf_predictions.
pipe_tfidf_predictions = train_predict_model(classifier=clf_pipe,
                                             train_features=tv_train_features, train_labels=train_sentiments,
                                             test_features=tv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments,
                                  predicted_labels=pipe_tfidf_predictions,
                                  classes=['positive', 'negative'])