# Import necessary dependencies

In [2]:
# Import required Libraries
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata

import nltk
nltk.download('stopwords')

nlp = spacy.load('en', parse = False, tag=False, entity=False)
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Load train Data set
df = pd.read_csv("/content/sentiment.csv", error_bad_lines=False, sep='\t')
#df.head()

df.shape

(25000, 3)

In [4]:
# load test data
df_ts = pd.read_csv("/content/sentiment_tst.csv", error_bad_lines=False, sep='\t')

# Cleaning Text - strip HTML

In [5]:
# function to remove html code in the text rwas data
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")#.get_text()
    stripped_text = soup.get_text()
    return stripped_text

# Removing accented characters

In [6]:
# Function bring the text to normal string format
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

# Expanding Contractions

In [7]:
# Function text data contain word like don't, does'nt, so convert them to do not, does not
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

# Removing Special Characters

In [8]:
# Function remove special character other then alphabet and number
def remove_special_characters(text):
    text = re.sub('[^a-zA-z0-9\s]', '', text)
    return text

# Lemmatizing text

In [9]:
# Function bring the pural, abjective word to root form.
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

# Removing Stopwords

In [10]:
# Function to remove stopword using NLTK Libraries
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

# Normalize text corpus - tying it all together

In [11]:
# Combining all above function in above function in one and carry the Text cleaning data
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions    
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # insert spaces between special characters to isolate them    
        special_char_pattern = re.compile(r'([{.(-)!}])')
        doc = special_char_pattern.sub(" \\1 ", doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters    
        if special_char_removal:
            doc = remove_special_characters(doc)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus


# Model predictions of movie review

In [12]:
# Import Scikit Learn Libraries for model prediction.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.metrics import roc_curve, auc 

from sklearn import metrics

In [13]:
# Run the Cleaning process function of text data in train data set
df['cleaned_re'] = normalize_corpus(df['review'])

In [14]:
# Run the Cleaning process function of text data in test data set
df_ts['cleaned_re'] = normalize_corpus(df_ts['review'])

In [15]:
df_ts.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment,cleaned_re
0,0,The Violent Men is a good western. Perhaps the...,positive,violent man good western perhaps story not ori...
1,1,QUESTION: How does a film merit two different ...,negative,question film merit two different title like l...
2,2,The title pretty much lets you know what you'r...,negative,title pretty much let know get grade c howler ...
3,3,This could have been the best game ever!! But ...,negative,could good game ever game maker screw 3 assass...
4,4,Oliver Stone is not one to shy away from a mov...,positive,oliver stone not one shy away movie theme matt...


In [16]:
# take a peek at the data
reviews = np.array(df['cleaned_re'])
sentiments = np.array(df['sentiment'])

reviews_ts = np.array(df_ts['cleaned_re'])
sentiments_ts = np.array(df_ts['sentiment'])

# build train and test datasets
norm_train_reviews = reviews
train_sentiments = sentiments
norm_test_reviews = reviews_ts
test_sentiments = sentiments_ts

# normalize datasets
#norm_train_reviews = tn.normalize_corpus(train_reviews)
#norm_test_reviews = tn.normalize_corpus(test_reviews)

In [17]:
# build BOW features on train reviews
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(norm_train_reviews)

# build TFIDF features on train reviews
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2),
                     sublinear_tf=True)
tv_train_features = tv.fit_transform(norm_train_reviews)

In [18]:
# transform test reviews into features
cv_test_features = cv.transform(norm_test_reviews)
tv_test_features = tv.transform(norm_test_reviews)

In [19]:
def train_predict_model(classifier, 
                        train_features, train_labels, 
                        test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features) 
    return predictions 

In [20]:
# Run the metrics function to display the performance of predictive modeling 
def display_classification_report(true_labels, predicted_labels, classes=[1,0]):

    report = metrics.classification_report(y_true=true_labels, 
                                           y_pred=predicted_labels, 
                                           labels=classes) 
    print(report)

In [21]:
# Run the metrics function to display the performance of predictive modeling
def display_confusion_matrix(true_labels, predicted_labels, classes=[1,0]):
    
    total_classes = len(classes)
    level_labels = [total_classes*[0], list(range(total_classes))]

    cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels, 
                                  labels=classes)
    cm_frame = pd.DataFrame(data=cm, 
                            columns=pd.MultiIndex(levels=[['Predicted:'], classes], 
                                                  codes=level_labels), 
                            index=pd.MultiIndex(levels=[['Actual:'], classes], 
                                                codes=level_labels)) 
    print(cm_frame) 

In [22]:
# Run the metrics function to display the performance of predictive modeling 
def display_model_performance_metrics(true_labels, predicted_labels, classes=[1,0]):
    print('Model Performance metrics:')
    print('-'*30)
    get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
    print('\nModel Classification report:')
    print('-'*30)
    display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels, 
                                  classes=classes)
    print('\nPrediction Confusion Matrix:')
    print('-'*30)
    display_confusion_matrix(true_labels=true_labels, predicted_labels=predicted_labels, 
                             classes=classes)

In [23]:
# Run the metrics function to display the performance of predictive modeling 
def get_metrics(true_labels, predicted_labels):
    
    print('Accuracy:', np.round(
                        metrics.accuracy_score(true_labels, 
                                               predicted_labels),
                        4))
    print('Precision:', np.round(
                        metrics.precision_score(true_labels, 
                                               predicted_labels,
                                               average='weighted'),
                        4))
    print('Recall:', np.round(
                        metrics.recall_score(true_labels, 
                                               predicted_labels,
                                               average='weighted'),
                        4))
    print('F1 Score:', np.round(
                        metrics.f1_score(true_labels, 
                                               predicted_labels,
                                               average='weighted'),
                        4))

In [24]:
print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

BOW model:> Train features shape: (25000, 1627917)  Test features shape: (25000, 1627917)
TFIDF model:> Train features shape: (25000, 1627917)  Test features shape: (25000, 1627917)


### Model Training, Prediction and Performance Evaluation

In [25]:
# Predictive modeling
from sklearn.linear_model import SGDClassifier, LogisticRegression

lr = LogisticRegression(penalty='l2', max_iter=100, C=1)
svm = SGDClassifier(loss='hinge', max_iter=100)

In [27]:
# Logistic Regression model on BOW features
# Please Note : the module meu is not been provided. 
lr_bow_predictions = train_predict_model(classifier=lr, 
                                             train_features=cv_train_features, train_labels=train_sentiments,
                                             test_features=cv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=lr_bow_predictions,
                                      classes=['positive', 'negative'])

# THE BELOW O/P SHOULD GIVE YOU A FAIR IDEA ON WHAT :
# methods like 
# train_predict_model() are doing and printing as o/p.
# display_model_performance_metrics() are doing and printing as o/p.

# As an Intern you are not suppose to produce the exact o/p 
# You may only code the minimum required metrics which helps you to 
# compare the different ML models.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model Performance metrics:
------------------------------
Accuracy: 0.8887
Precision: 0.8887
Recall: 0.8887
F1 Score: 0.8887

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.89      0.89      0.89     12500
    negative       0.89      0.89      0.89     12500

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive      11109     1391
        negative       1391    11109


In [28]:
# Logistic Regression model on TF-IDF features
# Please Note : the module meu is not been provided.
lr_tfidf_predictions = train_predict_model(classifier=lr, 
                                               train_features=tv_train_features, train_labels=train_sentiments,
                                               test_features=tv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=lr_tfidf_predictions,
                                      classes=['positive', 'negative'])

# THE BELOW O/P SHOULD GIVE YOU A FAIR IDEA ON WHAT :
# methods like 
# train_predict_model() are doing and printing as o/p.
# display_model_performance_metrics() are doing and printing as o/p.

# As an Intern you are not suppose to produce the exact o/p 
# You may only code the minimum required metrics which helps you to 
# compare the different ML models.

Model Performance metrics:
------------------------------
Accuracy: 0.8856
Precision: 0.8857
Recall: 0.8856
F1 Score: 0.8856

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.88      0.89      0.89     12500
    negative       0.89      0.88      0.89     12500

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive      11118     1382
        negative       1477    11023


In [29]:
# SVM model on BOW features
# Please Note : the module meu is not been provided.
svm_bow_predictions = train_predict_model(classifier=svm, 
                                             train_features=cv_train_features, train_labels=train_sentiments,
                                             test_features=cv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments, 
                                      predicted_labels=svm_bow_predictions,
                                      classes=['positive', 'negative'])

# THE BELOW O/P SHOULD GIVE YOU A FAIR IDEA ON WHAT :
# methods like 
# train_predict_model() are doing and printing as o/p.
# display_model_performance_metrics() are doing and printing as o/p.

# As an Intern you are not suppose to produce the exact o/p 
# You may only code the minimum required metrics which helps you to 
# compare the different ML models.

Model Performance metrics:
------------------------------
Accuracy: 0.8786
Precision: 0.8787
Recall: 0.8786
F1 Score: 0.8785

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.89      0.87      0.88     12500
    negative       0.87      0.89      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive      10863     1637
        negative       1399    11101


In [30]:
# SVM model on TF-IDF features
# Please Note : the module meu is not been provided.
svm_tfidf_predictions = train_predict_model(classifier=svm, 
                                                train_features=tv_train_features, train_labels=train_sentiments,
                                                test_features=tv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments, 
                                      predicted_labels=svm_tfidf_predictions,
                                      classes=['positive', 'negative'])

# THE BELOW O/P SHOULD GIVE YOU A FAIR IDEA ON WHAT :
# methods like 
# train_predict_model() are doing and printing as o/p.
# display_model_performance_metrics() are doing and printing as o/p.

# As an Intern you are not suppose to produce the exact o/p 
# You may only code the minimum required metrics which helps you to 
# compare the different ML models.

Model Performance metrics:
------------------------------
Accuracy: 0.8927
Precision: 0.8928
Recall: 0.8927
F1 Score: 0.8927

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.89      0.90      0.89     12500
    negative       0.90      0.88      0.89     12500

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive      11269     1231
        negative       1452    11048


In [32]:
! pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/20/37/bc4e0ddc30c07a96482abf1de7ed1ca54e59bba2026a33bca6d2ef286e5b/catboost-0.24.4-cp36-none-manylinux1_x86_64.whl (65.7MB)
[K     |████████████████████████████████| 65.8MB 69kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.4


In [33]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [40]:
randomclassifier=RandomForestClassifier(n_estimators=300,criterion='entropy')

In [41]:
# Random Forest model on TF-IDF features
# Please Note : the module meu is not been provided.
svm_tfidf_predictions = train_predict_model(classifier=randomclassifier, 
                                                train_features=tv_train_features, train_labels=train_sentiments,
                                                test_features=tv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments, 
                                      predicted_labels=svm_tfidf_predictions,
                                      classes=['positive', 'negative'])

# THE BELOW O/P SHOULD GIVE YOU A FAIR IDEA ON WHAT :
# methods like 
# train_predict_model() are doing and printing as o/p.
# display_model_performance_metrics() are doing and printing as o/p.

# As an Intern you are not suppose to produce the exact o/p 
# You may only code the minimum required metrics which helps you to 
# compare the different ML models.

Model Performance metrics:
------------------------------
Accuracy: 0.8674
Precision: 0.8679
Recall: 0.8674
F1 Score: 0.8674

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.88      0.85      0.87     12500
    negative       0.86      0.88      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive      10630     1870
        negative       1444    11056


In [42]:
# Randome Forest model on BOW features
# Please Note : the module meu is not been provided.
random_bow_predictions = train_predict_model(classifier=randomclassifier, 
                                             train_features=cv_train_features, train_labels=train_sentiments,
                                             test_features=cv_test_features, test_labels=test_sentiments)
display_model_performance_metrics(true_labels=test_sentiments, 
                                      predicted_labels=random_bow_predictions,
                                      classes=['positive', 'negative'])

# THE BELOW O/P SHOULD GIVE YOU A FAIR IDEA ON WHAT :
# methods like 
# train_predict_model() are doing and printing as o/p.
# display_model_performance_metrics() are doing and printing as o/p.

# As an Intern you are not suppose to produce the exact o/p 
# You may only code the minimum required metrics which helps you to 
# compare the different ML models.

Model Performance metrics:
------------------------------
Accuracy: 0.8699
Precision: 0.87
Recall: 0.8699
F1 Score: 0.8699

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.86      0.88      0.87     12500
    negative       0.88      0.86      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive      10985     1515
        negative       1737    10763
