# **Step 1: Importing Libraries**

**Pandas** 

Open source data analysis and manipulation tool.

**NumPy** 

High performance numerical computation library inspired by MatLab and Native C.

In [8]:
# Data manipulation
import pandas as pd
import numpy as np
import re
import string

**NLTK (Natural Language Toolkit)** 

A set of libraries and programs used for symbolic and statistical natural language processing of English.

**Scikit-learn** 

Machine Learning library that has various classification, regression and cluster algorithms.

In [9]:
# Importing libraries
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [26]:
# Machine learning
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# **Step 2: Stopwords**

**Stopwords**

A set of commonly used words in a language (determiners, coordinate conjunctions, prepositions. etc).

These word are filtered out before or after processing natural language data. This allows us to focus on the important words.

Example: There is **an** umbrella.

Popular Stopword Lists:

1. [Terrier Stopword List](https://github.com/kavgan/stop-words/blob/master/terrier-stop.txt)

2. [Snowball Stopword List](http://snowball.tartarus.org/algorithms/english/stop.txt)

3. [Stopword Lists for 19 Languages](https://www.kaggle.com/rtatman/stopword-lists-for-19-languages)




In [12]:
# Stopwords (using NLTK for list)
stopword_list = set(stopwords.words('english'))



# **Step 3: Loading Dataset**

[**IMDB Dataset of 50K Movie Reviews**](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) 

Columns:


1.   Review (String)
2.   Sentiment (String)

**load_dataset**

Function that reads dataset to build a dataframe.

Parameters:

*   filepath: path to the dataset (CVS file).
*   cols: columns of the dataset.

Return:
*   df: dataframe from dataset.

In [13]:
def load_dataset(filepath, cols):
    df = pd.read_csv(filepath, encoding = 'latin-1')
    df.columns = cols 
    return df

# **Step 4: Preprocessing the Textual Data**



 **Casing**

 Converting the textual data to either all upper or lower case.

 **Noise Removal** 

 **Tokenization**

 **Stopword Removal**

 **Text Normalization**
  *   Stemming
  *   Lemmatization





In [14]:
# Function to do all the preprocesssing
def preprocess_review(review):

    # Casing
    review = review.lower()

    # Remove URLs
    review = re.sub(r"http\S+|www\S+|https\S+", "", review, flags = re.MULTILINE)

    # Removing punctuation and other symbols
    review = review.translate(str.maketrans("", "", string.punctuation))
    review = re.sub(r'\@\w+|\#', "", review)

    # Remove stopwords
    review_tokens = word_tokenize(review)
    filtered_words = [word for word in review_tokens if word not in stopword_list]

    # Stemming
    ps = PorterStemmer()
    stemmed_words = [ps.stem(w) for w in filtered_words]

    # Lemmatizing
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, pos = 'a') for w in stemmed_words]

    return " ".join(lemma_words)


cleaned_text = lambda x: preprocess_review(x)


preprocess_review("Hello, this is a Test Line #@ ...")

'hello test line'

# **Step 5: Vectorization of Textual Data**

---



In [15]:
# Function to vectorize our data
def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf = True)
    vector.fit(train_fit)
    return vector

# **Step 6: Uploading the Dataset**

In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
import io
dataset = pd.read_csv('IMDB Dataset.csv')
dataset.shape

(50000, 2)

In [18]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [19]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [20]:
dataset['cleaned_review'] = pd.DataFrame(dataset.review.apply(cleaned_text))

In [21]:
dataset.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one review mention watch 1 oz episod youll hoo...
1,A wonderful little production. <br /><br />The...,positive,wonder littl product br br film techniqu unass...
2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,negative,basic there famili littl boy jake think there ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...


# **The Model**

In [22]:
review = dataset.cleaned_review
sentiment = dataset.sentiment

IV_train, IV_test, DV_train, DV_test = train_test_split(review, sentiment, test_size = 0.1, random_state = 5)

print('IV_train :', len(IV_train))
print('IV_test :', len(IV_test))
print('DV_train :', len(DV_train))
print('DV_test :', len(DV_test))

IV_train : 45000
IV_test : 5000
DV_train : 45000
DV_test : 5000


In [24]:
tvec = TfidfVectorizer()
clf2 = LogisticRegression(solver = 'lbfgs')

model = Pipeline([('vectorizer', tvec), ('classifier', clf2)])
model.fit(IV_train, DV_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, inter

In [25]:
predictions = model.predict(IV_test)
confusion_matrix(predictions, DV_test)

array([[2155,  245],
       [ 289, 2311]])

In [27]:
print("Accuracy : ", accuracy_score(predictions, DV_test))
print("Percision : ", precision_score(predictions, DV_test, average = 'weighted'))
print("Recall : ", recall_score(predictions, DV_test, average = 'weighted'))

Accuracy :  0.8932
Percision :  0.8933970837207278
Recall :  0.8932


# **Testing**

In [30]:
example = ["i am very unhappy after watching this"]
result = model.predict(example)

print(result)

['negative']
