In [27]:
import pandas as pd
import numpy as np
import re
import string

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn style to 'seaborn-v0_8' and the old
# 'seaborn' name was later removed; use whichever name this version provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

from wordcloud import WordCloud

from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer

from sklearn.linear_model import Lasso
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, classification_report, plot_confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline as ImPipeline
from imblearn.over_sampling import SMOTE

# Observations

In [10]:
# Load the IMDB reviews CSV into the working DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
imdb_reviews_df = pd.read_csv('imdb_reviews.csv')

In [16]:
# Summarize column dtypes and non-null counts.
# NOTE(review): the saved output lists an 'Emotion' column, which is only created
# by a later cell -- this cell appears to have been executed out of order.
# Restart the kernel and run all cells top to bottom to refresh the output.
imdb_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     40000 non-null  object
 1   label    40000 non-null  int64 
 2   Emotion  40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


In [11]:
# Preview the first rows of the freshly loaded data
imdb_reviews_df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [12]:
# Add a human-readable 'Emotion' column derived from the numeric 'label' column.
# Series.map with a dict yields NaN for any label value missing from label_dict.
label_dict = {0:'Negative', 1:'Positive'}
imdb_reviews_df['Emotion'] = imdb_reviews_df['label'].map(label_dict)

In [14]:
# Confirm the new 'Emotion' column lines up with 'label'
imdb_reviews_df.head()

Unnamed: 0,text,label,Emotion
0,I grew up (b. 1965) watching and loving the Th...,0,Negative
1,"When I put this movie in my DVD player, and sa...",0,Negative
2,Why do people who do not know what a particula...,0,Negative
3,Even though I have great interest in Biblical ...,0,Negative
4,Im a die hard Dads Army fan and nothing will e...,1,Positive


In [13]:
# Check the class balance between the two sentiment labels
imdb_reviews_df['Emotion'].value_counts()

Negative    20019
Positive    19981
Name: Emotion, dtype: int64

# Text Processing

In [28]:
# URLs that start with a scheme or with "www."; the match stops at whitespace or
# at the start of an HTML tag so text following the URL is preserved.
_URL_RE = re.compile(r"(?:https?://|\bwww\.)[^\s<>]+", flags=re.IGNORECASE)

# Opening, closing and self-closing HTML tags, e.g. <br />, <i>, </i>.
_HTML_TAG_RE = re.compile(r"</?\w+[^<>]*>")

# A token is a run of letters with an optional apostrophe suffix (e.g. "don't").
_TOKEN_PATTERN = "([a-zA-Z]+(?:'[a-z]+)?)"

# Extra words dropped in addition to NLTK's English stopword list.
_EXTRA_STOPWORDS = ("mention", "sxsw", 'link', 'rt', 'quot')

# Lazily filled cache so the tokenizer, lemmatizer and stopword set are built
# once instead of once per review.
_NLP_CACHE = {}


def _get_nlp_resources():
    """Return the shared tokenizer, lemmatizer and stopword set, building them on first use."""
    if not _NLP_CACHE:
        # Build everything first, then publish in one step, so a failure
        # (e.g. a missing NLTK corpus) never leaves a half-filled cache.
        resources = {
            "tokenizer": RegexpTokenizer(_TOKEN_PATTERN),
            "lemmatizer": WordNetLemmatizer(),
            # A set gives O(1) membership tests instead of scanning a list per token.
            "stopwords": frozenset(stopwords.words("english")) | frozenset(_EXTRA_STOPWORDS),
        }
        _NLP_CACHE.update(resources)
    return _NLP_CACHE


def text_processor(text):
    """
    Clean a raw review: strip URLs, HTML tags and periods, lowercase, tokenize,
    drop stopwords and lemmatize the remaining words.

    Args:
        text(string): The raw text to clean. Non-string values are converted
            with str(); None and float NaN are treated as empty text.

    Returns:
        clean_text(string): The lemmatized, stopword-free tokens joined by single spaces.
    """
    # Missing values would otherwise be stringified into the tokens "none"/"nan"
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text)

    # Remove URLs before periods are stripped (URLs contain dots). Only the URL
    # itself is removed, not the remainder of the line.
    text = _URL_RE.sub(" ", text)

    # Replace HTML tags with a space so the words on either side of e.g.
    # "<br /><br />" are not glued into a single token.
    text = _HTML_TAG_RE.sub(" ", text)

    # Drop periods without inserting a space so abbreviations such as "U.S.A." stay one token
    text = text.replace(".", "")

    resources = _get_nlp_resources()
    stop_words = resources["stopwords"]
    lemmatize = resources["lemmatizer"].lemmatize

    # Tokens always start with a letter, so no separate punctuation filter is needed
    text_tokens = resources["tokenizer"].tokenize(text.lower())

    cleaned_text_tokens = [lemmatize(word) for word in text_tokens if word not in stop_words]

    # Combine list into single string
    return " ".join(cleaned_text_tokens)


# Backward-compatible alias for the original, misspelled function name
text_procddessor = text_processor

In [29]:
# Clean every review. This overwrites the raw 'text' column in place, so the
# original text is no longer available afterwards and re-running this cell
# re-processes already-cleaned text; reload the CSV before re-running.
imdb_reviews_df['text'] = imdb_reviews_df['text'].apply(text_processor)

In [30]:
# Inspect the cleaned review text (pandas abbreviates the display of a long Series)
imdb_reviews_df['text']

0        grew b watching loving thunderbird mate school...
1        put movie dvd player sat coke chip expectation...
2        people know particular time past like feel nee...
3        even though great interest biblical movie bore...
4        im die hard dad army fan nothing ever change g...
                               ...                        
39995    western union something forgotten classic west...
39996    movie incredible piece work explores every noo...
39997    wife watched movie plan visit sicily stromboli...
39998    first watched flatliners amazed necessary feat...
39999    would film good gross estimated award nominati...
Name: text, Length: 40000, dtype: object