In [4]:
# import the required libraries
import pandas as pd # for data manipulation

import spacy

import numpy as np # for numerical operations

import seaborn as sns # for data visualization

import matplotlib.pyplot as plt # for data visualization

import re # for regular expressions

import string, time

from textblob import TextBlob # for text processing and sentiment analysis

from nltk.corpus import stopwords # for removing stopwords  
import nltk # for text processing

import demoji # for removing emojis


In [5]:
# read in  the data from the csv file
df = pd.read_csv('../data/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
# get the shape of the data
df.shape

(50000, 2)

In [9]:
# print the head of the data for the first 100 rows
df.head(100)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
95,Daniel Day-Lewis is the most versatile actor a...,positive
96,My guess would be this was originally going to...,negative
97,"Well, I like to watch bad horror B-Movies, cau...",negative
98,"This IS the worst movie I have ever seen, as w...",negative


### Lower Case
for the uniformity of each words it best to make all the reviews as lower case.


In [10]:
# get the one of the reviews, 
df['review'][3] # the third review

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [11]:
# converting the review to lower case
df['review'] = df['review'].str.lower() # convert the review to lower case

In [12]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [13]:
df['review'][3] # the third review

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

### Remove the html_tags
The reviews contains some html which will not be needed

In [14]:
# defining a function to remove the html tags
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text) # sub any pattern that matches the regular expression with an empty string

In [15]:
# playing with the remove_html_tags function
text = "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"
remove_html_tags(text)

' Movie 1 Actor - Aamir Khan Click here to download'

In [16]:
# pass the remove_html_tags to the apply function and call it on the review column
df['review'] = df['review'].apply(remove_html_tags)

In [17]:
# check the review column to see if the changes has been effected
df['review'][5] # the third review  

'probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times in the last 25 years. paul lukas\' performance brings tears to my eyes, and bette davis, in one of her very few truly sympathetic roles, is a delight. the kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. and the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. if i had a dozen thumbs, they\'d all be "up" for this movie.'

In [18]:
df['review'][:10] # the first 10 reviews

0    one of the other reviewers has mentioned that ...
1    a wonderful little production. the filming tec...
2    i thought this was a wonderful way to spend ti...
3    basically there's a family where a little boy ...
4    petter mattei's "love in the time of money" is...
5    probably my all-time favorite movie, a story o...
6    i sure would like to see a resurrection of a u...
7    this show was an amazing, fresh & innovative i...
8    encouraged by the positive comments about this...
9    if you like original gut wrenching laughter yo...
Name: review, dtype: object

### Removing URL
Another data cleaning aspect to look at on the text text is removing the url in the text.

In [19]:
# create a function to remove the special characters
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text) # sub any pattern that matches the regular expression with an empty string    

In [20]:
# experiment the remove_url function
text1 = 'Check out my youtube https://www.youtube.com/dswithbappy dswithbappy'
text2 = 'Check out my linkedin https://www.linkedin.com/in/boktiarahmed73/'
text3 = 'Google search here www.google.com'
text4 = 'For data click https://www.kaggle.com/'

# store the above texts in a list
texts = [text1, text2, text3, text4]

# loop through the texts and apply the remove_url function  
for text in texts:
    print(remove_url(text))


Check out my youtube  dswithbappy
Check out my linkedin 
Google search here 
For data click 


### Handling Puntuation
Puntuation are not neccessary and are reduncdant hence it should be removed

In [21]:
# working with string module
print(string.punctuation) # contain all punctuations i.e special characters
print(string.whitespace) # contain all white spaces
print(string.ascii_letters) # contain all alphabets

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
 	

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ


In [22]:
help(string)


Help on module string:

NAME
    string - A collection of string constants.

MODULE REFERENCE
    https://docs.python.org/3.11/library/string.html
    
    The following documentation is automatically generated from the Python
    source files.  It may be incomplete, incorrect or include features that
    are considered implementation detail and may vary between Python
    implementations.  When in doubt, consult the module reference at the
    location listed above.

DESCRIPTION
    Public module variables:
    
    whitespace -- a string containing all ASCII whitespace
    ascii_lowercase -- a string containing all ASCII lowercase letters
    ascii_uppercase -- a string containing all ASCII uppercase letters
    ascii_letters -- a string containing all ASCII letters
    digits -- a string containing all ASCII decimal digits
    hexdigits -- a string containing all ASCII hexadecimal digits
    octdigits -- a string containing all ASCII octal digits
    punctuation -- a string containi

In [23]:
# the code get the upper case letters in the in the variable name
name = 'Boktiar Ahmed'
for c in name:
    if c in string.ascii_uppercase:
        print(c)

B
A


In [24]:
# remove the puntuations
exclude = string.punctuation
exclude = set(exclude)
exclude

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~'}

In [25]:
# define a function to remove the punctuations
# remove the punctuations and return the text
def remove_punc(text):
    return ''.join(ch for ch in text if ch not in exclude)

In [26]:
# experiment with remove_punctuation function
text1 = 'Hello! How are you?'
text2 = 'string. With. Punctuation?'

# store the above texts in a list
texts = [text1, text2]

# loop through the texts and apply the remove_punctuation function
for text in texts:
    print(remove_punc(text))

Hello How are you
string With Punctuation


In [27]:
letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
print("".join(letters)) # join the letters in the list

abcdefghijklmnopqrstuvwxyz


In [28]:
help(str.maketrans)

Help on built-in function maketrans:

maketrans(...)
    Return a translation table usable for str.translate().
    
    If there is only one argument, it must be a dictionary mapping Unicode
    ordinals (integers) or characters to Unicode ordinals, strings or None.
    Character keys will be then converted to ordinals.
    If there are two arguments, they must be strings of equal length, and
    in the resulting dictionary, each character in x will be mapped to the
    character at the same position in y. If there is a third argument, it
    must be a string, whose characters will be mapped to None in the result.



In [29]:
# checking the time
start = time.time()
print(remove_punc(text))
time1 = time.time() - start
print(time1*50000)

string With Punctuation
50.0798225402832


In [30]:
def remove_punc1(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [31]:
start = time.time()
print(remove_punc1(text))
time2 = time.time() - start
print(time2*50000)

string With Punctuation
0.0


In [33]:
df['review'][5] # for the fifth review

'probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times in the last 25 years. paul lukas\' performance brings tears to my eyes, and bette davis, in one of her very few truly sympathetic roles, is a delight. the kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. and the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. if i had a dozen thumbs, they\'d all be "up" for this movie.'

In [36]:
# removing the punctuations from the review from the fifth review
type(df['review'][5]) # can't use apply functon on a string

str

In [37]:
remove_punc1(df['review'][5])

'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie'

### Handling Chat conversion
frequently use words for chatting should be handle such curtails and abridge words can be recognize.

In [38]:
chat_words = {
    'AFAIK': 'As Far As I Know',
    'AFK': 'Away From Keyboard',
    'ASAP': 'As Soon As Possible',
    'ATK': 'At The Keyboard',
    'ATM': 'At The Moment',
    'A3': 'Anytime, Anywhere, Anyplace',
    'BAK': 'Back At Keyboard',
    'BBL': 'Be Back Later',
    'BBS': 'Be Back Soon',
    'BFN': 'Bye For Now',
    'B4N': 'Bye For Now',
    'BRB': 'Be Right Back',
    'BRT': 'Be Right There',
    'BTW': 'By The Way',
    'B4': 'Before',
    'FYI': 'For Your Information',
    'GAL': 'Get A Life',
    'GG': 'Good Game',
    'GN': 'Good Night',
    'GMTA': 'Great Minds Think Alike',
    'GR8': 'Great!',
    'G9': 'Genius',
    'IC': 'I See',
    'ICQ': 'I Seek you',
    'ILU': 'I Love You',
    'IMHO': 'In My Honest Opinion',
    'IMO': 'In My Opinion',
    'IOW': 'In Other Words',
    'IRL': 'In Real Life',
    'KISS': 'Keep It Simple, Stupid',
    'LDR': 'Long Distance Relationship',
    'LMAO': 'Laugh My A.. Off',
    'LOL': 'Laughing Out Loud',
    'LTNS': 'Long Time No See',
    'L8R': 'Later',
    'MTE': 'My Thoughts Exactly',
    'M8': 'Mate',
    'NRN': 'No Reply Necessary',
    'RSVP': 'Please Reply',
    'POV': 'Point Of View',
    'ICYMI': 'In Case You Missed It',
    'TMI': 'Too Much Information',
    'FYA': 'For Your Attention',
    'FAQ': 'Frequently Asked Questions',
    'IDK': "I Don't Know",
    'GTG': 'Got To Go',
    'SMH': 'Shaking My Head',
    'TYT': 'Take Your Time',
    'WFM': 'Works For Me',
    'WTH': 'What The Heck',
    

    
}

In [39]:
# create a function to convert the chat words to their full meaning 
def chat_to_full(text):
    for key, value in chat_words.items(): # loop through the chat_words dictionary using the items method
        text = text.replace(key, value) 

In [40]:
def chat_conversion(text):
    new_text = []
    for word in text.split(): # split the text into words and store in a list
        if word.upper() in chat_words:
            new_text.append(chat_words[word.upper()])
        else:
            new_text.append(word)
    return " ".join(new_text)

In [41]:
# example
print(chat_conversion("IDK what you are talking about"))
print(chat_conversion("LOL, I am not sure if I can make it to the party"))
print(chat_conversion("I and my friend are in a LDR"))
print(chat_conversion("I am not sure if I can make it to the party yhy"))

I Don't Know what you are talking about
LOL, I am not sure if I can make it to the party
I and my friend are in a Long Distance Relationship
I am not sure if I can make it to the party yhy


### Handling Incorrect Text
It is possible the corpus contain incorrect text and there is a need for them  to be corrected


In [42]:
incorrect_words = 'Ceertain conditionas duriing several ggenerations aree moodified in the saame maner'
textblb = TextBlob(incorrect_words)
textblb.correct().string


'Certain conditions during several generations are modified in the same manner'

### Removing Stops Words
Stop word are pronuns and articles, that don't give any semantic meaning and occurs frequently. They carry less value

In [43]:
nltk.download('stopwords') # download the stopwords from nltk   

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AbiolaLawani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
stopwords.words('english') # get the list of stopwords  

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [45]:
# get the len of the stopwords
len(stopwords.words('english'))

179

In [46]:
# define a function to remove the stopwords
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stopwords.words('english')])  

In [47]:
# more efficient way to remove the stopwords as it removes stopword wether it is in lower or upper case
def remove_stopwords1(text):
    new_text = []
    text = text.lower() # convert the text to lower case
    
    for word in text.split():
        if word not in stopwords.words('english'):
            new_text.append(word)
    return " ".join(new_text)

In [48]:
remove_stopwords1('probably My all-time favorite movie, A story Of selflessness, sacrifice and dedication TO A noble cause, BUT it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times')

'probably all-time favorite movie, story selflessness, sacrifice dedication noble cause, preachy boring. never gets old, despite seen 15 times'

In [49]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [50]:
df['review'][:100].apply(remove_stopwords1)

0     one reviewers mentioned watching 1 oz episode ...
1     wonderful little production. filming technique...
2     thought wonderful way spend time hot summer we...
3     basically there's family little boy (jake) thi...
4     petter mattei's "love time money" visually stu...
                            ...                        
95    daniel day-lewis versatile actor alive. englis...
96    guess would originally going least two parts, ...
97    well, like watch bad horror b-movies, cause th...
98    worst movie ever seen, well as, worst probably...
99    mario fan long remember, fond memories playing...
Name: review, Length: 100, dtype: object

### Removing emoji handle
It is possible that the dataset contains emoji for reaction and we might want to remove them.

In [51]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [52]:
remove_emoji("Loved the movie. It was 😘😘")

'Loved the movie. It was '

In [53]:
text = "Loved the movie. It was 😘😘"
text = demoji.replace_with_desc(text)
text




'Loved the movie. It was :face blowing a kiss::face blowing a kiss:'

In [54]:
remove_emoji("Lmao 😂😂")

'Lmao '

In [55]:
demoji.replace_with_desc("Lmao 😂😂")

'Lmao :face with tears of joy::face with tears of joy:'

### Tokenization
1. word tokenization
* word tokenization is the process of splitting a large sample of text into words 
* word tokenization is the first step in text analytics and natural language processing tasks
* One of the way to achieve word tokenization is the use of the python split function, to split the sentence into words into a list



In [189]:
# word tokenization using split method
# split method is used to split a string into an array of substrings
sent1 = "I am learning NLP, it's very interesting"  
sent1.split() # split by space


['I', 'am', 'learning', 'NLP,', "it's", 'very', 'interesting']

In [187]:

sent2 = "I am going to delhi. I will stay there for 3 days. Let's hope the trip to be great"
sent2.split('.') # split by period

['I am going to delhi',
 ' I will stay there for 3 days',
 " Let's hope the trip to be great"]

2. Regular Expression

In [190]:
sent3 = "I am going to delhi"
tokens = re.findall("[\w']+", sent3) # find all the words in the sentence   
tokens

['I', 'am', 'going', 'to', 'delhi']

In [195]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""

sentences = re.split(r' *[\.\?!][\'"\)\]]* *', text) # split the text into sentences
sentences

['Lorem Ipsum is simply dummy text of the printing and typesetting industry',
 "\nLorem Ipsum has been the industry's standard dummy text ever since the 1500s,\nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book",
 '']

In [192]:
sentences

['text']

In [196]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
sentences = re.compile('[.!?] ').split(text)
sentences

["Lorem Ipsum is simply dummy text of the printing and typesetting industry?\nLorem Ipsum has been the industry's standard dummy text ever since the 1500s,\nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

NLTK libraries for tokenization

In [1]:
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt') # download the punkt tokenizer   


NameError: name 'nltk' is not defined