In [1]:
from pathlib import Path
root=Path("data")
root.mkdir(exist_ok=True)
path=Path(root)/"IMDB Dataset.csv"

In [2]:
import pandas as pd
import numpy as np 

In [3]:
df = pd.read_csv(path)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# 1. lower casing

In [4]:
df['review'][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [5]:
df['review'] = df['review'].str.lower()

In [6]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


# 2. remove HTML tags

In [7]:
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [8]:
text = "<html><body><p>from river to the sea Palestine will be Free </p><p> Free palestine </p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"

In [9]:
remove_html_tags(text)

'from river to the sea Palestine will be Free  Free palestine  Click here to download'

In [10]:
df['review'] = df['review'].apply(remove_html_tags)

In [11]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


## 3. remove urls

In [12]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [13]:
text1 = 'Check out my notebook https://www.ziza.com/mybook1/notebook8223fc1abb. i want nothing.'
text2 = 'Check out my notebook http://www.zidr.com/mybook1/notebook8223fc1abb'
text3 = 'Google search here www.google.com'
text4 = 'For notebook click https://www.free.com/mybook1/notebook8223fc1abb to search check www.google.com'

In [14]:
remove_url(text1)

'Check out my notebook  i want nothing.'

In [15]:
df['review'] = df['review'].apply(remove_url)

## 4. remove punctuation

In [16]:
import string,time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
exclude = string.punctuation

In [18]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char,'')
    return text

In [19]:
text = 'string. With. Punctuation?'

In [20]:
start = time.time()
print(remove_punc(text))
time1 = time.time() - start
print(time1*50000)

string With Punctuation
12.242794036865234


In [21]:
# use this because this very fast
def remove_punc1(text):
    return text.translate(str.maketrans('', '', exclude))

In [22]:
start = time.time()
remove_punc1(text)
time2 = time.time() - start
print(time2*50000)

6.008148193359375


In [23]:
time1/time2

2.0376984126984126

In [24]:
df['review'][5]

'probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times in the last 25 years. paul lukas\' performance brings tears to my eyes, and bette davis, in one of her very few truly sympathetic roles, is a delight. the kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. and the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. if i had a dozen thumbs, they\'d all be "up" for this movie.'

In [25]:
remove_punc1(df['review'][5])

'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie'

In [26]:
df['review'] = df['review'].apply(remove_punc1)

## 5. chat(short) words treatment

In [27]:
path2=Path(root)/"short words.csv"

In [28]:
chat_words=pd.read_csv(path2)
chat_words.drop(['Unnamed: 0'], axis=1, inplace=True)
chat_words

Unnamed: 0,acronym,expansion
0,2day,today
1,2m2h,too much too handle
2,2moro,tomorrow
3,2nite,tonight
4,4eae,for ever and ever
...,...,...
3352,yw,you are welcome
3353,ywca,young womens christian association
3354,ywimc,your wish is my command
3355,ywsyls,you win some you lose some


In [29]:
chat_dict = dict(zip(chat_words['acronym'].str.lower(), chat_words['expansion']))

In [30]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.lower() in chat_dict:
            new_text.append(chat_dict[w.lower()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [31]:
chat_conversion('2day is the best')

'today is the best'

In [32]:
chat_conversion('IMHO he is the best')

'in my humble opinion happy ending high explosives is the best'

In [33]:
chat_conversion('FYI delhi is the capital of india')

'for your information delhi is the capital of india'

In [34]:
df['review'] = df['review'].apply(chat_conversion)

## 6. spelling correction

In [35]:
from textblob import TextBlob

In [36]:
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.'
incorrect_text

'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.'

In [37]:
textBlb = TextBlob(incorrect_text)
textBlb.correct().string

'certain conditions during several generations are modified in the same manner.'

In [38]:
def spelling_correction(text):
    textBlb = TextBlob(text)
    return textBlb.correct().string

In [39]:
start = time.time()
res=spelling_correction(df['review'][5])
time2 = time.time() - start
print(time2*50000,' sec')

61429.98933792114  sec


In [40]:
#start = time.time()
#df['review'] = df['review'].apply(spelling_correction)
#time2 = time.time() - start
#print(time2)

## 7. stop words 

#### Stop words are highâ€‘frequency words that serve mainly grammatical or structural purposes rather than conveying significant meaning.

- Articles: a, an, the
- Pronouns: I, you, he, she, it, we, they
- Prepositions: in, on, at, of, to, with
- Conjunctions: and, or, but, if
- Auxiliary verbs: is, was, were, be, have, do


In [41]:
import nltk
from nltk.corpus import stopwords
from pathlib import Path

# Create a data folder using pathlib
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)

# Download stopwords into this folder
nltk.download("stopwords", download_dir=str(data_dir))

# Tell NLTK to look inside your custom folder
nltk.data.path.append(str(data_dir))

# Now you can use stopwords
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
len(stop_words)

198

In [43]:
stop_words[:10]

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']

In [44]:
def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in stop_words:
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [45]:
text='probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times'
text

"probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it's not preachy or boring. it just never gets old, despite my having seen it some 15 or more times"

In [46]:
remove_stopwords(text)

'probably  all-time favorite movie,  story  selflessness, sacrifice  dedication   noble cause,    preachy  boring.   never gets old, despite   seen   15   times'

In [47]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this wait and see a wonderful where ...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the tears in my eyes of...,positive


In [48]:
start = time.time()
df['review'].apply(remove_stopwords)
time2 = time.time() - start
print(time2,' sec')

20.313873291015625  sec


## 8. handling Emoji

In [49]:
# remove or replace

In [50]:
# to remove

In [51]:
import re
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [52]:
remove_emoji("Loved the movie. It was ðŸ˜˜ðŸ˜˜")

'Loved the movie. It was '

In [53]:
remove_emoji("Lmao ðŸ˜‚ðŸ˜‚")

'Lmao '

In [54]:
# to replace

In [55]:
import emoji
print(emoji.demojize('Python is ðŸ”¥'))

Python is :fire:


In [56]:
print(emoji.demojize('Loved the movie. It was ðŸ˜˜'))

Loved the movie. It was :face_blowing_a_kiss:


In [57]:
print(emoji.distinct_emoji_list('Loved the movie. It was ðŸ˜˜'))

['ðŸ˜˜']


## 9. tokenization

#### 1. Using the split function

In [58]:
# word tokenization
sent1 = 'I am going to delhi'
sent1.split()

['I', 'am', 'going', 'to', 'delhi']

In [59]:
# sentence tokenization
sent2 = 'I am going to delhi. I will stay there for 3 days. Let\'s hope the trip to be great'
sent2.split('.')

['I am going to delhi',
 ' I will stay there for 3 days',
 " Let's hope the trip to be great"]

In [60]:
# Problems with split function
sent3 = 'I am going to delhi!'
sent3.split()

['I', 'am', 'going', 'to', 'delhi!']

#### 2. Using Regular Expression

In [61]:
import re
sent3 = 'I am going to delhi!'
tokens = re.findall("[\w']+", sent3)
tokens

['I', 'am', 'going', 'to', 'delhi']

In [62]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry? 
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
text

"Lorem Ipsum is simply dummy text of the printing and typesetting industry? \nLorem Ipsum has been the industry's standard dummy text ever since the 1500s, \nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."

In [63]:
sentences = re.split(r'[.!?][\n]', text)
sentences

["Lorem Ipsum is simply dummy text of the printing and typesetting industry? \nLorem Ipsum has been the industry's standard dummy text ever since the 1500s, \nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

#### 3. Using NLTK

In [64]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [65]:
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)

# Download punkt into this folder
nltk.download("punkt", download_dir=str(data_dir))
nltk.download("punkt_tab", download_dir=str(data_dir))

# Tell NLTK to look inside your custom folder
nltk.data.path.append(str(data_dir))

[nltk_data] Downloading package punkt to D:\data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to D:\data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [66]:
sent1 = 'I am going to visit delhi!'
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'visit', 'delhi', '!']

In [67]:
text

"Lorem Ipsum is simply dummy text of the printing and typesetting industry? \nLorem Ipsum has been the industry's standard dummy text ever since the 1500s, \nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."

In [68]:
sent_tokenize(text)

['Lorem Ipsum is simply dummy text of the printing and typesetting industry?',
 "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, \nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

In [69]:
sent5 = 'I have a Ph.D in A.I'
sent6 = "We're here to help! mail us at nks@gmail.com"
sent7 = 'A 5km ride cost $10.50'

word_tokenize(sent5)

['I', 'have', 'a', 'Ph.D', 'in', 'A.I']

In [70]:
word_tokenize(sent6)

['We',
 "'re",
 'here',
 'to',
 'help',
 '!',
 'mail',
 'us',
 'at',
 'nks',
 '@',
 'gmail.com']

In [71]:
word_tokenize(sent7)

['A', '5km', 'ride', 'cost', '$', '10.50']

#### 4. Using Spacy

In [72]:
#!python -m spacy download en_core_web_sm

In [73]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [74]:
doc1 = nlp(sent5)
print(doc1)
doc2 = nlp(sent6)
print(doc2)
doc3 = nlp(sent7)
print(doc3)
doc4 = nlp(sent1)
print(doc4)

I have a Ph.D in A.I
We're here to help! mail us at nks@gmail.com
A 5km ride cost $10.50
I am going to visit delhi!


In [75]:
for token in doc2:
    print(token)

We
're
here
to
help
!
mail
us
at
nks@gmail.com


In [76]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry? 
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""

In [77]:
docs = nlp(text)
list(docs.sents)

[Lorem Ipsum is simply dummy text of the printing and typesetting industry? ,
 Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
 when an unknown printer took a galley of type and scrambled it to make a type specimen book.]

In [78]:
for token in list(docs.sents):
    print(token)

Lorem Ipsum is simply dummy text of the printing and typesetting industry? 

Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
when an unknown printer took a galley of type and scrambled it to make a type specimen book.


#### 10. Stemming or Lemmatization

`Inflection` is the process by which words change their form to indicate grammatical features such as tense, number, case, gender, mood, or person.

- Example: walk â†’ walked (past tense)
- Example: child â†’ children (plural form)
- Example: he sit â†’ he sits (third person singular verb form)


#### Stemming: cuts words down to their root form by chopping off prefixes/suffixes, often producing nonâ€‘dictionary words.

In [79]:
from nltk.stem.porter import PorterStemmer

In [80]:
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [81]:
sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

In [82]:
text = 'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie'
print(text)

probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie


In [83]:
stem_words(text)

'probabl my alltim favorit movi a stori of selfless sacrific and dedic to a nobl caus but it not preachi or bore it just never get old despit my have seen it some 15 or more time in the last 25 year paul luka perform bring tear to my eye and bett davi in one of her veri few truli sympathet role is a delight the kid are as grandma say more like dressedup midget than children but that onli make them more fun to watch and the mother slow awaken to what happen in the world and under her own roof is believ and startl if i had a dozen thumb theyd all be up for thi movi'

#### Lemmatization: reduces words to their meaningful dictionary base form (lemma), using linguistic rules and vocabulary.

In [84]:
import nltk
from pathlib import Path

# Define your custom data folder on D: drive
data_dir = Path("D:/data")
data_dir.mkdir(exist_ok=True)

# Download WordNet into this folder
nltk.download("wordnet", download_dir=str(data_dir))
nltk.download("omw-1.4", download_dir=str(data_dir))  # optional, improves lemmatization
nltk.download("punkt", download_dir=str(data_dir))    # for tokenization

# Tell NLTK to look inside your custom folder
nltk.data.path.append(str(data_dir))

[nltk_data] Downloading package wordnet to D:\data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to D:\data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to D:\data...
[nltk_data]   Package punkt is already up-to-date!


In [85]:
# Test: use WordNetLemmatizer
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
print(wordnet_lemmatizer.lemmatize("walked", pos="v"))

walk


In [86]:
def remove_punc1(text):
    return text.translate(str.maketrans('', '', exclude))

In [87]:
sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations="?:!.,;"
sentence=remove_punc1(sentence)
sentence_words = nltk.word_tokenize(sentence)
sentence_words
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word,pos='v')))

Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 
