In [1]:
!pip install pandas



In [2]:
import pandas as pd
df = pd.read_csv('./DataSet/IMDB_Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

## 1. Make the columns into lower case

In [4]:
df['review'][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [5]:
df['review'] = df['review'].str.lower()

In [6]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


## 2. Remove html tags 

In [7]:
import re

def remove_html_tags(text):
    pattern = re.compile(r'<.*?>')
    return pattern.sub('', text)


In [8]:
df['review'] = df['review'].apply(remove_html_tags)

In [9]:
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

## 3. Remove URLs

In [10]:
import re

def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [11]:
df['review'] = df['review'].apply(remove_url)

In [12]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


## 4. Remove Punctuations

In [13]:
import string
import time

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [14]:
exclude = string.punctuation

### One way -> Best for Small DataSet

In [15]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char, '')
    return text

In [16]:
text = 'string. With. Punctuation?'

In [17]:
start = time.time()
print(remove_punc(text))
time1 = time.time() - start
print(time1)

string With Punctuation
4.506111145019531e-05


### Second way -> Best for Big DataSet

In [18]:
def remove_punc1(text):
    return text.translate(str.maketrans('', '', exclude))


In [19]:
start = time.time()
print(remove_punc1(text))
time2 = time.time() - start
print(time1)

string With Punctuation
4.506111145019531e-05


### Apply the same on twitter hatred speech dataset

In [20]:
df1 = pd.read_csv('./DataSet/twitter_hatred_speech.csv')

In [21]:
df1.sample(5)

Unnamed: 0,id,label,tweet
7841,7842,0,suspected people :3
7726,7727,0,#flipclass gorilla simulator: you need to do...
28120,28121,0,"@user whilst you're all waiting, get down to ..."
22057,22058,0,be happy and smile. √∞¬ü¬ò¬ä #smile #alhamdulill...
16951,16952,0,happy bihday donald j. trump √¢¬Ä¬î here are so...


In [22]:
df1['tweet'] = df1['tweet'].apply(remove_punc1)

In [23]:
df1.sample(5)

Unnamed: 0,id,label,tweet
28568,28569,0,such a tune mademesmile
3555,3556,0,how to build a website for dummies 2016 des...
15702,15703,0,user user and for the lovely teachers mini fra...
10414,10415,0,shoe wall curious drinks and lunch time adid...
601,602,0,b u s y love instagood user tbt cute me b...


## 5. Remove any slang(chat word treatment)

In [24]:
chat_words = {}
with open("./DataSet/slang.txt", "r") as f:
    for line in f:
        if "=" in line:  # avoid blank lines
            key, value = line.strip().split("=", 1)
            chat_words[key.strip()] = value.strip()

In [25]:
# Second key-value pair (as a tuple)
print(list(chat_words.items())[1])

# Just the second key
print(list(chat_words.keys())[1])

# Just the second value
print(list(chat_words.values())[1])


('ADIH', 'Another Day In Hell')
ADIH
Another Day In Hell


In [26]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)


In [27]:
chat_conversion('IMHO he is the best')

'In My Honest/Humble Opinion he is the best'

In [28]:
chat_conversion('FYI Delhi is the capital of India')

'For Your Information Delhi is the capital of India'

## 6. Spelling Correction

In [29]:
!pip install textblob



In [30]:
from textblob import TextBlob

In [31]:
incorrect_text = 'ceertain conditioas duriing seveal ggenerations aree moodified in the saame maner.'

textBlb = TextBlob(incorrect_text)

textBlb.correct().string

'certain conditions during several generations are modified in the same manner.'

## 7. Removing Stop Words

In [32]:
!pip install nltk



In [33]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/atharparvezce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
# Now you can use it
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [35]:
def remove_stopwords(text):
    new_text = []

    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [36]:
text1 = "This is a simple example to check if stopwords are removed properly."
print(remove_stopwords(text1))

This   simple example  check  stopwords  removed properly.


## 8. Handling Emojis

In [37]:
import re
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


In [38]:
remove_emoji("Loved the movie. It was üòòüòã")


'Loved the movie. It was '

In [39]:
remove_emoji('Lmao üòÇüòÇ')

'Lmao '

In [40]:
!pip install emoji



In [41]:
import emoji

print(emoji.demojize('Python is üî•'))

Python is :fire:


In [42]:
print(emoji.demojize('Loved the movie. It was üòò'))

Loved the movie. It was :face_blowing_a_kiss:


## 9. Tokenization

### i. Using the split function

In [43]:
# word tokenization

sent1 = 'I am going to delhi'
sent1.split()

['I', 'am', 'going', 'to', 'delhi']

In [44]:
# sentence tokenization

sent2 = 'I am going to delhi. I will stay there for 3 days. Let\'s hope the trip to be great'
sent2.split('.')

['I am going to delhi',
 ' I will stay there for 3 days',
 " Let's hope the trip to be great"]

In [45]:
# Problems with split function


sent3 = 'I am going to delhi!'
sent3.split()

['I', 'am', 'going', 'to', 'delhi!']

In [46]:
# it can only split using one character at a time

sent4 = 'Where do think I should go? I have 3 day holiday'
sent4.split('.') 

['Where do think I should go? I have 3 day holiday']

### ii. Regular Expression

In [47]:
import re

sent3 = 'I am going to delhi!'
tokens = re.findall(r"[\w']+", sent3)
tokens

['I', 'am', 'going', 'to', 'delhi']

In [48]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""

sentences = re.compile('[.!?] ').split(text)
sentences

["Lorem Ipsum is simply dummy text of the printing and typesetting industry?\nLorem Ipsum has been the industry's standard dummy text ever since the 1500s,\nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

### iii. NLTK

In [49]:
import nltk
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/atharparvezce/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/atharparvezce/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [50]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [51]:
sent1 = 'I am going to visit delhi!'
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'visit', 'delhi', '!']

In [52]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""

sent_tokenize(text)

['Lorem Ipsum is simply dummy text of the printing and typesetting industry?',
 "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,\nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

In [53]:
sent5 = 'I have a Ph.D in A.I'
sent6 = "We're here to help! mail us at nks@gmail.com"
sent7 = 'A 5km ride cost $10.50'

In [54]:
word_tokenize(sent5)

['I', 'have', 'a', 'Ph.D', 'in', 'A.I']

In [55]:
word_tokenize(sent6)

['We',
 "'re",
 'here',
 'to',
 'help',
 '!',
 'mail',
 'us',
 'at',
 'nks',
 '@',
 'gmail.com']

In [56]:
word_tokenize(sent7)

['A', '5km', 'ride', 'cost', '$', '10.50']

### iv. spaCy

In [57]:
import spacy
print(spacy.__version__)

3.7.5


In [58]:
# 1) Create a fresh virtual env alongside your notebook
!python -m venv ~/.venvs/spacy37

# 2) Upgrade pip in that env
!~/.venvs/spacy37/bin/python -m pip install -U pip

# 3) Install a compatible set: spaCy 3.7.x + NumPy < 2.0
!~/.venvs/spacy37/bin/pip install "spacy>=3.7.2,<3.8.0" "numpy<2.0"

# 4) Download the matching English model
!~/.venvs/spacy37/bin/python -m spacy download en_core_web_sm

# 5) Register this env as a Jupyter kernel
!~/.venvs/spacy37/bin/python -m ipykernel install --user --name=spacy37 --display-name="Python (spaCy 3.7)"

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.8/12.8 MB[0m [31m986.6 kB/s[0m  [33m0:00:13[0m0:00:01[0m00:01[0m
[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
/Users/atharparvezce/.venvs/spacy37/bin/python: No module named ipykernel


In [59]:
import spacy
print(spacy.__version__)

3.7.5


In [60]:
import spacy
nlp = spacy.load("en_core_web_sm")

sent5 = "I have a Ph.D in A.I"
sent6 = "We're here to help! mail us at nks@gmail.com"
sent7 = "A 5km ride cost $10.50"

for i, doc in enumerate([nlp(sent5), nlp(sent6), nlp(sent7)], start=5):
    print(f"Tokens in sent{i}: {[t.text for t in doc]}")

Tokens in sent5: ['I', 'have', 'a', 'Ph', '.', 'D', 'in', 'A.I']
Tokens in sent6: ['We', "'re", 'here', 'to', 'help', '!', 'mail', 'us', 'at', 'nks@gmail.com']
Tokens in sent7: ['A', '5', 'km', 'ride', 'cost', '$', '10.50']


## 10. Stemming

In [61]:
from nltk.stem.porter import PorterStemmer

In [62]:
ps  =  PorterStemmer()

def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])


In [64]:
sample = "walk walking walked walks"
stem_words(sample)

'walk walk walk walk'

In [72]:
text ="probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressdup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie"

                                                                                                                       
print(text)

probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressdup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie


In [73]:
stem_words(text)

'probabl my alltim favorit movi a stori of selfless sacrific and dedic to a nobl caus but it not preachi or bore it just never get old despit my have seen it some 15 or more time in the last 25 year paul luka perform bring tear to my eye and bett davi in one of her veri few truli sympathet role is a delight the kid are as grandma say more like dressdup midget than children but that onli make them more fun to watch and the mother slow awaken to what happen in the world and under her own roof is believ and startl if i had a dozen thumb theyd all be up for thi movi'

## 11. Lemmatization

In [105]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

In [106]:
wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations = "?:!.,;"

sentence_words = nltk.word_tokenize(sentence)

for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

print("{0:20}{1:20}".format("Word", "Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word, pos='v')))

Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 
