In [101]:
import numpy as np
import pandas as pd

## Lowercasing

In [102]:
df = pd.read_csv('IMDB Dataset.csv')

In [103]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [104]:
df.sample(5)

Unnamed: 0,review,sentiment
40208,"I don't care what some of the reviews said, th...",positive
3021,"A real let down, the novel is such a brilliant...",negative
32373,I must have seen a different version than the ...,positive
24749,With a cast of stalwart British character acto...,negative
30073,It is important not to be insulted by lack of ...,negative


In [105]:
df.shape

(50000, 2)

### Converting string to lowercase

In [106]:
df['review'][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

### Converting all the strings into lowercase

In [107]:
df['review'] = df['review'].str.lower()

In [108]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


## Remove HTML Tags

##### Removing the HTML tags using Regular Expressions

In [109]:
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'',text)

In [110]:
print(remove_html_tags('a wonderful little production. <br /><br />the.'))

a wonderful little production. the.


In [111]:
df['review'] = df['review'].apply(remove_html_tags)

In [112]:
df.sample(5)

Unnamed: 0,review,sentiment
25352,bad bad bad....this is another stupid movie. s...,negative
41684,"scarecrows is one of those films that, with a ...",negative
16623,i thought this would be a fun comedy with no s...,negative
40269,this movie was a littttle confusing at first. ...,negative
49575,really bad. why anyone thinks this is a good f...,negative


## Remove urls

In [113]:
urltext = 'The URL IS https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html'

In [114]:
def remove_urls(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'',text)

In [115]:
print(remove_urls(urltext))

The URL IS 


## Remove Punctuation

In [116]:
import string, time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [117]:
exclude = string.punctuation

In [118]:
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [119]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char,'')
    return text

In [120]:
text1 = 'Hi! How are you? I am fine!!!!!!'

In [121]:
start = time.time()
remove_punc(text1)
time1 = time.time()- start

In [122]:
print(time1)

3.0994415283203125e-05


In [123]:
## Takes to much time

In [124]:
def remove_punc1(text):
    return text.translate(str.maketrans('','',exclude))
    

In [125]:
start = time.time()
remove_punc1(text1)
time2 = time.time()- start

In [126]:
time2

2.7894973754882812e-05

In [127]:
time1/time2

1.1111111111111112

## Chat word treatment

In [128]:
slang_dict = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "ILU: I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laughter",
    "TFW": "That Feeling When",
    "MFW": "My Face When",
    "MRW": "My Reaction When",
    "IFYP": "I Feel Your Pain",
    "LOL": "Laughing Out Loud",
    "TNTL": "Trying Not To Laugh",
    "JK": "Just Kidding",
    "IDC": "I Don’t Care",
    "ILY": "I Love You",
    "IMU": "I Miss You",
    "ADIH": "Another Day In Hell",
    "IDC": "I Don’t Care",
    "ZZZ": "Sleeping, Bored, Tired",
    "WYWH": "Wish You Were Here",
    "TIME": "Tears In My Eyes",
    "BAE": "Before Anyone Else",
    "FIMH": "Forever In My Heart",
    "BSAAW": "Big Smile And A Wink",
    "BWL": "Bursting With Laughter",
    "LMAO": "Laughing My A** Off",
    "BFF": "Best Friends Forever",
    "CSL": "Can’t Stop Laughing"
}


In [129]:
def chat_converstation(text):
    new_text = []
    for w in text.split():
        if w.upper() in slang_dict:
            new_text.append(slang_dict[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [130]:
chat_converstation('lol he is the best')

'Laughing Out Loud he is the best'

## Spelling Correction

In [131]:
from textblob import TextBlob

ModuleNotFoundError: No module named 'textblob'

In [None]:
!pip install textblob

In [132]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [133]:
def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [134]:
remove_stopwords('Today me and my cat went out for a walk.')

'Today    cat went    walk.'

In [135]:
df['review'].apply(remove_stopwords)

0        one    reviewers  mentioned   watching  1 oz e...
1         wonderful little production.  filming techniq...
2         thought    wonderful way  spend time    hot s...
3        basically there's  family   little boy (jake) ...
4        petter mattei's "love   time  money"   visuall...
                               ...                        
49995     thought  movie    right good job.    creative...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997       catholic taught  parochial elementary schoo...
49998    i'm going    disagree   previous comment  side...
49999     one expects  star trek movies   high art,   f...
Name: review, Length: 50000, dtype: object

# Lemmatization

In [139]:
import nltk

# Download necessary NLTK resources
nltk.download('wordnet')
nltk.download('punkt')

from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at the same time. He has a bad habit of swimming after playing long hours in the Sun."
punctuations = "?:!.,;"
sentence_words = nltk.word_tokenize(sentence)

# Remove punctuation
sentence_words = [word for word in sentence_words if word not in punctuations]

print("{0:20}{1:20}".format("Word", "Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word, pos='v')))


Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
the                 the                 
same                same                
time                time                
He                  He                  
has                 have                
a                   a                   
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 


[nltk_data] Downloading package wordnet to /Users/bhuvanm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/bhuvanm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [137]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /Users/bhuvanm/nltk_data...


True