In [46]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns


In [47]:
df=pd.read_csv("IMDB Dataset.csv")

In [48]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [49]:
df.shape

(50000, 2)

# Lower Case


In [50]:
df['review']=df['review'].str.lower()

In [51]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


# Removing of Html tags

In [52]:
# function for removing html tags
import re
def remove_html_tags(text):
    pattern=re.compile('<.*?>')
    return pattern.sub(r'',text)   


In [53]:
text = '<p>This is a <b>bold</b> paragraph with <a href="https://example.com">a link</a> and an <i>italicized</i> word.</p>\
<p>Another paragraph with <u>underlined text</u> and a <img src="image.jpg" alt="image">.</p>\
<div class="container">\
  <h1>Heading 1</h1>\
  <p>This is a paragraph inside a <code>div</code> container.</p>\
</div>'


In [54]:
text=remove_html_tags(text)

In [55]:
text

'This is a bold paragraph with a link and an italicized word.Another paragraph with underlined text and a .  Heading 1  This is a paragraph inside a div container.'

In [56]:
df['review']=df['review'].apply(remove_html_tags)

In [57]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

# Remove URL

In [58]:
def remove_url(text):
    pattern=re.compile('https?://(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:/[^ \n]*)?'
    )
    return pattern.sub(r'',text)
    

  pattern=re.compile('https?://(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:/[^ \n]*)?'


In [59]:
text='''Check out these amazing websites:

1. Visit our official site at https://www.example.com for more details.
2. For tech news, head over to http://techblog.org/latest-updates.
3. If you're interested in programming tutorials, https://www.learnpython.org is a great resource.
4. Follow us on social media: https://twitter.com/ExampleOfficial and https://www.instagram.com/example_official/.
5. You can also check out the online store at http://shop.example.com/products.

Don't forget to check these out for updates and new content!
'''

In [60]:
text=remove_url(text)

In [61]:
text

"Check out these amazing websites:\n\n1. Visit our official site at  for more details.\n2. For tech news, head over to \n3. If you're interested in programming tutorials,  is a great resource.\n4. Follow us on social media:  and \n5. You can also check out the online store at .com/products.\n\nDon't forget to check these out for updates and new content!\n"

In [62]:
df['review']=df['review'].apply(remove_url)

In [63]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

# Punctuation Handling

In [64]:
import string
exclude=string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [65]:
def remove_punc(text):
    for char in exclude:
        text=text.replace(char,'')
    return text

In [66]:
text='''Hello, world! How are you today? I hope everything is going well... Are you ready to learn Python? Let's dive in!

Here's a list:
1. First item.
2. Second item: Introduction to Python.
3. Third item — advanced concepts.

Don't forget to check the website at https://www.example.com!

What do you think? Let's discuss it later.
'''

In [67]:
text=remove_punc(text)

In [68]:
text

'Hello world How are you today I hope everything is going well Are you ready to learn Python Lets dive in\n\nHeres a list\n1 First item\n2 Second item Introduction to Python\n3 Third item — advanced concepts\n\nDont forget to check the website at httpswwwexamplecom\n\nWhat do you think Lets discuss it later\n'

### time taken for the remove_punc function to run

In [69]:
import time
start=time.time()
print(remove_punc(text))
time1=time.time()-start
print(time1*50000)


Hello world How are you today I hope everything is going well Are you ready to learn Python Lets dive in

Heres a list
1 First item
2 Second item Introduction to Python
3 Third item — advanced concepts

Dont forget to check the website at httpswwwexamplecom

What do you think Lets discuss it later

0.0


### another function for removing punctuation

In [70]:
def remove_punc1(text):
    return text.translate(str.maketrans('','',exclude))

In [71]:
import time
start=time.time()
print(remove_punc1(text))
time1=time.time()-start
print(time1*50000)


Hello world How are you today I hope everything is going well Are you ready to learn Python Lets dive in

Heres a list
1 First item
2 Second item Introduction to Python
3 Third item — advanced concepts

Dont forget to check the website at httpswwwexamplecom

What do you think Lets discuss it later

82.75508880615234


In [72]:
df['review']=df['review'].apply(remove_punc1)

In [73]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production the filming tech...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

# Chat Conversation Handling

In [78]:
chat_words = {
    "brb": "Be right back",
    "btw": "By the way",
    "omg": "Oh my god",
    "lol": "Laugh out loud",
    "idk": "I don't know",
    "tbh": "To be honest",
    "thx": "Thanks",
    "ttyl": "Talk to you later",
    "bff": "Best friends forever",
    "lmao": "Laughing my ass off",
    "rofl": "Rolling on the floor laughing",
    "fyi": "For your information",
    "np": "No problem",
    "smh": "Shaking my head",
    "yw": "You're welcome",
    "gtg": "Got to go",
    "imo": "In my opinion",
    "l8r": "Later",
    "asap": "As soon as possible"
}

In [79]:
def chat_conversation(text):
    new_text=[]
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [80]:
text="I need those reports asap, please."

In [84]:
chat_conversation(text)

'I need those reports asap, please.'

# Handling Spelling Mistakes

In [83]:
from textblob import TextBlob

In [86]:
incorrect_text="I am goign to teh store to by som fruits and vegeables. I need to buy an aple, some organges, and a few bannanas. It wil be a fun trip becuase I enjoy shopping for food."
textBlb=TextBlob(incorrect_text)
textBlb.correct().string

'I am going to the store to by so fruits and vegetables. I need to buy an able, some oranges, and a few banana. It will be a fun trip because I enjoy shopping for food.'

# Handling Stop Words

In [88]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [92]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on