In [114]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns


In [115]:
df=pd.read_csv("IMDB Dataset.csv")

In [116]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [117]:
df.shape

(50000, 2)

# Lower Case


In [118]:
df['review']=df['review'].str.lower()

In [119]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


# Removing of Html tags

In [120]:
# function for removing html tags
import re
def remove_html_tags(text):
    pattern=re.compile('<.*?>')
    return pattern.sub(r'',text)   


In [121]:
text = '<p>This is a <b>bold</b> paragraph with <a href="https://example.com">a link</a> and an <i>italicized</i> word.</p>\
<p>Another paragraph with <u>underlined text</u> and a <img src="image.jpg" alt="image">.</p>\
<div class="container">\
  <h1>Heading 1</h1>\
  <p>This is a paragraph inside a <code>div</code> container.</p>\
</div>'


In [122]:
text=remove_html_tags(text)

In [123]:
text

'This is a bold paragraph with a link and an italicized word.Another paragraph with underlined text and a .  Heading 1  This is a paragraph inside a div container.'

In [124]:
df['review']=df['review'].apply(remove_html_tags)

In [125]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

# Remove URL

In [126]:
def remove_url(text):
    pattern=re.compile('https?://(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:/[^ \n]*)?'
    )
    return pattern.sub(r'',text)
    

  pattern=re.compile('https?://(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:/[^ \n]*)?'


In [127]:
text='''Check out these amazing websites:

1. Visit our official site at https://www.example.com for more details.
2. For tech news, head over to http://techblog.org/latest-updates.
3. If you're interested in programming tutorials, https://www.learnpython.org is a great resource.
4. Follow us on social media: https://twitter.com/ExampleOfficial and https://www.instagram.com/example_official/.
5. You can also check out the online store at http://shop.example.com/products.

Don't forget to check these out for updates and new content!
'''

In [128]:
text=remove_url(text)

In [129]:
text

"Check out these amazing websites:\n\n1. Visit our official site at  for more details.\n2. For tech news, head over to \n3. If you're interested in programming tutorials,  is a great resource.\n4. Follow us on social media:  and \n5. You can also check out the online store at .com/products.\n\nDon't forget to check these out for updates and new content!\n"

In [130]:
df['review']=df['review'].apply(remove_url)

In [131]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

# Punctuation Handling

In [132]:
import string
exclude=string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [137]:
def remove_punc(text):
    for char in exclude:
        text=text.replace(char,'')
    return text

In [138]:
text='''Hello, world! How are you today? I hope everything is going well... Are you ready to learn Python? Let's dive in!

Here's a list:
1. First item.
2. Second item: Introduction to Python.
3. Third item — advanced concepts.

Don't forget to check the website at https://www.example.com!

What do you think? Let's discuss it later.
'''

In [139]:
text=remove_punc(text)

In [140]:
text

'Hello world How are you today I hope everything is going well Are you ready to learn Python Lets dive in\n\nHeres a list\n1 First item\n2 Second item Introduction to Python\n3 Third item — advanced concepts\n\nDont forget to check the website at httpswwwexamplecom\n\nWhat do you think Lets discuss it later\n'

### time taken for the remove_punc function to run

In [146]:
import time
start=time.time()
print(remove_punc(text))
time1=time.time()-start
print(time1*50000)


Hello world Hows everything going Python
50.16326904296875


### another function for removing punctuation

In [147]:
def remove_punc1(text):
    return text.translate(str.maketrans('','',exclude))

In [148]:
import time
start=time.time()
print(remove_punc1(text))
time1=time.time()-start
print(time1*50000)


Hello world Hows everything going Python
0.0


In [149]:
df['review']=df['review'].apply(remove_punc1)

In [150]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production the filming tech...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

# Chat Conversation Handling

In [153]:
chat_words='''brb - Be right back
btw - By the way
omg - Oh my god
lol - Laugh out loud
idk - I don't know
tbh - To be honest
thx - Thanks
ttyl - Talk to you later
bff - Best friends forever
lmao - Laughing my ass off
rofl - Rolling on the floor laughing
fyi - For your information
np - No problem
smh - Shaking my head
yw - You're welcome
gtg - Got to go
imo - In my opinion
l8r - Later
brb - Be right back
asap - As soon as possible'''

In [154]:
def chat_conversation(text):
    new_text=[]
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
        return " ".join(new_text)

In [155]:
text="I need those reports asap, please."

In [157]:
chat_conversation(text)

TypeError: string indices must be integers, not 'str'