In [60]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns


In [61]:
df=pd.read_csv("IMDB Dataset.csv")

In [62]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [63]:
df.shape

(50000, 2)

# Lower Case


In [64]:
df['review']=df['review'].str.lower()

In [65]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


# Removing of Html tags

In [66]:
# function for removing html tags
import re
def remove_html_tags(text):
    pattern=re.compile('<.*?>')
    return pattern.sub(r'',text)   


In [67]:
text = '<p>This is a <b>bold</b> paragraph with <a href="https://example.com">a link</a> and an <i>italicized</i> word.</p>\
<p>Another paragraph with <u>underlined text</u> and a <img src="image.jpg" alt="image">.</p>\
<div class="container">\
  <h1>Heading 1</h1>\
  <p>This is a paragraph inside a <code>div</code> container.</p>\
</div>'


In [68]:
text=remove_html_tags(text)

In [69]:
text

'This is a bold paragraph with a link and an italicized word.Another paragraph with underlined text and a .  Heading 1  This is a paragraph inside a div container.'

In [70]:
df['review']=df['review'].apply(remove_html_tags)

In [71]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

# Remove URL

In [72]:
def remove_url(text):
    pattern=re.compile('https?://(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:/[^ \n]*)?'
    )
    return pattern.sub(r'',text)
    

  pattern=re.compile('https?://(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:/[^ \n]*)?'


In [73]:
text='''Check out these amazing websites:

1. Visit our official site at https://www.example.com for more details.
2. For tech news, head over to http://techblog.org/latest-updates.
3. If you're interested in programming tutorials, https://www.learnpython.org is a great resource.
4. Follow us on social media: https://twitter.com/ExampleOfficial and https://www.instagram.com/example_official/.
5. You can also check out the online store at http://shop.example.com/products.

Don't forget to check these out for updates and new content!
'''

In [74]:
text=remove_url(text)

In [75]:
text

"Check out these amazing websites:\n\n1. Visit our official site at  for more details.\n2. For tech news, head over to \n3. If you're interested in programming tutorials,  is a great resource.\n4. Follow us on social media:  and \n5. You can also check out the online store at .com/products.\n\nDon't forget to check these out for updates and new content!\n"

In [76]:
df['review']=df['review'].apply(remove_url)

In [77]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

# Punctuation Handling

In [78]:
import string
exclude=string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [79]:
def remove_punc(text):
    for char in exclude:
        text=text.replace(char,'')
    return text

In [80]:
text='''Hello, world! How are you today? I hope everything is going well... Are you ready to learn Python? Let's dive in!

Here's a list:
1. First item.
2. Second item: Introduction to Python.
3. Third item — advanced concepts.

Don't forget to check the website at https://www.example.com!

What do you think? Let's discuss it later.
'''

In [81]:
text=remove_punc(text)

In [82]:
text

'Hello world How are you today I hope everything is going well Are you ready to learn Python Lets dive in\n\nHeres a list\n1 First item\n2 Second item Introduction to Python\n3 Third item — advanced concepts\n\nDont forget to check the website at httpswwwexamplecom\n\nWhat do you think Lets discuss it later\n'

### time taken for the remove_punc function to run

In [83]:
import time
start=time.time()
print(remove_punc(text))
time1=time.time()-start
print(time1*50000)


Hello world How are you today I hope everything is going well Are you ready to learn Python Lets dive in

Heres a list
1 First item
2 Second item Introduction to Python
3 Third item — advanced concepts

Dont forget to check the website at httpswwwexamplecom

What do you think Lets discuss it later

0.0


### another function for removing punctuation

In [84]:
def remove_punc1(text):
    return text.translate(str.maketrans('','',exclude))

In [85]:
import time
start=time.time()
print(remove_punc1(text))
time1=time.time()-start
print(time1*50000)


Hello world How are you today I hope everything is going well Are you ready to learn Python Lets dive in

Heres a list
1 First item
2 Second item Introduction to Python
3 Third item — advanced concepts

Dont forget to check the website at httpswwwexamplecom

What do you think Lets discuss it later

0.0


In [86]:
df['review']=df['review'].apply(remove_punc1)

In [87]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production the filming tech...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

# Chat Conversation Handling

In [88]:
chat_words = {
    "brb": "Be right back",
    "btw": "By the way",
    "omg": "Oh my god",
    "lol": "Laugh out loud",
    "idk": "I don't know",
    "tbh": "To be honest",
    "thx": "Thanks",
    "ttyl": "Talk to you later",
    "bff": "Best friends forever",
    "lmao": "Laughing my ass off",
    "rofl": "Rolling on the floor laughing",
    "fyi": "For your information",
    "np": "No problem",
    "smh": "Shaking my head",
    "yw": "You're welcome",
    "gtg": "Got to go",
    "imo": "In my opinion",
    "l8r": "Later",
    "asap": "As soon as possible"
}

In [89]:
def chat_conversation(text):
    new_text=[]
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [90]:
text="I need those reports asap, please."

In [91]:
chat_conversation(text)

'I need those reports asap, please.'

# Handling Spelling Mistakes

In [92]:
from textblob import TextBlob

In [93]:
incorrect_text="I am goign to teh store to by som fruits and vegeables. I need to buy an aple, some organges, and a few bannanas. It wil be a fun trip becuase I enjoy shopping for food."
textBlb=TextBlob(incorrect_text)
textBlb.correct().string

'I am going to the store to by so fruits and vegetables. I need to buy an able, some oranges, and a few banana. It will be a fun trip because I enjoy shopping for food.'

In [94]:
df['review']=df['review'].apply(chat_conversation)

# Handling Stop Words

A corpus (plural: corpora) refers to a large collection of written or spoken texts that are used for linguistic analysis. In natural language processing (NLP) and computational linguistics, a corpus is used as a source of data to train models or to perform analysis on language patterns, syntax, semantics, and more.

In [95]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Stopwords are commonly occurring words in a language (such as "the", "is", "in", "and", etc.) that are often removed from text during processing. They don't carry significant meaning by themselves and are usually filtered out to focus on more meaningful words that carry the context of the sentence or document.

In [96]:
stop_words=stopwords.words('english')

In [97]:
def remove_stopwords(text):
    new_text = [word for word in text.split() if word.lower() not in stop_words]
    return " ".join(new_text)


In [98]:
df['review']=df['review'].apply(remove_stopwords)

In [99]:
df['review']

0        one reviewers mentioned watching 1 oz episode ...
1        wonderful little production filming technique ...
2        thought wonderful way spend time hot summer we...
3        basically theres family little boy jake thinks...
4        petter matteis love time money visually stunni...
                               ...                        
49995    thought movie right good job wasnt creative or...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    catholic taught parochial elementary schools n...
49998    im going disagree previous comment side maltin...
49999    one expects star trek movies high art fans exp...
Name: review, Length: 50000, dtype: object

# Handling Emojis

In [100]:
import emoji
print(emoji.demojize('Python is 🔥'))

Python is :fire:


In [101]:
df['review']=emoji.demojize(df['review'])

Or we can completely remove emojis

In [102]:
import re

def remove_emojis(text):
    emoji_pattern = re.compile(r'[\U0001F600-\U0001F64F\U0001F3C6\U0001F389\U0001F4A1\U0001F44D\U00002764]', flags=re.UNICODE)
    return emoji_pattern.sub('', text)

In [103]:
import re

def remove_emojis(text):
    pattern = re.compile(r'[\U0001F300-\U0001FAD6]', flags=re.UNICODE)
    return pattern.sub('', text)


# Tokenization

Tokenization is the process of breaking text into smaller units, called tokens. These tokens can be words, sentences, or subwords and are useful in NLP tasks.



### word tokenization 

In [104]:
sent1='I an going to delhi'
sent1.split()

['I', 'an', 'going', 'to', 'delhi']

### Sentence Tokenization

In [105]:
sent2='I am going to Delhi. I will stay there for 3 days. Lets hope the trip to be great'
sent2.split('.')

['I am going to Delhi',
 ' I will stay there for 3 days',
 ' Lets hope the trip to be great']

The problem with split function for tokenization is that It won't make a token if there is anything other than full stop after the sentence

In [106]:
sent3="Hello! How's everything going? I'm learning NLP."
sent3=re.compile('[.!?]').split(text)
sent3


['I need those reports asap, please', '']

### Regular Expression Tokenization

In [107]:
import re

def regex_word_tokenize(text):
    pattern = r'\b\w+\b' 
    return re.findall(pattern, text)

In [108]:
text = "Hello! How's everything going? I'm learning NLP."
tokens = regex_word_tokenize(text)
tokens

['Hello', 'How', 's', 'everything', 'going', 'I', 'm', 'learning', 'NLP']

### using models for tokenization

In [109]:
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [116]:
import nltk
nltk.data.path = [r'C:\Users\hp\AppData\Roaming\nltk_data']


In [117]:
import nltk
print(nltk.data.path)


['C:\\Users\\hp\\AppData\\Roaming\\nltk_data']


In [114]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
sent1 = "Hello! How's everything going? I'm learning NLP."
tokens = tokenizer.tokenize(sent1)
print(tokens)


['Hello', '!', 'How', "'s", 'everything', 'going', '?', 'I', "'m", 'learning', 'NLP', '.']


In [118]:

from nltk.tokenize import word_tokenize

sent1 = "Hello! How's everything going? I'm learning NLP."
tokens = word_tokenize(sent1)
print(tokens)


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\hp\\AppData\\Roaming\\nltk_data'
**********************************************************************


In [None]:
from nltk.tokenize import sent_tokenize

text = "Hello, world! This is a test."
sentences = sent_tokenize(text)
print(sentences)


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\hp/nltk_data'
    - 'c:\\Users\\hp\\AppData\\Local\\Programs\\Python\\Python312\\nltk_data'
    - 'c:\\Users\\hp\\AppData\\Local\\Programs\\Python\\Python312\\share\\nltk_data'
    - 'c:\\Users\\hp\\AppData\\Local\\Programs\\Python\\Python312\\lib\\nltk_data'
    - 'C:\\Users\\hp\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'C:\\Users\\hp\\AppData\\Roaming\\nltk_data'
    - 'C:\\Users\\hp\\AppData\\Roaming\\nltk_data'
    - 'C:\\Users\\hp\\AppData\\Roaming\\nltk_data'
    - 'C:\\Users\\hp\\AppData\\Roaming\\nltk_data'
**********************************************************************
