## Loading the libraries

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [6]:
data = pd.read_csv('Spam.csv', encoding = 'cp1252')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Observations:

1. Column names are not proper hence fix them
2. The last 3 columns contain only missing values hence delete them

In [9]:
## dropping the unnecessary columns from the data

data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
## rename the remaining columns in the data

data = data.rename({'v1' : 'Target', 'v2' : 'Message'}, axis = 1)
data.head()

Unnamed: 0,Target,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
data['Message']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

## Load the library of Natural Language processing 

In [13]:
!pip install nltk



Error processing line 3 of C:\Users\AMANT\anaconda3\lib\site-packages\googleapis_common_protos-1.56.4-py3.10-nspkg.pth:

  Traceback (most recent call last):
    File "C:\Users\AMANT\anaconda3\lib\site.py", line 169, in addpackage
      exec(line)
    File "<string>", line 1, in <module>
    File "<frozen importlib._bootstrap>", line 562, in module_from_spec
  AttributeError: 'NoneType' object has no attribute 'loader'

Remainder of file ignored


In [16]:
import nltk
print('nltk sucessessfully imported ')

nltk sucessessfully imported 


## Basics of NLP

## Step 1 : Convert the text into a lower case 

In [18]:
text = 'It is a Truth Universally acknowledged, that a single man in possession of a good fortune, must be in want of Wife'
text

'It is a Truth Universally acknowledged, that a single man in possession of a good fortune, must be in want of Wife'

In [20]:
text = text.lower()
text

'it is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of wife'

## Step 2: Removing punctuations

In [21]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [26]:
ch = []
for char in text:
    if char not in string.punctuation:
        ch.append(char)
text = "".join(ch)        
print(text)

it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of wife


## Word Tokenization

- Word Tokenization is defined as extracting all words from the text and displaying those unique words in the ouput

In [28]:
print(text)

it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of wife


In [29]:
text.unique()

AttributeError: 'str' object has no attribute 'unique'

In [42]:
## Word tokenizing

from nltk import word_tokenize
text = word_tokenize(text)      ## word_tokenize(data['column_name'])
text

['it',
 'is',
 'a',
 'truth',
 'universally',
 'acknowledged',
 'that',
 'a',
 'single',
 'man',
 'in',
 'possession',
 'of',
 'a',
 'good',
 'fortune',
 'must',
 'be',
 'in',
 'want',
 'of',
 'wife']

In [34]:
## If text is not lowered and punctions are not removed then word_tokenize will give messy data as output

text_copy = 'It is a Truth Universally acknowledged, that a single man in possession of a good fortune, must be in want of Wife'
text_copy

'It is a Truth Universally acknowledged, that a single man in possession of a good fortune, must be in want of Wife'

In [33]:
from nltk import word_tokenize
word_tokenize(text_copy)

['It',
 'is',
 'a',
 'Truth',
 'Universally',
 'acknowledged',
 ',',
 'that',
 'a',
 'single',
 'man',
 'in',
 'possession',
 'of',
 'a',
 'good',
 'fortune',
 ',',
 'must',
 'be',
 'in',
 'want',
 'of',
 'Wife']

## Sentence Tokenization

- Extracting all sentences seperately from a collection of sentences(paragraph).

In [35]:
text2 = 'This is an NLP session. NLP is a nice area of research. I am learning how to analyze text data'
text2

'This is an NLP session. NLP is a nice area of research. I am learning how to analyze text data'

In [36]:
from nltk import sent_tokenize
sent_tokenize(text2)

['This is an NLP session.',
 'NLP is a nice area of research.',
 'I am learning how to analyze text data']

## Stop word filtering

- Stop words are just additons to the sentence that make a sentence complete
eg: it, is, am, a, the, are, were, was, etc..

In [39]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [40]:
len(stop_words)

179

In [43]:
text

['it',
 'is',
 'a',
 'truth',
 'universally',
 'acknowledged',
 'that',
 'a',
 'single',
 'man',
 'in',
 'possession',
 'of',
 'a',
 'good',
 'fortune',
 'must',
 'be',
 'in',
 'want',
 'of',
 'wife']

In [50]:
text = " ".join([each_word for each_word in text if each_word not in stop_words])
text

'truth universally acknowledged single man possession good fortune must want wife'

## Frequency Count 

In [48]:
text_copy

'It is a Truth Universally acknowledged, that a single man in possession of a good fortune, must be in want of Wife'

In [49]:
nltk.FreqDist(text_copy)

FreqDist({' ': 21, 'n': 9, 's': 8, 'a': 8, 'e': 8, 'o': 8, 't': 7, 'i': 7, 'l': 4, 'f': 4, ...})

In [51]:
nltk.FreqDist(text)

FreqDist({' ': 10, 'n': 7, 'e': 7, 's': 7, 'o': 6, 't': 5, 'u': 4, 'i': 4, 'a': 4, 'l': 4, ...})

In [55]:
text = text.split()

AttributeError: 'list' object has no attribute 'split'

In [56]:
text

['truth',
 'universally',
 'acknowledged',
 'single',
 'man',
 'possession',
 'good',
 'fortune',
 'must',
 'want',
 'wife']

## Parts of Speech Tagging

In [57]:
from nltk import pos_tag
pos_tag(text)

[('truth', 'NN'),
 ('universally', 'RB'),
 ('acknowledged', 'VBD'),
 ('single', 'JJ'),
 ('man', 'NN'),
 ('possession', 'NN'),
 ('good', 'JJ'),
 ('fortune', 'NN'),
 ('must', 'MD'),
 ('want', 'VB'),
 ('wife', 'NN')]

In [62]:
nltk.download('tagsets')

nltk.help.upenn_tagset('JJ')

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\AMANT\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


## Stemming

In [63]:
text

['truth',
 'universally',
 'acknowledged',
 'single',
 'man',
 'possession',
 'good',
 'fortune',
 'must',
 'want',
 'wife']

In [65]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
porter 

<PorterStemmer>

In [66]:
[porter.stem(word) for word in text]

['truth',
 'univers',
 'acknowledg',
 'singl',
 'man',
 'possess',
 'good',
 'fortun',
 'must',
 'want',
 'wife']

- Stemming is a process that converts a word to its root form.
- The disadvantage of stemming is often the roots words obtained as output are not meaningful
- To overcome this disadvantage we perform Lemmatization
- Lemmatization is a process that converts a word to its meaningful root form.
eg: universally after Stemming will convert to 'univers' which has no meaning
but universally after lemmatization will convert to 'universe' which has got some meaning

In [70]:
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
lemma

<WordNetLemmatizer>

In [85]:
lemma.lemmatize('going', pos = 'v')

'go'

In [86]:
lemma.lemmatize('wanted', pos = 'v')

'want'

In [88]:
lemma.lemmatize('man', pos = 'n')

'man'

In [99]:
[lemma.lemmatize(word, pos = 'v') for word in text]

['truth',
 'universally',
 'acknowledge',
 'single',
 'man',
 'possession',
 'good',
 'fortune',
 'must',
 'want',
 'wife']

In [100]:
[lemma.lemmatize(word, pos = 'n') for word in text]

['truth',
 'universally',
 'acknowledged',
 'single',
 'man',
 'possession',
 'good',
 'fortune',
 'must',
 'want',
 'wife']

In [102]:
[lemma.lemmatize(word, pos = 'r') for word in text]

['truth',
 'universally',
 'acknowledged',
 'single',
 'man',
 'possession',
 'good',
 'fortune',
 'must',
 'want',
 'wife']

In [103]:
[lemma.lemmatize(word, pos = 'a') for word in text]

['truth',
 'universally',
 'acknowledged',
 'single',
 'man',
 'possession',
 'good',
 'fortune',
 'must',
 'want',
 'wife']

In [104]:
lemma.lemmatize('better', pos = 'a')

'good'

In [107]:
lemma.lemmatize('wrongly', pos = 'r')

'wrongly'

## Note:

- r = adverbs
- v = verbs
- a = adjective
- n = noun