#### Important imports

In [78]:
import pandas as pd
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [68]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashutoshawasthi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Data Loading and Preprocessing

In [69]:
df = pd.read_csv(r"Dataset\Spam.csv", encoding='windows-1252')
df.drop(["Unnamed: 2","Unnamed: 3","Unnamed: 4"],axis = 1,inplace = True)
df.columns = ["label","message"]
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [70]:
lemma =  WordNetLemmatizer()
swords = stopwords.words("english")
def pre_process(text):
    tokens  = word_tokenize(text)
    tokens = [lemma.lemmatize(x.lower()) for x in tokens if x not in swords and len(x)>=2 and x not in string.punctuation and x.isalnum()]
    return " ".join(tokens)
    
    
    

In [71]:
df["clean_message"] = df["message"].apply(pre_process)

#### Bag of Words

In [77]:
cv = CountVectorizer(max_features=100)
x = cv.fit_transform(df["clean_message"])
print(x.toarray())
print(cv.get_feature_names_out())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['already' 'amp' 'and' 'are' 'ask' 'back' 'but' 'call' 'can' 'claim'
 'come' 'da' 'day' 'dear' 'do' 'dont' 'free' 'friend' 'get' 'give' 'go'
 'going' 'good' 'got' 'great' 'gt' 'happy' 'have' 'hey' 'hi' 'home' 'hope'
 'how' 'if' 'it' 'just' 'know' 'later' 'let' 'like' 'lor' 'love' 'lt'
 'make' 'message' 'min' 'mobile' 'msg' 'much' 'my' 'na' 'need' 'new'
 'night' 'no' 'now' 'number' 'oh' 'ok' 'one' 'phone' 'please' 'pls'
 'prize' 'reply' 'right' 'said' 'say' 'see' 'send' 'so' 'sorry' 'still'
 'stop' 'take' 'tell' 'text' 'the' 'thing' 'think' 'this' 'time' 'to'
 'today' 'tomorrow' 'txt' 'ur' 'wan' 'want' 'wat' 'way' 'we' 'week' 'well'
 'what' 'work' 'yeah' 'yes' 'you' 'your']


#### TFIDF 

In [80]:
tfidfv = TfidfVectorizer(max_features=200)
x= tfidfv.fit_transform(df["clean_message"])
print(x.toarray())
print(tfidfv.get_feature_names_out())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['already' 'also' 'always' 'amp' 'and' 'anything' 'are' 'around' 'ask'
 'babe' 'back' 'but' 'buy' 'ca' 'call' 'can' 'cant' 'care' 'cash' 'chat'
 'claim' 'co' 'come' 'coming' 'contact' 'could' 'customer' 'da' 'day'
 'dear' 'did' 'do' 'done' 'dont' 'dun' 'end' 'even' 'every' 'feel' 'find'
 'fine' 'first' 'for' 'free' 'friend' 'get' 'getting' 'girl' 'give' 'go'
 'going' 'gon' 'good' 'got' 'great' 'gt' 'guaranteed' 'gud' 'guy' 'haha'
 'happy' 'have' 'he' 'heart' 'help' 'hey' 'hi' 'home' 'hope' 'hour' 'how'
 'if' 'im' 'in' 'is' 'it' 'just' 'keep' 'know' 'last' 'late' 'later'
 'leave' 'let' 'life' 'like' 'lol' 'lor' 'lot' 'love' 'lt' 'make' 'many'
 'may' 'me' 'meet' 'message' 'min' 'minute' 'miss' 'mobile' 'money'
 'morning' 'msg' 'much' 'my' 'na' 'name' 'need' 'new' 'next' 'nice'
 'night' 'no' 'nokia' 'not' 'now' 'number' 'offer' 'oh' 'o