<a href="https://colab.research.google.com/github/YN1752/Spam-Classification/blob/main/Spam_classif.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Dataset**

In [1]:
import pandas as pd
import string

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data = pd.read_csv('drive/My Drive/spamdata.csv')
data

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
data['label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: label, dtype: float64

# **Data Pre-processing**

In [8]:
cleaned = data['text'][2].lower()
cleaned

"free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005. text fa to 87121 to receive entry question(std txt rate)t&c's apply 08452810075over18's"

In [6]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
punctuations = string.punctuation

In [18]:
cleaned = "".join(character for character in cleaned if character not in punctuations)
cleaned

'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s'

In [19]:
from spacy.lang.en import English
nlp = English()

In [22]:
my_doc = nlp(cleaned)

In [23]:
token_list = []
for token in my_doc:
  token_list.append(token.text)

In [30]:
token_list

['free',
 'entry',
 'in',
 '2',
 'a',
 'wkly',
 'comp',
 'to',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 '21st',
 'may',
 '2005',
 'text',
 'fa',
 'to',
 '87121',
 'to',
 'receive',
 'entry',
 'questionstd',
 'txt',
 'ratetcs',
 'apply',
 '08452810075over18s']

In [32]:
from spacy.lang.en.stop_words import STOP_WORDS

In [28]:
filtered_sentence = []
for word in token_list:
  lexeme = nlp.vocab[word]
  if lexeme.is_stop == False:
    filtered_sentence.append(word)

In [31]:
filtered_sentence

['free',
 'entry',
 '2',
 'wkly',
 'comp',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 '21st',
 '2005',
 'text',
 'fa',
 '87121',
 'receive',
 'entry',
 'questionstd',
 'txt',
 'ratetcs',
 'apply',
 '08452810075over18s']

In [34]:
cleaned = " ".join(filtered_sentence)
cleaned

'free entry 2 wkly comp win fa cup final tkts 21st 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s'

In [35]:
def clean_text(text):

  cleaned = text.lower()

  punctuations = string.punctuation
  cleaned = "".join(character for character in cleaned if character not in punctuations)

  my_doc = nlp(cleaned)
  token_list = []
  for token in my_doc:
    token_list.append(token.text)

  filtered_sentence = []
  for word in token_list:
    lexeme = nlp.vocab[word]
    if lexeme.is_stop == False:
      filtered_sentence.append(word)
  
  cleaned = " ".join(filtered_sentence)

  return cleaned


In [40]:
data['cleaned'] = data['text'].apply(clean_text)
data.head()

Unnamed: 0,label,text,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives


# **Feature Engineering**

In [56]:
data['word_count'] = data['text'].apply(lambda x: len(x.split()))
data['word_count_cleaned'] = data['cleaned'].apply(lambda x: len(x.split()))
data['char_count'] = data['cleaned'].apply(lambda x: len(x))
data['char_count_no_space'] = data['cleaned'].apply(lambda x: len(x.replace(" ","")))
data['num_count'] = data['cleaned'].apply(lambda x: len([w for w in x.split() if w.isdigit()]))

In [57]:
data.head()

Unnamed: 0,label,text,cleaned,word_count,word_count_cleaned,char_count,char_count_no_space,num_count
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0


In [58]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [59]:
doc = nlp(data['cleaned'][2])
doc

free entry 2 wkly comp win fa cup final tkts 21st 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s

In [62]:
all_tags = []
for w in doc:
  all_tags.append(w.tag_)

In [63]:
all_tags

['JJ',
 'NN',
 'CD',
 'JJ',
 'NN',
 'NN',
 'NNP',
 'NNP',
 'JJ',
 'NN',
 'NN',
 'CD',
 'NN',
 'NN',
 'CD',
 'VBP',
 'NN',
 'FW',
 'NN',
 'NN',
 'VBP',
 'CD']

In [72]:
pos_dic = { 'noun':['NN','NNS','NNP','NNPS'], 'verb':['VB','VBD','VBG','VBN','VBZ','VBP']}

In [73]:
count_n = 0
count_v = 0

for tag in all_tags:
  if tag in pos_dic['noun']:
    count_n+=1;
  elif tag in pos_dic['verb']:
    count_v+=1;

print('Noun: ',count_n)
print('Verb: ',count_v)

Noun:  12
Verb:  2


In [74]:
def pos_check(text, family):

  doc = nlp(text)

  all_tags = []
  for w in doc:
    all_tags.append(w.tag_)

  count = 0

  for tag in all_tags:
    if tag in pos_dic[family]:
      count+=1;
    
  return count

In [75]:
data['noun_count'] = data['cleaned'].apply(lambda x: pos_check(x,'noun'))
data['verb_count'] = data['cleaned'].apply(lambda x: pos_check(x,'verb'))
data

Unnamed: 0,label,text,cleaned,word_count,word_count_cleaned,char_count,char_count_no_space,num_count,noun_count,verb_count
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0,10,2
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0,3,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3,12,2
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0,6,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0,1,2
...,...,...,...,...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u u won å£750 pound p...,30,19,107,89,3,12,2
5568,ham,Will Ì_ b going to esplanade fr home?,ì b going esplanade fr home,8,6,27,22,0,4,1
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood soany suggestions,10,4,29,24,0,3,0
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like d interested buying we...,26,10,58,49,0,4,2


In [82]:
from sklearn.preprocessing import LabelEncoder

target = data['label'].values
target = LabelEncoder().fit_transform(target)

In [84]:
features = data.iloc[:,3:]

In [85]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(features, target, random_state=20, stratify=target)

In [87]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, Y_train)

In [None]:
pred = 