# **Spam Detection Model**

## Mounting the Google Drive

In [1]:
from google.colab import drive
drive.mount("/gdrive")

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


Loacting Dataset inside the drive

In [2]:
# This means my dataset is in the G-Drive -> My Drive -> SpamSMS.
%cd /gdrive/My Drive/SpamSMS

/gdrive/My Drive/SpamSMS


In [3]:
# So, my SpamSMS folder has below mentioned files
!ls

test_data.csv  training_data.csv


## Loading Dataset

In [4]:
import pandas as pd

In [5]:
data = pd.read_csv('training_data.csv', header= None, names=["sms", "label"])

In [6]:
# shows 6 sample lines of the data
data.sample(6)

Unnamed: 0,sms,label
3545,"Hey babe, sorry i didn't get sooner. Gary can ...",ham
1140,Dear :-/ why you mood off. I cant drive so i b...,ham
697,"8 at the latest, g's still there if you can sc...",ham
2425,Company is very good.environment is terrific a...,ham
480,Dear Subscriber ur draw 4 £100 gift voucher wi...,spam
3170,So u gonna get deus ex?,ham


## PRE-PROCESSING

In [7]:
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')

stopwords = nltk.corpus.stopwords.words('english')
punctuation = string.punctuation

print(stopwords[:6])
print(punctuation)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our']
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [8]:
#Pre-Processing and removing unnecesssary Punctuations

def pre_process(sms):
  remove_punct = "".join([word.lower() for word in sms if word not in punctuation])
  tokenize = nltk.tokenize.word_tokenize(remove_punct)
  remove_stopwords = [word for word in tokenize if word not in stopwords]
  return remove_stopwords

data['processed'] = data['sms'].apply(lambda x: pre_process(x))
data.head()

Unnamed: 0,sms,label,processed
0,msg_text,ham_or_spam,[msgtext]
1,"Haha better late than ever, any way I could sw...",ham,"[haha, better, late, ever, way, could, swing]"
2,Thanks a lot for your wishes on my birthday. T...,ham,"[thanks, lot, wishes, birthday, thanks, making..."
3,A guy who gets used but is too dumb to realize...,ham,"[guy, gets, used, dumb, realize]"
4,Talk sexy!! Make new friends or fall in love i...,spam,"[talk, sexy, make, new, friends, fall, love, w..."


## Seperating Ham/Spam Words

In [9]:
def categorize_words():
    spam_words = []
    ham_words = []
#dealing messages associated with spam
    for sms in data['processed'][data['label'] == 'spam']:
        for word in sms:
            spam_words.append(word)
#dealing messages associated with ham
    for sms in data['processed'][data['label'] == 'ham']:
        for word in sms:
            ham_words.append(word)
    return spam_words, ham_words

spam_words, ham_words = categorize_words()

print(spam_words[:8])
print(ham_words[:8])

['talk', 'sexy', 'make', 'new', 'friends', 'fall', 'love', 'worlds']
['haha', 'better', 'late', 'ever', 'way', 'could', 'swing', 'thanks']


## Using Predict Function

In [10]:
def predict(sms):
  spam_counter = 0
  ham_counter = 0
#count the occurances of each word in the sms string
  for word in sms:
    spam_counter += spam_words.count(word)
    ham_counter += ham_words.count(word)
  print('\n******RESULT******\n')
#if the text message is ham
  if ham_counter > spam_counter:
    accuracy =round((ham_counter/(ham_counter + spam_counter)) * 100,2)
    print('messege is not spam, with {}% assurance'.format(accuracy))
#if the text message could be spam and ham
  elif ham_counter == spam_counter:
    print('message could be spam')
#if the text message is spam
  else:
    accuracy = round((spam_counter / (ham_counter + spam_counter))* 100,2)
    print('message is spam, with {}% assurance'.format(accuracy))

## Taking the User Input

In [11]:
# 1. Message Testing

user_input = input("Please type a spam or ham message\n")
#pre-processing the input before prediction
processed_input = pre_process(user_input)

predict(processed_input)

Please type a spam or ham message
Hello, How are you?

******RESULT******

messege is not spam, with 95.35% assurance
