<h1>Project 4: SafeComm Digital Security Solutions</h1>
<p>Team Captain: E00020</p>
<p>Member 2: E00491</p>
<p>Member 3: E00045</p>
</br></br>
<h2>Table of Contents</h2>
<ol>
  <li><a href="#section1">Setup</a></li>
  <li><a href="#section2">EDA</a></li>
  <li><a href="#section3">Data Preprocessing</a></li>
  <li><a href="#section4">Models</a></li>
  <li><a href="#section5">Testing</a></li>
</ol>
</br></br>

<h3 id="section1">Setup</h3>

In [1]:
# importing all necessarylibaries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
# import spacy
import re
import string
import unicodedata
from bs4 import BeautifulSoup
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
# from textblob import TextBlob
# from textblob import Word
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer, WordNetLemmatizer
# from wordcloud import WordCloud, STOPWORDS
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

import warnings
warnings.filterwarnings('ignore')

# importing the csv into a pandas data frame
df = pd.read_csv("./sms.csv")


<h3 id="section2">Exploratory Data Analysis<h3>


In [7]:
#Summary of the dataset
df.head()

Unnamed: 0,Fraudolent,SMS test,ID,Date and Time
0,0,Squeeeeeze!! This is christmas hug.. If u lik ...,1EWYRBL,2017-12-02
1,0,And also I've sorta blown him off a couple tim...,ZY4PDK7,2018-03-23
2,0,Mmm thats better now i got a roast down me! i...,KLUX2C6,2016-10-29
3,0,Mm have some kanji dont eat anything heavy ok,955HXJ0,2018-04-12
4,0,So there's a ring that comes with the guys cos...,00Q6EUC,2016-08-01


In [2]:
#Summary of the dataset
#sentiment count
df['Fraudolent'].value_counts()
# We have an 

Fraudolent
0    4825
1     747
Name: count, dtype: int64

## <h3 id="section3">Data Preprocessing</h3>

### Split train and test data

In [3]:
# Divide train and test data
TRAIN_TEST_SPLIT = 5000

#train dataset
train_df=df[:TRAIN_TEST_SPLIT]

#test dataset
test_df=df[TRAIN_TEST_SPLIT:]

print(train_df.shape, test_df.shape)


(5000, 4) (572, 4)


### Normalization

In [4]:
#download the stopwords
nltk.download('stopwords')

#Tokenization of text
tokenizer=ToktokTokenizer()

#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\feder\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

#apply function on review column
train_df['SMS test']=train_df['SMS test'].apply(denoise_text)

In [6]:
#Define function for removing special characters
def remove_special_characters(text):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

#apply function on review column
train_df['SMS test']=train_df['SMS test'].apply(remove_special_characters)
train_df

Unnamed: 0,Fraudolent,SMS test,ID,Date and Time
0,0,Squeeeeeze This is christmas hug If u lik my f...,1EWYRBL,2017-12-02
1,0,And also Ive sorta blown him off a couple time...,ZY4PDK7,2018-03-23
2,0,Mmm thats better now i got a roast down me id ...,KLUX2C6,2016-10-29
3,0,Mm have some kanji dont eat anything heavy ok,955HXJ0,2018-04-12
4,0,So theres a ring that comes with the guys cost...,00Q6EUC,2016-08-01
...,...,...,...,...
4995,1,FreeMsg Todays the day if you are ready Im hor...,V3ISGBJ,2018-04-27
4996,0,Jay told me already will do,4P5PPUR,2016-12-10
4997,1,08714712388 between 10am7pm Cost 10p,1S89JLP,2016-05-19
4998,0,Im good Have you registered to vote,C9CNU4L,2017-01-19


### Stemming

In [7]:
#stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

#apply function on review column
train_df['SMS test']=train_df['SMS test'].apply(simple_stemmer)
train_df

Unnamed: 0,Fraudolent,SMS test,ID,Date and Time
0,0,squeeeeez thi is christma hug if u lik my frnd...,1EWYRBL,2017-12-02
1,0,and also ive sorta blown him off a coupl time ...,ZY4PDK7,2018-03-23
2,0,mmm that better now i got a roast down me id b...,KLUX2C6,2016-10-29
3,0,mm have some kanji dont eat anyth heavi ok,955HXJ0,2018-04-12
4,0,so there a ring that come with the guy costum ...,00Q6EUC,2016-08-01
...,...,...,...,...
4995,1,freemsg today the day if you are readi im horn...,V3ISGBJ,2018-04-27
4996,0,jay told me alreadi will do,4P5PPUR,2016-12-10
4997,1,08714712388 between 10am7pm cost 10p,1S89JLP,2016-05-19
4998,0,im good have you regist to vote,C9CNU4L,2017-01-19


### Stopwords
We should evaluate if it's useful because it can eliminate looots of words in some cases

In [8]:
#set stopwords to english
stop=set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

#apply function on review column
train_df['SMS test']=train_df['SMS test'].apply(remove_stopwords)
train_df

{'how', 'this', 'am', 'our', 'because', "don't", "won't", 'them', 'isn', 'to', "wouldn't", 'has', 'there', 's', 'won', 'same', 'that', 'we', 'again', 'of', "you'd", 'from', 'once', 'mightn', 'be', "that'll", 'yours', 'wasn', 'so', 'if', 'as', 'too', 'should', 'they', 'its', 'any', 'ourselves', 'about', 'not', 'can', 'is', 'are', 'only', 'under', 'both', "weren't", 'hadn', 'where', 'theirs', 'himself', 'had', 'until', 'it', 'needn', 'each', "you're", 'didn', 'an', 'her', 'you', 'what', 'will', 'very', 'nor', 'out', "shouldn't", 'aren', 'between', 'shouldn', 'o', 'weren', 'with', 'when', 'themselves', 'and', 'off', 'against', 'most', 'mustn', 'd', 'having', 'just', 'by', 'shan', 'at', 'all', 'down', 'she', 'before', 'ain', 'which', 'who', 'on', 'some', "doesn't", 'then', 'herself', "you've", 'after', 'than', 'were', "mightn't", 'haven', 'more', 'myself', 'your', 'further', 'a', 'he', 'hasn', "she's", 'll', 'below', 'few', 'have', 'doing', 'y', 'while', 'me', "aren't", "haven't", 'now', '

Unnamed: 0,Fraudolent,SMS test,ID,Date and Time
0,0,squeeeeez thi christma hug u lik frndshp den h...,1EWYRBL,2017-12-02
1,0,also ive sorta blown coupl time recent id rath...,ZY4PDK7,2018-03-23
2,0,mmm better got roast id b better drink 2 good ...,KLUX2C6,2016-10-29
3,0,mm kanji dont eat anyth heavi ok,955HXJ0,2018-04-12
4,0,ring come guy costum gift futur yowif hint hint,00Q6EUC,2016-08-01
...,...,...,...,...
4995,1,freemsg today day readi im horni live town lov...,V3ISGBJ,2018-04-27
4996,0,jay told alreadi,4P5PPUR,2016-12-10
4997,1,08714712388 10am7pm cost 10p,1S89JLP,2016-05-19
4998,0,im good regist vote,C9CNU4L,2017-01-19


### Convert text to numerical values

#### TF-IDF

In [17]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0.0,max_df=1,use_idf=True,ngram_range=(1,3))

#transformed train reviews
numerical_train_df=tv.fit_transform(train_df)
print(type(numerical_train_df))


<class 'scipy.sparse._csr.csr_matrix'>


In [21]:
spam_column = df["Fraudolent"]
spam_column.to_numpy

<bound method IndexOpsMixin.to_numpy of 0       0
1       0
2       0
3       0
4       0
       ..
5567    0
5568    0
5569    0
5570    0
5571    0
Name: Fraudolent, Length: 5572, dtype: int64>

## <h3 id="section4">Models</h3>

In [24]:
numerical_train_df

<4x11 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [25]:
# model 1 
# Logistic Regression
#training the model
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)

#Fitting the model for tfidf features
lr_tfidf=lr.fit(numerical_train_df,spam_column)

ValueError: Found input variables with inconsistent numbers of samples: [4, 5572]

In [None]:
# model 2


In [None]:
# model 3


<h3 id="section5">Testing</h3>

In [None]:
# alt run tests in seperate file?
