# Final Project
---------------------------------------------------------------------
## CIS 600 Fundamental Data & Knowledge Mining
## Prof. Ying Lin
## 11/8/2022

### Anthony Redamonti, Dana Dippery, Joshua, Hal Baird
### Syracuse University

### Introduction 
This project was implemented in a Jupyter Notebook using Anaconda 3 with a Python 3 (ipykernel) kernel. The project goals are listed below.

Analyze the dataset provided in the file `SMSSpamCollection`.
- Data Preparation & EDA
- Build, Tune, and Evaluate Decision Tree Models

Apply the classification models on the test dataset.
- Model Prediction

### Section 1: Data Preparation
- The data must be split into different dataframes:
    - Dataframe containing all Spam messages.
    - Dataframe containing all Ham messages.
    - Dataframe containing all Ham and Spam messages.
    - Dataframe containing the summarized data of Ham and Spam (two entries).

In [1]:
import random
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

import nltk
# Fetch the NLTK resources used by the preprocessing below. A package that is
# already installed is reported as up-to-date rather than downloaded again.
nltk.download('punkt')  # tokenizer models used by word_tokenize
nltk.download('stopwords')  # stop-word lists read via nltk.corpus.stopwords
nltk.download('wordnet')  # WordNet data used by WordNetLemmatizer
nltk.download('omw-1.4')  # presumably needed by the WordNet reader in recent NLTK releases -- TODO confirm

# Load the tab-separated SMS corpus. Column names are supplied explicitly, so
# the first line of the file is read as data rather than as a header.
# NOTE(review): pandas' default quoting is in effect here; if messages contain
# unbalanced double quotes, rows may be merged -- confirm the row count.
data = pd.read_csv("SMSSpamCollection", sep="\t", names=["label", "message"])
print(data.head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aredamonti\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aredamonti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aredamonti\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\aredamonti\AppData\Roaming\nltk_data...


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Shared normalizer instances, created once and reused by preprocess().
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [3]:
def preprocess(document, stem=True):
    """Lower-case, tokenize, and normalize one message into a list of words.

    Args:
        document: The raw message text.
        stem: When true, reduce each word with the Porter stemmer; otherwise
            lemmatize each word as a verb with the WordNet lemmatizer.

    Returns:
        The normalized tokens of the message, in order, with English stop
        words removed.
    """
    # Build the stop-word set once per call. Calling stopwords.words() inside
    # the filter re-read the list for every token and made each membership
    # test a linear scan.
    stop_words = set(stopwords.words("english"))

    tokens = word_tokenize(document.lower())
    kept = [token for token in tokens if token not in stop_words]

    if stem:
        return [stemmer.stem(token) for token in kept]
    return [wordnet_lemmatizer.lemmatize(token, pos='v') for token in kept]

In [4]:
# Build one [tokens, label] pair per SMS message.
# The two columns are walked in lockstep instead of indexing each row with
# `data.iloc[i][0]`: positional `[]` access on a label-indexed Series is
# deprecated in pandas, and per-row `iloc` lookups are slow.
# (`data.iterrows()` yields (index, row) pairs, not bare rows.)
message_set = list()
for label, message in zip(data["label"], data["message"]):
    # Lemmatize rather than stem, and drop tokens shorter than three characters.
    words_filtered = [e.lower() for e in preprocess(message, stem=False) if len(e) >= 3]
    message_set.append([words_filtered, label])

# Feature Creation

In [5]:
def get_words_in_messages(messages_and_labels):
    """Flatten the token lists of all (tokens, label) pairs into one list.

    Args:
        messages_and_labels: Iterable of two-item (tokens, label) pairs.

    Returns:
        Every token from every pair, in encounter order, duplicates included.
    """
    collected = []
    for tokens, _ in messages_and_labels:
        collected.extend(tokens)
    return collected

In [6]:
def get_word_features(word_list):
    """Return the distinct words of *word_list* as the feature vocabulary.

    The words are counted with ``nltk.FreqDist``; only its keys are returned,
    so the counts themselves are discarded.
    """
    frequencies = nltk.FreqDist(word_list)
    return frequencies.keys()

In [7]:
def get_labels(messages_and_labels):
    """Return the label of each (tokens, label) pair, in input order."""
    return [tag for _, tag in messages_and_labels]

In [8]:
def get_messages(messages_and_labels):
    """Return the token list of each (tokens, label) pair, in input order."""
    return [tokens for tokens, _ in messages_and_labels]

In [9]:
# Vocabulary: the distinct tokens seen across all preprocessed messages.
word_features = get_word_features(get_words_in_messages(message_set))
# Labels and token lists, each in the same order as message_set.
labels = get_labels(message_set)
messages = get_messages(message_set)
# Report the vocabulary size.
print(len(word_features))

8003


# Training and Testing

In [10]:
def extract_features(document):
    """Map every vocabulary word to whether it occurs in *document*.

    Args:
        document: Iterable of tokens for one message.

    Returns:
        A dict with one key per entry of the module-level ``word_features``;
        each value is True when that word is among the document's tokens.
    """
    present = set(document)
    return {word: (word in present) for word in word_features}

In [11]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single VADER analyzer instance, reused to score every message below.
analyser = SentimentIntensityAnalyzer()

In [12]:
# Score each raw (unprocessed) message with VADER; `scores` holds one result
# per message, in the same order as the `message` column.
sentences = data['message']
scores = [analyser.polarity_scores(sentence) for sentence in sentences]

In [13]:
#Converting List of Dictionaries into Dataframe
scoresDf= pd.DataFrame(scores)
print(scoresDf.columns)

Index(['neg', 'neu', 'pos', 'compound'], dtype='object')


In [14]:
# Assemble one frame: the two unnamed Series become columns 0 (token lists)
# and 1 (labels); the sentiment columns keep their own names.
sentiment_columns = [scoresDf[key] for key in ('neg', 'pos', 'neu', 'compound')]
df = pd.concat([pd.Series(messages), pd.Series(labels)] + sentiment_columns, axis=1)

# Hold out 33% of the messages for testing; the fixed seed keeps the split
# reproducible. Only the token lists and labels are split here.
X_train, X_test, y_train, y_test = train_test_split(df[0], df[1], test_size=0.33, random_state=42)

# Pair every token list with its label, as (tokens, label) tuples.
train_set_labeled = list(zip(X_train, y_train))

test_set_labeled = list(zip(X_test, y_test))

In [15]:
# Wrap each labeled split so that extract_features is applied to the token
# list of every (tokens, label) pair, producing (feature_dict, label) items.
# NOTE(review): apply_features is documented as lazy -- features are presumably
# computed on access rather than all at once; confirm against the NLTK docs.
training_set = nltk.classify.apply_features(extract_features, train_set_labeled, labeled=True)
testing_set = nltk.classify.apply_features(extract_features, test_set_labeled, labeled=True)

In [16]:
# Fit a Naive Bayes classifier on the training split's word-presence features.
spam_classifier = nltk.NaiveBayesClassifier.train(training_set)

In [17]:
# Accuracy on the data the model was trained on, then on the held-out split.
print(nltk.classify.accuracy(spam_classifier, training_set))
print(nltk.classify.accuracy(spam_classifier, testing_set))

0.990356281810876
0.9858618814573138
