# Final Project
---------------------------------------------------------------------
## CIS 600 Fundamental Data & Knowledge Mining
## Prof. Ying Lin
## 12/3/2022

### Anthony Redamonti, Dana Dippery, Joshua, Hal Baird
### Syracuse University

In [75]:
import os
import csv
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report #, recall_score
from mlxtend.preprocessing import DenseTransformer
from sklearn.model_selection import GridSearchCV

# Not using, should probably delete:
# import re
# import random
# import nltk
# from sklearn.naive_bayes import MultinomialNB
# from nltk.classify.scikitlearn import SklearnClassifier
# from sklearn.feature_extraction.text import TfidfTransformer

In [76]:
# First time using nltk, need to download data!
# nltk.download()

# Get Data from All Sources

In [77]:
# When processing data, check for:
# - file ends with .csv - indicates it has been humanly preprocessed
# - header removed
# - first column is label, second is message
# - labeld as ham/spam

all_data = pd.DataFrame([], columns=["label", "message"])

def process_file(fp):
    for sep in ['\t', ',']:
        for quote in [csv.QUOTE_ALL, csv.QUOTE_NONE, csv.QUOTE_MINIMAL]:
            try:
                d = pd.read_csv(fp, names=["label", "message"], sep=sep, quoting=quote)
                
                # Make sure we processed the data correctly
                # We can confirm by checking the labels... we should get exactly
                #   2 unique labels, "ham" and "spam"
                label_count = len(d["label"].unique())
                if label_count > 2:
                    raise Exception("3+ labels found")
                label_set = sorted(list(d["label"].unique()))
                if label_set != ["ham", "spam"]:
                    raise Exception("ham|spam not in labels")
                
                # Trim the pre/post space off the message, to de-duplicate
                d["message"] = d["message"].str.strip()
                
                # Return results
                return d
            
            except:
                # Ignore errors and try processing again, with different parameters
                pass
    
    # If we none of the processing parameters work, then report it!
    # All files have processed successfully, so we don't see this output.
    print(f"\tFailed: {fp}")

# Find all data files (*csv) in the the DataSets directory
# fp_all_data_out = os.path.join("DataSets", "all_data.csv")
for root, dirs, files in os.walk("DataSets"):
    for name in files:
        if name.endswith('.csv'):
            
            # Process this file
            fp = os.path.join(root, name)
            print(f"Source: {fp}")
            data = process_file(fp)
            
            # Report number of data rows this file contributes after de-duplicating
            original_count = all_data.shape[0]
            all_data = pd.concat([all_data, data], axis=0, ignore_index=True)
            all_data.drop_duplicates(inplace=True, ignore_index=True)
            print(f"\tContributed row count: {all_data.shape[0] - original_count}")

# Print summary data
print(f"Total data rows: {all_data.shape[0]}")
print(all_data.head())

Source: DataSets\httpsarchive.ics.uci.edu\ml\machine-learning-databases\00228\SMSSpamCollection.csv
	Contributed row count: 5158
Source: DataSets\www.kaggle.com_datasets\arunasivapragasam\spam-or-ham\SMSCollection.csv
	Contributed row count: 1
Source: DataSets\www.kaggle.com_datasets\assumewisely\sms-spam-collection\SMSSpamCollection.csv
	Contributed row count: 0
Source: DataSets\www.kaggle.com_datasets\hdza1991\sms-spam\sms_spam.csv
	Contributed row count: 80
Source: DataSets\www.kaggle.com_datasets\kaushikmanjunatha\dataset\SMSSpamCollection.csv
	Contributed row count: 0
Source: DataSets\www.kaggle.com_datasets\lampubhutia\email-spam-ham-prediction\sms_spam.csv
	Contributed row count: 0
Source: DataSets\www.kaggle.com_datasets\nilaychauhan\sms-spam-detection\SMSSpamCollection.csv
	Contributed row count: 0
Source: DataSets\www.kaggle.com_datasets\shravan3273\sms-spam\spamraw.csv
	Contributed row count: 645
Source: DataSets\www.kaggle.com_datasets\shrutipandit707\smsspamcollection\smss

# Prepare Data

In [78]:
df = all_data

In [79]:
def preprocess_stem(document):
    stemmer = PorterStemmer()
    document = document.lower()
    words = word_tokenize(document)
    words = [word for word in words if word not in stopwords.words("english")]
    words = [stemmer.stem(word) for word in words]
    return " ".join(words)

def preprocess_lemmatize(document):
    wordnet_lemmatizer = WordNetLemmatizer()
    document = document.lower()
    words = word_tokenize(document)
    words = [word for word in words if word not in stopwords.words("english")]
    words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]
    return " ".join(words)

def preprocess_stem_and_lemmatize(document):
    stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()
    document = document.lower()
    words = word_tokenize(document)
    words = [word for word in words if word not in stopwords.words("english")]
    words = [stemmer.stem(word) for word in words]
    words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]
    return " ".join(words)

# Train and Analyze Models

In [80]:
X = df['message']
Y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

In [81]:
def train_test_analyze(classifier):
    '''Build, train and test a model based on the input classifier'''
    global X_train, X_test, y_train, y_test
    
    # Get classifier name
    classifier_name = type(classifier).__name__

    # Build training pipeline
    vectorizer = TfidfVectorizer(min_df= 3, sublinear_tf=True, norm='l2', ngram_range=(1, 2),
                                preprocessor=preprocess_stem)
    pipeline = Pipeline([('vect', vectorizer),
                         ('chi',  SelectKBest(chi2, k=1000)),
                         ('to_dense', DenseTransformer()),
                         ('clf', classifier)])
    model = pipeline.fit(X_train, y_train)
    
    #pipeline.steps[3][1].feature_names = list(X_train.columns.values)
    
    # Test the model
    y_preds = model.predict(X_test)
    y_test_accuracy = accuracy_score(y_test, y_preds) * 100
    
    # Output the model scoring/accuracy
    print(f"Classifier: {classifier_name}")
    print(f"Accuracy on test data: {round(y_test_accuracy, 2)}")
    print("Classification Report:")
    print(classification_report(y_test, model.predict(X_test)))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, model.predict(X_test)))
    
    return pipeline

In [82]:
gnbPipeline = train_test_analyze(GaussianNB())

Classifier: GaussianNB
Accuracy on test data: 97.69
Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1577
        spam       0.90      0.89      0.90       199

    accuracy                           0.98      1776
   macro avg       0.94      0.94      0.94      1776
weighted avg       0.98      0.98      0.98      1776

Confusion Matrix:
[[1557   20]
 [  21  178]]


#### Bernoulli Naive Bayes

In [83]:
bnbPipeline = train_test_analyze(BernoulliNB())

Classifier: BernoulliNB
Accuracy on test data: 97.75
Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1577
        spam       0.99      0.80      0.89       199

    accuracy                           0.98      1776
   macro avg       0.98      0.90      0.94      1776
weighted avg       0.98      0.98      0.98      1776

Confusion Matrix:
[[1576    1]
 [  39  160]]


#### Random Forest Classifier (RFC)

In [84]:
rfcPipeline = train_test_analyze(RandomForestClassifier())

Classifier: RandomForestClassifier
Accuracy on test data: 97.86
Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1577
        spam       0.99      0.82      0.90       199

    accuracy                           0.98      1776
   macro avg       0.98      0.91      0.94      1776
weighted avg       0.98      0.98      0.98      1776

Confusion Matrix:
[[1575    2]
 [  36  163]]


In [86]:
#param_grid_rfc = {'n_estimators': [50, 100, 150, 200],
#             'criterion': ['gini', 'entropy'],
#             'max_features': ['sqrt', 'log2', None]}
#rfcClassifier = GridSearchCV(rfcModel, param_grid_rfc)
#rfcClassifier.fit(X_train, y_train)

# Hyperparameters for Best Performing Model
#for key, value in rfcClassifier.best_params_.items():
#    print(f"Hyperparameter: {key}; Value: {value}")

#rfcPipeline.steps[3][1].feature_importances_
#rfcPipeline.steps[3][1].get_feature_names()

#rfcPipeline.steps[3][1].fit(X_train, y_train)
feature_imp = pd.DataFrame(rfcPipeline.steps[3][1].feature_importances_, index=X_train.to_frame().rows,
columns=['importance']).sort_values('importance', ascending=False)
feature_imp

AttributeError: 'DataFrame' object has no attribute 'rows'

In [None]:
pred_rfc = rfcClassifier.best_estimator_.predict(X_test)
print(f"Accuracy: {round( metrics.accuracy_score(y_test, pred_rfc) * 100, 2)}%")

In [None]:
print("Classification Report:")
print(classification_report(y_test, rfcClassifier.best_estimator_.predict(X_test)))

In [None]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, rfcClassifier.best_estimator_.predict(X_test)))

#### K-Nearest Neighbor (KNN)

In [None]:
knnPipeline = train_test_analyze(KNeighborsClassifier())

#### Gradient Boosting Machine (GBM)

In [None]:
gbmPipeline = train_test_analyze(GradientBoostingClassifier())