In [None]:
import zipfile

# Importing the tarfile module for working with tar files
import tarfile
# Importing all necessary modules
import spacy
import pandas as pd
from pathlib import Path
import sys
import re
import joblib


In [None]:
 # Mount google drive if google colab is being used
# google.colab only exists inside a Colab runtime, so guard the import with the
# same environment check used elsewhere in this notebook; outside Colab this
# cell does nothing instead of raising ModuleNotFoundError.
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
# Load the raw CSV, preferring UTF-8 and falling back to Latin-1.
# Latin-1 (an alias of ISO-8859-1) maps every possible byte to a character, so
# the fallback read cannot raise UnicodeDecodeError; the further ISO-8859-1 and
# cp1252 attempts that used to follow could never run and have been removed.
file_path = '/content/spam.csv'
try:
    emails_df = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    emails_df = pd.read_csv(file_path, encoding='latin1')

In [None]:
# Set base folder and append custome module paths
if 'google.colab' in str(get_ipython()):
  !pip install -U nltk -qq
  !pip install -U spacy -qq
  !python -m spacy download en_core_web_sm -qq
  base_folder = Path('/content/drive/MyDrive/base_folder')
  sys.path.append('/content/drive/MyDrive/custom_functions')
else:
    base_folder = Path('/content/drive/MyDrive/base_folder')
    sys.path.append('/content/drive/MyDrive/custom_functions')

In [None]:
# Sub-folders of the base folder for datasets, archives and saved models
data_folder, archive_folder, model_folder = (
    base_folder / subfolder for subfolder in ('datasets', 'archive', 'models')
)

In [None]:
# Download spacy model
if 'google.colab' in str(get_ipython()):
    !python -m spacy download en_core_web_sm

In [None]:
# Read the csv file into a dataframe, decoding it as ISO-8859-1.
# NOTE(review): this is a relative path, resolved against the kernel's working
# directory, while an earlier cell reads '/content/spam.csv' — confirm both
# refer to the same file.
df = pd.read_csv('spam.csv', encoding="ISO-8859-1")

In [None]:
# Preview the first rows of the freshly loaded data
df.head()

In [None]:
# (number of rows, number of columns) of the loaded data
df.shape

In [None]:
# Install and import the swifter module
!pip install swifter
import swifter

In [None]:
# Rename the two columns that are used and drop the unused columns.
# Chaining into a new frame (instead of inplace=True) and ignoring columns that
# are already gone keeps this cell safe to re-run: the second run used to fail
# with a KeyError because the "Unnamed" columns no longer existed.
df = (
    df
    .rename(columns={'v1': 'label', 'v2': 'message'})
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors='ignore')
)

In [None]:
# Percentage of observations whose label is "spam".
# A vectorised comparison replaces the row-wise swifter apply: the mean of the
# boolean mask is the spam fraction, which is then scaled to a percentage.
perc_of_spam = float((df['label'] == "spam").mean() * 100)

In [None]:
# Report the class balance; ham is whatever share is not spam
perc_of_ham = 100 - perc_of_spam
print("The percentage of spam in the dataset is", perc_of_spam, "While the percentage of ham is", perc_of_ham)

In [None]:
# Import required nltk packages
import nltk
nltk.download('stopwords')  # Download the stopwords corpus
from nltk.corpus import stopwords as nltk_stopwords  # Stopwords corpus

# Import tweet tokenizer from nltk
from nltk.tokenize import TweetTokenizer

# Import CountVectorizer and TfidfVectorizer from scikit-learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Import the joblib library for saving and loading models
import joblib

# Import scikit-learn classes for building models
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin, BaseEstimator

# Import the scipy library for working with sparse matrices
from scipy.sparse import csr_matrix

In [None]:
import custom_preprocessor_mod as cp
from  featurizer import ManualFeatures


In [None]:
# Encode the label column as integers: 1 for spam, 0 for everything else.
# Values that are already the integer 1 are accepted as spam too, so re-running
# this cell leaves the column unchanged instead of turning every label into 0.
df['label'] = df['label'].isin(["spam", 1]).astype(int)

In [None]:
# Number of rows in the dataframe
len(df)

In [None]:
# Preview the data after the label column has been encoded
df.head()

In [None]:
# Clean the raw message text with the project's spaCy-based preprocessor
spacy_preprocessor = cp.SpacyPreprocessor(model="en_core_web_sm")
raw_messages = df['message'].values
df_cleaned = spacy_preprocessor.transform(raw_messages)

In [None]:
# Save the cleaned text to a file.
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
data_folder.mkdir(parents=True, exist_ok=True)
file_df_cleaned_sparse_embed = data_folder / 'df_cleaned_sparse_embed.pkl'
joblib.dump(df_cleaned, file_df_cleaned_sparse_embed)

In [None]:
# Reload the cleaned text from the file written by the previous cell.
# joblib.load unpickles its input, which can execute arbitrary code — only use
# it on files this notebook produced itself.
cleaned_text_df = joblib.load(file_df_cleaned_sparse_embed)

In [None]:
# Create an instance of the project's ManualFeatures class, configured with the
# small English spaCy model
featurizer = ManualFeatures(spacy_model='en_core_web_sm')

In [None]:
# Compute the manual features for the cleaned text (the whole dataset, not a
# train split). The call is unpacked into the feature values and the matching
# feature names.
X_features_values, feature_names = featurizer.fit_transform(cleaned_text_df)

In [None]:
# Build a dataframe from the feature values, using the feature names as the
# column labels
spam_df = pd.DataFrame(X_features_values, columns = feature_names)

In [None]:
# Preview the manual feature columns
spam_df.head()

In [None]:
# Download pyspellchecker
! pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker

# Module-level spell checker, created once and reused by check_spelling
spell = SpellChecker()

def check_spelling(sentence):
    """Count the tokens of `sentence` that the spell checker does not know.

    The sentence is split into runs of word characters/apostrophes and single
    punctuation marks (. , ! ? ;). The tokens are handed to the module-level
    `spell` object, and the size of the collection it returns is the result.
    pyspellchecker documents `unknown` as returning a set, so a token that is
    repeated in the sentence is counted once.
    """
    tokens = re.findall(r"[\w']+|[.,!?;]", sentence)
    unknown_tokens = spell.unknown(tokens)
    return len(unknown_tokens)

In [None]:
# Wrap the cleaned text in a dataframe; with no column names given, the text
# ends up in a column labelled 0 (the integer), which later cells rely on
spam_text = pd.DataFrame(cleaned_text_df)

In [None]:
# Preview the cleaned text
spam_text.head()

In [None]:
# Add a feature column holding check_spelling's result for each cleaned message
spelling_mistake_counts = spam_text[0].swifter.apply(check_spelling)
spam_df['no_of_spelling_mistakes'] = spelling_mistake_counts

In [None]:
# Preview the features, now including the spelling-mistake column
spam_df.head()

In [None]:
# Place the cleaned text and the feature columns side by side
text_frame = pd.DataFrame(spam_text)
feature_frame = pd.DataFrame(spam_df)
spam_df_combined = pd.concat([text_frame, feature_frame], axis=1)

In [None]:
# Display the combined dataset (cleaned text plus feature columns)
spam_df_combined

In [None]:
# Preview the labelled data that is merged with the features in the next cell
df.head()

In [None]:
# Put the labelled data next to the cleaned text and feature columns
labelled_frame = pd.DataFrame(df)
combined_frame = pd.DataFrame(spam_df_combined)
spam_df_combined2 = pd.concat([labelled_frame, combined_frame], axis=1)

In [None]:
# Preview the merged data: labels, original message, cleaned text and features
spam_df_combined2.head()

In [None]:
# Replace the original 'message' column with the cleaned text, which sits in
# the column labelled 0 (the integer).
# The swap only runs while column 0 still exists: once it has been renamed, a
# second run of this cell would otherwise drop the cleaned 'message' column.
if 0 in spam_df_combined2.columns:
    spam_df_combined2 = (
        spam_df_combined2
        .drop('message', axis=1)
        .rename(columns={0: 'message'})
    )


In [None]:
# Preview the final dataset after the message column has been replaced
spam_df_combined2.head()

In [None]:
# Write the final dataset to a CSV file in the working directory, without the
# row index
spam_df_combined2.to_csv('spam_df_combined_with_labels.csv', index=False)


In [None]:
# Offer the exported CSV as a browser download. google.colab only exists inside
# a Colab runtime, so guard the import with the same environment check used
# elsewhere in this notebook; outside Colab the file simply stays on disk.
if 'google.colab' in str(get_ipython()):
    from google.colab import files
    files.download('spam_df_combined_with_labels.csv')
