In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Pre-processing & Lemmatization



In [None]:
import pandas as pd
from sklearn.utils import shuffle

In [None]:
import glob
import nltk
import re, unicodedata

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
!pip install contractions
import contractions

Collecting contractions
  Downloading contractions-0.0.55-py2.py3-none-any.whl (7.9 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting anyascii
  Downloading anyascii-0.3.0-py3-none-any.whl (284 kB)
[K     |████████████████████████████████| 284 kB 5.2 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.2.tar.gz (321 kB)
[K     |████████████████████████████████| 321 kB 31.8 MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.2-cp37-cp37m-linux_x86_64.whl size=85451 sha256=533fe88905b6ff009b33911bba932a57519ca94cbe1bdc300e4d918377640f4a
  Stored in directory: /root/.cache/pip/wheels/25/19/a6/8f363d9939162782bb8439d886469756271abc01f76fbd790f
Successfully built pyahocorasick
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully install

In [None]:
def denoise_data(content):
    """Return ``content`` as plain text with contractions rewritten.

    The value is first coerced with ``str``, then parsed with BeautifulSoup's
    ``html.parser`` to keep only the text, and finally passed through
    ``contractions.fix``.
    """
    raw_text = str(content)
    plain_text = BeautifulSoup(raw_text, 'html.parser').text
    return contractions.fix(plain_text)

In [None]:
def _english_stopwords():
    """Return the NLTK English stopword list as a frozenset, loading it once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(nltk.corpus.stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def normalize_data(unprocessed_content, max_word_length=20):
    """Normalize raw text into a space-separated string of verb lemmas.

    Steps, applied per token produced by ``nltk.word_tokenize``:
      1. drop tokens longer than ``max_word_length`` characters,
      2. fold to ASCII (NFKD, non-ASCII code points dropped),
      3. lowercase,
      4. replace punctuation with spaces and re-split on whitespace,
      5. strip digits,
      6. drop English stopwords,
      7. lemmatize as a verb (``pos='v'``).

    This function does not strip HTML markup or expand contractions;
    ``denoise_data`` in this notebook does that and can be applied first.

    Args:
        unprocessed_content: Any value; it is coerced with ``str``.
        max_word_length: Longest raw token (in characters) that is kept.
            Defaults to 20.

    Returns:
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    content = str(unprocessed_content)

    # Line breaks carry no meaning for the bag-of-words output.
    content = content.replace('\n', ' ').replace('\r', ' ')

    # Hoisted out of the token loop: the stopword lookup is a set membership
    # test instead of re-reading the corpus list for every word.
    stop_words = _english_stopwords()
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token in nltk.word_tokenize(content):
        # Remove long words
        if len(token) > max_word_length:
            continue

        # Remove non-ASCII characters, then lowercase
        token = unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        token = token.lower()

        # Remove punctuation. A token such as "e-mail" or "n't" turns into
        # several whitespace-separated pieces; each piece is handled as its
        # own word so stopword removal and lemmatization see single words.
        for piece in re.sub(r'[^\w\s]', ' ', token).split():
            # Remove numbers
            piece = re.sub(r'\d+', '', piece)
            if piece == '' or piece in stop_words:
                continue

            lemma = lemmatizer.lemmatize(piece, pos='v').strip()
            if lemma != '':
                lemmas.append(lemma)

    return ' '.join(lemmas)

# Preprocess Neethu Data

In [None]:
unprocessed_df = pd.read_csv('/content/drive/MyDrive/6220 Project/SpamDatasets (1)/data.csv')

In [None]:
print(unprocessed_df['Label'].unique())
unprocessed_df.head(2)

[0 1]


Unnamed: 0,Content,Label
0,"Re: New Sequences Window Date: Wed, ...",0
1,[zzzzteana] RE: AlexanderMartin A posted:\nTas...,0


In [None]:
unprocessed_df.isna().sum()

Content    0
Label      0
dtype: int64

In [None]:
# Replace every raw 'Content' value with its normalized, lemmatized form.
# NOTE(review): denoise_data (HTML stripping + contraction fixing) is defined
# earlier in this notebook but is not applied before normalize_data here -
# confirm whether that is intentional.
unprocessed_df['Content'] = unprocessed_df['Content'].apply(normalize_data)

In [None]:
unprocessed_df.to_csv('/content/drive/MyDrive/6220 Project/SpamDatasets (1)/processed_data/neethu_veleon.csv')

In [None]:
# The file was saved with the DataFrame index as its first column, so read
# that column back as the index instead of as a stray 'Unnamed: 0' column.
neethu_data = pd.read_csv('/content/drive/MyDrive/6220 Project/SpamDatasets (1)/processed_data/neethu_veleon.csv', index_col=0)

In [None]:
# Drop rows whose (Content, Label) pair repeats, keeping the last occurrence,
# and report the number of redundant rows before and after.
print('Duplicate count before: {}'.format(neethu_data.duplicated(subset=['Content', 'Label']).sum()))
neethu_data = (
    neethu_data
    .drop_duplicates(subset=['Content', 'Label'], keep='last')
    .reset_index(drop=True)
)
print('Duplicate count after: {}'.format(neethu_data.duplicated(subset=['Content', 'Label']).sum()))

Duplicate count before: 176
Duplicate count after: 0


# Preprocess Neha Data

In [None]:
neha_data = pd.read_csv('/content/drive/MyDrive/6220 Project/SpamDatasets (1)/CSDMC2010_RAW/Neha_dataframes/subject-body-unprocessed.csv')

In [None]:
neha_data.head(2)

Unnamed: 0.1,Unnamed: 0,Content,spam_label
0,0,Anolther sequence related traceback Just got t...,1
1,1,Reg Headlines Monday July 22 Today's Headlines...,1


In [None]:
# Replace every raw 'Content' value with its normalized, lemmatized form.
# NOTE(review): denoise_data is not applied first, so HTML tag names can
# survive as ordinary tokens in the output - confirm whether that is intended.
neha_data['Content'] = neha_data['Content'].apply(normalize_data)

In [None]:
neha_data.isna().sum()

Unnamed: 0    0
Content       0
spam_label    0
dtype: int64

In [None]:
# Align the label column name with the other datasets and keep only the
# text/label pair.
neha_data = neha_data.rename(columns={'spam_label':'Label'}).loc[:, ['Content', 'Label']]

In [None]:
# Drop rows whose (Content, Label) pair repeats, keeping the last occurrence,
# and report the number of redundant rows before and after.
print('Duplicate count before: {}'.format(neha_data.duplicated(subset=['Content', 'Label']).sum()))
neha_data = (
    neha_data
    .drop_duplicates(subset=['Content', 'Label'], keep='last')
    .reset_index(drop=True)
)
print('Duplicate count after: {}'.format(neha_data.duplicated(subset=['Content', 'Label']).sum()))

Duplicate count before: 258
Duplicate count after: 0


In [None]:
neha_data.head()

Unnamed: 0,Content,Label
0,anolther sequence relate traceback get read ma...,1
1,reg headline monday july today headline regist...,1
2,spam health problems forget us content type te...,0
3,ilug instal lilo another disk tue jul pm rumou...,1
4,jm private eye html body table border align ce...,0


In [None]:
neha_data.to_csv('/content/drive/MyDrive/6220 Project/SpamDatasets (1)/processed_data/neha_chandraseta.csv')

In [None]:
neha_data = pd.read_csv('/content/drive/MyDrive/6220 Project/SpamDatasets (1)/processed_data/neha_chandraseta.csv', index_col=0)

In [None]:
neha_data.head()

Unnamed: 0,Content,Label
0,anolther sequence relate traceback get read ma...,1
1,reg headline monday july today headline regist...,1
2,spam health problems forget us content type te...,0
3,look sandy dorm html body center table bgcolor...,0
4,ilug instal lilo another disk tue jul pm rumou...,1


# Preprocess ENRON Data

In [None]:
# path to dir = '/content/drive/MyDrive/6220 Project/ENRON/data/preprocessed'
# enron_subset = 'enron1'...
def format_enron_data(path_to_dir, enron_subset):
  """Load one ENRON subset's ham and spam CSVs as a single labelled frame.

  Reads ``<path_to_dir>/<enron_subset>_ham.csv`` and
  ``<path_to_dir>/<enron_subset>_spam.csv``, keeps only the 'Content' and
  'Label' columns, stacks ham rows above spam rows, and maps the labels
  'ham' -> 0 and 'spam' -> 1. The unique label values are printed before
  and after the mapping.

  Returns the combined DataFrame; the original per-file row indices are kept.
  """
  csv_paths = (
    f'''{path_to_dir}/{enron_subset}_ham.csv''',
    f'''{path_to_dir}/{enron_subset}_spam.csv''',
  )
  frames = [pd.read_csv(csv_path)[['Content', 'Label']] for csv_path in csv_paths]
  data = pd.concat(frames)

  label_codes = {'ham':0, 'spam':1}
  print(data['Label'].unique())
  data['Label'] = data['Label'].map(label_codes)
  print(data['Label'].unique())
  return data

In [None]:
# Load the ENRON subsets enron1 ... enron6 through format_enron_data and
# collect the resulting frames in a list.
formatted_dfs = []
path_to_dir = '/content/drive/MyDrive/6220 Project/ENRON/data/preprocessed'
for i in range(1, 7):
  df = format_enron_data(path_to_dir, f'enron{i}')
  formatted_dfs.append(df)
  print(len(df))  # row count of this subset
print(len(formatted_dfs))  # number of subsets loaded

['ham' 'spam']
[0 1]
4144
['ham' 'spam']
[0 1]
4923
['ham' 'spam']
[0 1]
4708
['ham' 'spam']
[0 1]
5042
['ham' 'spam']
[0 1]
4897
['ham' 'spam']
[0 1]
5928
6


In [None]:
enron_data = pd.concat(formatted_dfs)
print(len(enron_data))

29642


In [None]:
# Shuffle with a fixed seed so the row order is the same on every run of the
# notebook.
enron_data = shuffle(enron_data, random_state=42)

In [None]:
enron_data.head(5)

Unnamed: 0,Content,Label
2839,save money get oem software need software pc v...,1
918,yr fix home loan point flu dear homeowner yr f...,1
1063,contract per phone conversation reliant reques...,0
476,ilug social guarantee lose lbs days think migh...,1
64,enpower eol data october th eol deal enpower d...,0


In [None]:
# Drop rows whose (Content, Label) pair repeats, keeping the last occurrence,
# and report the number of redundant rows before and after.
print('Duplicate count before: {}'.format(enron_data.duplicated(subset=['Content', 'Label']).sum()))
enron_data = (
    enron_data
    .drop_duplicates(subset=['Content', 'Label'], keep='last')
    .reset_index(drop=True)
)
print('Duplicate count after: {}'.format(enron_data.duplicated(subset=['Content', 'Label']).sum()))

Duplicate count before: 2632
Duplicate count after: 0


In [None]:
enron_data.to_csv('/content/drive/MyDrive/6220 Project/SpamDatasets (1)/processed_data/enron_combined.csv')

# Combine all data

In [None]:
# Stack the three cleaned datasets, then shuffle with a fixed seed so that
# re-running the notebook produces the same row order.
fully_combined_data = pd.concat([neha_data, neethu_data, enron_data])
fully_combined_data = shuffle(fully_combined_data, random_state=42)

In [None]:
fully_combined_data = fully_combined_data[['Content', 'Label']]

In [None]:
len(fully_combined_data)

33754

In [None]:
fully_combined_data.isna().sum()

Content    0
Label      0
dtype: int64

In [None]:
len(fully_combined_data[['Content', 'Label']])-len(fully_combined_data[['Content', 'Label']].drop_duplicates())

2

In [None]:
# Drop rows whose (Content, Label) pair repeats across the combined datasets,
# keeping the last occurrence, and report the redundant-row count before and
# after.
print('Duplicate count before: {}'.format(fully_combined_data.duplicated(subset=['Content', 'Label']).sum()))
fully_combined_data = (
    fully_combined_data
    .drop_duplicates(subset=['Content', 'Label'], keep='last')
    .reset_index(drop=True)
)
print('Duplicate count after: {}'.format(fully_combined_data.duplicated(subset=['Content', 'Label']).sum()))

Duplicate count before: 2
Duplicate count after: 0


In [None]:
# Persist the final dataset. With index=False and header=False the CSV holds
# only the data columns and no header row, so any reader of this file has to
# supply the column names itself.
fully_combined_data.to_csv('/content/drive/MyDrive/6220 Project/SpamDatasets (1)/processed_data/fully_combined_data.csv', index=False, header=False)

In [None]:
fully_combined_data['Label'].unique()

array([1, 0])