<a href="https://colab.research.google.com/github/anastasiaarsky/ML_Capstone/blob/main/DataWrangling%26Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import glob
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import numpy as np
import pandas as pd
import re
import string
import time

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Mount Google Drive at /content/drive so files stored there can be read and written
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Project folder on the mounted Drive. Keep the trailing slash: file names are
# appended to this string with `+` elsewhere in the notebook.
DATA_PATH = "/content/drive/My Drive/UCSD Machine Learning Engineering Bootcamp/Capstone Project/"

In [4]:
# Load data from CSV
# NOTE(review): 'Data.csv' is resolved against the current working directory,
# not DATA_PATH — confirm this is intended.
df = pd.read_csv('Data.csv')
#fdf = pd.read_csv('Full_data.csv')

## Text Preprocessing

In [5]:
# Combine Subject and Message into Full Text
# Missing values are replaced with '' before the string conversion; otherwise
# map(str) would turn NaN into the literal text 'nan' and inject it into the corpus.
df['Full_Text'] = df["Subject"].fillna('').map(str) + '. ' + df["Message"].fillna('').map(str)

In [6]:
def normalize_corpus(col, stop_words=None):
    """Normalize every text in ``col`` and return the results as a list of strings.

    Each string in the returned list represents one email/sample, in input order.

    Normalization steps, applied in this order:
      1. http(s) URLs are replaced with the string 'URL'
      2. e-mail addresses written as ``<user@host>`` are replaced with 'email'
      3. runs of digits are replaced with 'number'
      4. the text is lower-cased (so the 'URL' placeholder ends up as 'url')
      5. line breaks and repeated whitespace are collapsed
      6. stopwords are removed

    Args:
        col: iterable of strings (e.g. a DataFrame column).
        stop_words: optional iterable of words to drop. Matching is exact and
            happens after lower-casing. When None (the default) the list
            returned by ``stopwords.words()`` (called without a language
            argument) is used.

    Returns:
        list of str: one normalized string per input text, tokens joined by a
        single space.
    """
    # Build the stopword set once per call. Rebuilding it inside the filter
    # (once per word) re-reads the whole stopword list for every token.
    if stop_words is None:
        stop_words = stopwords.words()
    stop_set = frozenset(stop_words)

    norm_corpus = []
    for text in col:
        text = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', 'URL', text)
        text = re.sub(r'<\S+@\S+>', 'email', text)
        text = re.sub(r'[0-9]+', 'number', text)
        text = text.lower()
        # NOTE(review): inside a character class '|' is a literal, so this also
        # replaces pipe characters with a space — confirm that is intended.
        text = re.sub(r'[\r|\n|\r\n]+', ' ', text)
        # split() with no argument already collapses runs of whitespace and
        # never yields empty tokens, so no extra clean-up is needed.
        kept = [word for word in text.split() if word not in stop_set]
        norm_corpus.append(' '.join(kept))
    return norm_corpus

In [7]:
# pre-process the Full Text column in groups and save the result as Clean Text

def preprocess_data(index, group):
    """Add a normalized 'Clean_Text' column to one group of rows and time the work.

    Args:
        index: group number; used only in the printed timing message.
        group: DataFrame that has a 'Full_Text' column.

    Returns:
        A new DataFrame equal to ``group`` plus a 'Clean_Text' column holding
        the output of normalize_corpus for 'Full_Text'. The frame passed in is
        left unmodified.
    """
    start_time = time.time()
    # assign() returns a new frame, so the caller's chunk (which may be a slice
    # of a larger DataFrame) is not written to.
    cleaned = group.assign(Clean_Text=normalize_corpus(group['Full_Text']))
    # time.time() differences are in seconds, so report seconds.
    print("Execution time for Group %d: %.3f s" % (index, time.time() - start_time))
    return cleaned

# split df into ten groups
df_split = np.array_split(df, 10)

In [None]:
# Call preprocess_data on every group in turn. Each cleaned group is written to
# Groups/Group<N>.csv as soon as it is ready, so groups that finished are
# already on disk if a later one is interrupted.
# The cleaned frame for group N is kept in groups[N - 1].
groups = []
for group_number, chunk in enumerate(df_split, start=1):
    cleaned_group = preprocess_data(group_number, chunk)
    cleaned_group.to_csv(DATA_PATH + 'Groups/Group%d.csv' % group_number, index=False)
    groups.append(cleaned_group)

In [None]:
# concatenate all the groups into clean_df

# Get the list of CSV files in the Groups folder (built from DATA_PATH so the
# location is defined in one place). glob returns matches in arbitrary order,
# so sort them to make the row order of clean_df reproducible; sorting by
# (length, name) puts Group2 before Group10.
path = DATA_PATH + 'Groups'
csv_files = sorted(glob.glob(path + "/*.csv"), key=lambda name: (len(name), name))

# Read each CSV file into a DataFrame (lazily, one file at a time)
df_list = (pd.read_csv(file) for file in csv_files)

# Concatenate all DataFrames and keep only the columns used below
clean_df = pd.concat(df_list, ignore_index=True)
clean_df = clean_df[['Label', 'Clean_Text', 'Full_Text']]

In [None]:
 # Show a sample email
# Second row of the frame: original text next to its cleaned version
clean_df[['Full_Text', 'Clean_Text']].iloc[1].to_dict()

In [None]:
# Keep only the label and the cleaned text from here on
clean_df = clean_df.loc[:, ['Label', 'Clean_Text']]

In [None]:
# Count the missing values in each column
clean_df.isnull().sum()

Label         0
Clean_Text    0
dtype: int64

In [None]:
# Remove rows that contain a missing value, then renumber the index from 0
clean_df = clean_df.dropna(axis=0).reset_index(drop=True)
clean_df.head()

In [None]:
# Write the cleaned frame to CleanData.csv in the project folder
clean_data_file = DATA_PATH + 'CleanData.csv'
clean_df.to_csv(clean_data_file, index=False)

## Data Exploration

In [None]:
# Overview of the cleaned frame: columns, non-null counts and dtypes
clean_df.info()

### Label Insights

In [None]:
# Check for balanced data: share of each label, drawn as a pie chart.
# The column is named 'Label' (capital L), so it must be selected by that exact
# name; attribute access via `.label` raises AttributeError.
label_counts = clean_df['Label'].value_counts()
# One explode offset per slice, so the call works for any number of labels.
label_counts.plot.pie(autopct='%1.1f%%', shadow=True, explode=[0.1] * len(label_counts))

### Text Insights

In [None]:
# Quick summary of the Clean_Text column
clean_df['Clean_Text'].describe()

(count                                    39206
 unique                                   28256
 top       schedule crawler : hourahead failure
 freq                                       185
 Name: Subject, dtype: object,
 count                          39142
 unique                         35412
 top       click here to be removed\n
 freq                              65
 Name: Message, dtype: object)

In [None]:
# Histogram of email length in characters
# (len() of the cleaned string counts characters, not words)
clean_df['Length'] = clean_df['Clean_Text'].apply(len)
clean_df['Length'].plot(bins=50, kind='hist',figsize=(10,7))

Label      0
Email    627
dtype: int64

In [None]:
# Distribution of Length, one histogram per Label value
clean_df.hist(column='Length', by='Label', bins=50, figsize=(15,8))

Unnamed: 0,Label,Email
650,0.0,
1349,0.0,
2226,0.0,
2353,0.0,
3331,0.0,
...,...,...
38630,1.0,
38632,1.0,
38695,1.0,
38723,1.0,


In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

- name identification
- word picture
- most frequent words
- number of urls/numbers