In [2]:
import nltk
import numpy as np
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Make sure the VADER lexicon used by SentimentIntensityAnalyzer is available.
# Only contact the NLTK download server when the resource is missing locally,
# so re-running the notebook does not need network access every time.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")

In [None]:
# Load the raw dataset into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer='../DATA/raw.csv')

In [None]:
# Encode the label as the binary target column 'email_type':
#   1 -> 'Email Type' is exactly "Phishing Email"
#   0 -> anything else (a missing label also compares unequal, so it maps to 0)
# A vectorized comparison replaces the per-row Python lambda; the explicit
# 'int64' cast keeps the same integer dtype on every platform.
df['email_type'] = (df['Email Type'] == "Phishing Email").astype('int64')

In [None]:
# Remove the columns that are not used as features: 'index', 'Unnamed: 0' and
# 'Unnamed: 0.1' (presumably saved-index artifacts of earlier CSV exports —
# TODO confirm) and the original string label 'Email Type'.
unused_columns = ['index', 'Unnamed: 0', 'Unnamed: 0.1', 'Email Type']
df = df.drop(columns=unused_columns)

In [None]:
# Use a snake_case name for the text column
df = df.rename({"Email Text": "email_text"}, axis="columns")

In [None]:
# Single shared VADER analyzer, reused for every email
sia = SentimentIntensityAnalyzer()


def extract_sentiment(text):
    """Return the VADER polarity scores of `text` as ``[neg, neu, pos, compound]``.

    Parameters
    ----------
    text : str
        Text to score. Any non-string value (e.g. NaN or None for a missing
        email body) is scored as an empty string instead of raising inside
        the analyzer.

    Returns
    -------
    list
        The 'neg', 'neu', 'pos' and 'compound' entries of
        ``sia.polarity_scores(text)``, in that order.
    """
    if not isinstance(text, str):
        # The analyzer only accepts str; treat missing / non-text input as empty text.
        text = ""
    scores = sia.polarity_scores(text)
    return [scores[key] for key in ('neg', 'neu', 'pos', 'compound')]

In [None]:
# Score every email body with `extract_sentiment` and store the four values it
# returns per email in the columns 'neg', 'neu', 'pos' and 'compound'
# (alongside the existing 'email_text' and 'email_type' columns).
# The scores are collected into a frame built on df.index rather than unpacked
# with zip(*...): rows stay explicitly aligned, and a DataFrame with no rows no
# longer fails with "not enough values to unpack".
sentiment_columns = ['neg', 'neu', 'pos', 'compound']
sentiment_scores = pd.DataFrame(
    df['email_text'].apply(extract_sentiment).tolist(),
    columns=sentiment_columns,
    index=df.index,
)
df[sentiment_columns] = sentiment_scores

In [None]:
# Checkpoint the current DataFrame as CSV.
# NOTE: with pandas' default index=True the row index is written too, as an
# unnamed first column.
df.to_csv(path_or_buf='../DATA/sentiment.csv')

In [10]:
# TF-IDF over unigrams and bigrams of the email text, with English stop words
# removed and the vocabulary capped at 5000 features.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=5000,
)

# Fit on the email text and densify the result so every vocabulary term becomes
# an ordinary column, one row per email (same index as df).
tf_idf_matrix = vectorizer.fit_transform(df['email_text']).toarray()
tf_idf_df = pd.DataFrame(
    data=tf_idf_matrix,
    index=df.index,
    columns=vectorizer.get_feature_names_out(),
)

# Replace the raw 'email_text' column with its TF-IDF encoding to form the
# final dataset.
# NOTE(review): a vocabulary term equal to an existing column name (e.g.
# 'compound' or 'pos') would produce duplicate column labels here — TODO
# confirm this cannot happen with this corpus.
df = pd.concat([df.drop(columns=['email_text']), tf_idf_df], axis=1)

In [11]:
# Persist the final feature table as Parquet
df.to_parquet(path='../DATA/email.parquet')