In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer

In [11]:
# Read the raw email dataset from ../DATA/raw.csv into the working frame `df`.
df = pd.read_csv(filepath_or_buffer='../DATA/raw.csv')

In [12]:
# Create the binary label column 'email_type':
#   1 when 'Email Type' equals "Phishing Email", 0 for every other value
#   (a missing value compares unequal, so it also maps to 0).
# The comparison is vectorised over the whole column instead of calling a
# Python lambda once per row; the result is the same 0/1 integer column.
df['email_type'] = df['Email Type'].eq("Phishing Email").astype('int64')

In [13]:
# Remove the columns that are not used further down: 'Email Type' is already
# encoded as 'email_type'; 'index', 'Unnamed: 0' and 'Unnamed: 0.1' look like
# saved-index leftovers (assumption from their names -- confirm against raw.csv).
# The labels are passed as a list (not a set literal) so their order is fixed.
# NOTE: a label that is absent raises KeyError, so re-running this cell on the
# already trimmed frame fails -- reload `df` first.
df = df.drop(columns=['index', 'Unnamed: 0', 'Unnamed: 0.1', 'Email Type'])

In [14]:
# Give the text column a snake_case name, in line with 'email_type'.
df = df.rename({"Email Text": "email_text"}, axis="columns")

In [15]:
# Display the column labels of `df` as a quick check of the frame's current schema.
df.columns

Index(['email_text', 'email_type'], dtype='object')

In [14]:
# Install NLTK into the environment used by the running kernel.
# NOTE(review): the first cell already imports from nltk, so on a fresh
# environment this install has to run before that import -- consider moving this
# cell to the top of the notebook and pinning a version.
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [16]:
import nltk

In [17]:
# Download the 'vader_lexicon' NLTK resource (presumably the lexicon that
# SentimentIntensityAnalyzer loads when it is constructed -- confirm in the NLTK docs).
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/willmayer/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [19]:
# Create one analyzer instance up front; extract_sentiment reuses it on every
# call instead of building a new analyzer per email.
sia = SentimentIntensityAnalyzer()

def extract_sentiment(text):
    """Return the sentiment scores of one email as ``[neg, neu, pos, compound]``.

    Parameters
    ----------
    text : str or missing value
        Email body. ``None`` / NaN / ``pd.NA`` are treated as "no text"; any
        other non-string value is converted with ``str()`` before scoring.

    Returns
    -------
    list of float
        The 'neg', 'neu', 'pos' and 'compound' entries of
        ``sia.polarity_scores(text)``, in that order. For missing text all four
        values are 0.0 and the analyzer is not called.
    """
    if not isinstance(text, str):
        # An empty CSV cell is read as NaN (a float), not as a string, so it must
        # not be handed to the analyzer as-is.
        if text is None or pd.isna(text):
            return [0.0, 0.0, 0.0, 0.0]
        text = str(text)
    scores = sia.polarity_scores(text)
    return [scores[key] for key in ('neg', 'neu', 'pos', 'compound')]

In [20]:
# Calculate sentiment scores for each email text. Each observation now has features
# 'email_text', 'email_type', and 'neg', 'neu', 'pos', 'compound' (the email's
# sentiment scores).
# The per-row score lists are assembled into one frame that shares df's index, so
# the assignment is row-aligned, and an empty `df` yields four empty columns
# instead of failing while unpacking.
# NOTE: extract_sentiment runs once per row in plain Python, so this cell can take
# a long time on a large dataset.
sentiment_columns = ['neg', 'neu', 'pos', 'compound']
sentiment_scores = df['email_text'].apply(extract_sentiment)
df[sentiment_columns] = pd.DataFrame(
    sentiment_scores.tolist(),
    index=df.index,
    columns=sentiment_columns,
    dtype='float64',
)

KeyboardInterrupt: 

In [22]:
# Write the current `df` to CSV.
# index=False keeps the row index out of the file; otherwise it is written as an
# unnamed first column that comes back as 'Unnamed: 0' when the file is re-read.
df.to_csv('../OUTPUT/sentiment.csv', index=False)

In [23]:
# Unigram + bigram TF-IDF over the email text, with English stop words removed
# and the vocabulary limited via max_features=5000.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=5000)

# Transform text data into a dense array of TF-IDF values. Missing text is
# vectorised as an empty document, because the vectorizer rejects NaN entries.
# NOTE: .toarray() materialises one dense row per email (up to 5000 columns each),
# which can be memory-heavy on a large dataset.
tf_idf_matrix = vectorizer.fit_transform(df['email_text'].fillna('')).toarray()

# A term can coincide with a column that already exists in df (e.g. 'pos' or
# 'compound'); only those terms get a 'tfidf_' prefix, so the combined frame
# never ends up with duplicate column names.
tf_idf_columns = [
    f'tfidf_{term}' if term in df.columns else term
    for term in vectorizer.get_feature_names_out()
]

# Reuse df's index so the concat below pairs rows by position even when df no
# longer has a default 0..n-1 index.
tf_idf_df = pd.DataFrame(tf_idf_matrix, columns=tf_idf_columns, index=df.index)

# Concatenate the TF-IDF features with the original dataframe to make established dataset
df = pd.concat([df, tf_idf_df], axis=1)

In [26]:
# Persist the current `df` to ../OUTPUT/email.parquet in Parquet format.
df.to_parquet(path='../OUTPUT/email.parquet')