## Importing Libraries and the Dataset

In [None]:
!pip install lexicalrichness

In [None]:
!pip install rapidfuzz

In [None]:
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
from sklearn.cluster import DBSCAN
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
import spacy
import en_core_web_sm
import csv, sys
from google.colab import drive
import os
from lexicalrichness import LexicalRichness
from rapidfuzz import process, fuzz
import time
from itertools import chain

In [None]:
# Download the NLTK 'stopwords' resource, read later through stopwords.words('english').
nltk.download('stopwords')

In [None]:
# spaCy pipeline used for lemmatization; the "parser" and "ner" components are disabled.
nlp_lemma = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
# spaCy pipeline used for entity extraction; "parser" and "textcat" are disabled, NER stays on.
nlp_ner = spacy.load("en_core_web_sm", disable=["parser", "textcat"])

### Importing the dataset

In [None]:
# Raise the csv module's maximum field size to sys.maxsize.
# NOTE(review): presumably needed because very long e-mail bodies are parsed
# through the csv module when the dataset is read — confirm.
csv.field_size_limit(sys.maxsize)

In [None]:
# Mount Google Drive under /content/drive so the project files can be read and written.
drive.mount('/content/drive')

In [None]:
# Load the cleaned e-mail dataset, which uses '§' as field separator.
# '§' takes two bytes in UTF-8, which pandas' C parser does not support: it
# would fall back to the Python engine and emit a ParserWarning. Asking for
# the Python engine explicitly gives the same result without the warning.
df = pd.read_csv(
    '/content/drive/MyDrive/NLP_Project/df_clean.csv',
    sep='§',
    engine='python')

print("Loaded rows:", len(df))
df.head()

## Exploratory Data Analysis (EDA)

### Feature Engineering and Tokenization

In [None]:
# Length features of each e-mail body.
# Missing bodies are treated as empty text: astype(str) alone would turn NaN
# into the literal string "nan" and report 3 characters / 1 word for them.
body_text = df['Cleaned_Body_n'].fillna('').astype(str)
df['char_length'] = body_text.str.len()
df['word_length'] = body_text.str.split().str.len()

# Characters per whitespace-separated word, computed column-wise;
# e-mails without any word get 0 instead of a division by zero.
df['avg_word_length'] = (
    df['char_length']
    .div(df['word_length'])
    .where(df['word_length'] > 0, 0)
)

In [None]:
# English stop words, kept in a set for the membership tests in tokenize()
stop_words = set(stopwords.words('english'))


def tokenize(text):
    """Turn one e-mail body into a list of lowercase word tokens.

    Missing values yield an empty list. Otherwise the text is lowercased,
    split into runs of word characters (punctuation is ignored), and tokens
    that are stop words or a single character long are dropped.
    """
    if pd.isna(text):
        return []
    kept = []
    for candidate in re.findall(r'\b\w+\b', text.lower()):
        if len(candidate) <= 1 or candidate in stop_words:
            continue
        kept.append(candidate)
    return kept


# One token list per e-mail body
df['tokens'] = df['Cleaned_Body_n'].apply(tokenize)

In [None]:
# Display order of the hierarchical levels, and the plot color of each level
level_order = ["High", "Medium", "Low"]
level_colors = dict(zip(level_order, ['#1f77b4', '#ff7f0e', '#2ca02c']))

In [None]:
# Parse the Date column into datetimes; unparseable values become NaT (errors='coerce').
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

In [None]:
# Monthly period of each e-mail, used as the grouping key of the temporal analysis.
df['year_month'] = df['Date'].dt.to_period('M')

### Summary Statistics

In [None]:
# Number of distinct (sender, level) pairs per hierarchical level
sender_level_pairs = df[['From', 'Level']].drop_duplicates()
people_per_level = sender_level_pairs.groupby('Level').size().reset_index()
people_per_level

In [None]:
# Mean e-mail length in characters for each level
df.groupby('Level')['char_length'].mean()

In [None]:
# Mean e-mail length in words for each level
df.groupby('Level')['word_length'].mean()

In [None]:
# Per-level aggregates: one row per level, one column per statistic
by_level = df.groupby('Level')
tokens_per_email = df['tokens'].apply(len)

summary_stats = pd.DataFrame({
    'Total Emails': by_level.size(),
    'Unique Senders': by_level['From'].nunique(),
    'Average Length (Chars)': by_level['char_length'].mean(),
    'Average Length (Words)': by_level['word_length'].mean(),
    'Average Word Length': by_level['avg_word_length'].mean(),
    'Avg Tokens per Email': tokens_per_email.groupby(df['Level']).mean(),
}).round(2)

summary_stats = summary_stats.reindex(['High', 'Medium', 'Low'])

# One bar chart per statistic, laid out side by side
bar_colors = [level_colors[level] for level in summary_stats.index]
fig, axes = plt.subplots(1, summary_stats.shape[1], figsize=(20, 4))

for panel, column in zip(axes, summary_stats.columns):
    panel.bar(summary_stats.index, summary_stats[column], color=bar_colors)
    panel.set_title(column, fontsize=10)
    panel.tick_params(axis='x', rotation=45)
    panel.set_ylabel('Value')

plt.suptitle('Email Summary Statistics by Hierarchical Level', fontsize=14)
# Leave room at the top of the figure for the suptitle
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

### Lexical Analysis

###### Word frequency

In [None]:
def lemmatize_docs(docs, batch_size=512, n_process=4):
    """Lemmatize a collection of texts with the nlp_lemma pipeline.

    Args:
        docs: iterable of texts to process.
        batch_size: batch size forwarded to nlp_lemma.pipe.
        n_process: number of processes forwarded to nlp_lemma.pipe.

    Returns:
        One list per input text, holding the lowercased lemmas of the tokens
        that are alphabetic, are not stop words and are longer than one
        character.
    """
    def keep(token):
        return token.is_alpha and not token.is_stop and len(token) > 1

    parsed_docs = nlp_lemma.pipe(docs, batch_size=batch_size, n_process=n_process)
    return [
        [token.lemma_.lower() for token in parsed if keep(token)]
        for parsed in parsed_docs
    ]

In [None]:
# Lemmatize every non-missing body, slice by slice, so progress can be printed
texts = df["Cleaned_Body_n"].dropna().astype(str)
chunk_size = 8000
lemmas_combined = []
n_texts = len(texts)

for start in range(0, n_texts, chunk_size):
    end = min(start + chunk_size, n_texts)
    lemmas_combined += lemmatize_docs(
        texts.iloc[start:end].tolist(), batch_size=512, n_process=4)
    print(f"Chunk completed: {start} → {end}")

# Write the lemma lists back on the rows they came from (rows dropped above stay empty)
df.loc[texts.index, "tokens_lemmatized"] = pd.Series(lemmas_combined, index=texts.index)

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# For every level, compare the 20 most frequent raw tokens with the
# 20 most frequent lemmas in two side-by-side horizontal bar charts.
levels = df['Level'].dropna().unique()

for lvl in levels:
    level_mask = df['Level'] == lvl

    # Flatten the per-e-mail lists of this level and count occurrences
    token_counts = Counter(chain.from_iterable(df.loc[level_mask, 'tokens'].dropna()))
    lemma_counts = Counter(chain.from_iterable(df.loc[level_mask, 'tokens_lemmatized'].dropna()))

    top_tokens = token_counts.most_common(20)
    top_lemmas = lemma_counts.most_common(20)

    # zip(*[]) cannot be unpacked into two names, so a level without any
    # token or lemma is skipped instead of aborting the whole loop.
    if not top_tokens or not top_lemmas:
        print(f"Skipping {lvl}: no tokens or lemmas to plot")
        continue

    # Names chosen so the `words` corpus imported from nltk is not overwritten
    token_labels, token_freqs = zip(*top_tokens)
    lemma_labels, lemma_freqs = zip(*top_lemmas)

    fig, axes = plt.subplots(1, 2, figsize=(16, 6), sharex=False)
    # Left: tokens
    axes[0].barh(token_labels, token_freqs)
    axes[0].invert_yaxis()  # most frequent entry on top
    axes[0].set_title(f'{lvl} Level: Top 20 Tokens')
    axes[0].set_xlabel('Frequency')

    # Right: lemmas
    axes[1].barh(lemma_labels, lemma_freqs)
    axes[1].invert_yaxis()
    axes[1].set_title(f'{lvl} Level: Top 20 Lemmas')
    axes[1].set_xlabel('Frequency')

    fig.suptitle(f'Tokens vs Lemmas at {lvl} Level', fontsize=16)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

In [None]:
# Single word cloud built from the lemma frequencies of all levels together
valid_lemmas = df['tokens_lemmatized'].dropna()
all_lemmas = list(chain.from_iterable(valid_lemmas))
lemma_freq = Counter(all_lemmas)

cloud_builder = WordCloud(width=800, height=400, background_color='white')
wc = cloud_builder.generate_from_frequencies(lemma_freq)

plt.figure(figsize=(12, 6))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of most frequent lemmas')
plt.show()

###### Semantic categories

In [None]:
def keyword_group_rate(df, keyword_list, group_name):
    """Return, per level, the share of e-mails containing any of the keywords.

    A boolean column named ``has_<group_name>`` is added to ``df`` in place;
    it is True where ``Cleaned_Body_n`` contains at least one keyword as a
    whole word (case-insensitive). Missing bodies count as False.

    Args:
        df: DataFrame with the columns ``Cleaned_Body_n`` and ``Level``.
        keyword_list: iterable of literal keywords (regex-escaped here).
        group_name: label used for the new column and the returned Series.

    Returns:
        Series indexed by ``Level``, named ``group_name``, holding the mean
        of the boolean column for each level.
    """
    col_name = f'has_{group_name}'
    keywords = list(keyword_list)
    if not keywords:
        # An empty alternation would match at every word boundary and flag
        # every e-mail that contains a word; no keywords means no match.
        df[col_name] = False
    else:
        pattern = r'\b(?:' + '|'.join(map(re.escape, keywords)) + r')\b'
        df[col_name] = df['Cleaned_Body_n'].str.contains(
            pattern, case=False, na=False, regex=True)
    return df.groupby('Level')[col_name].mean().rename(group_name)

# Keyword lists that define each semantic category
Courtesy = ['please', 'thanks', 'thank', 'regards', 'sincerely', 'appreciate', 'best']
Orders = ['confirm', 'forward', 'schedule', 'prepare', 'submit', 'review', 'organize','must','fix', 'should']
Crisis = ['crisis', 'issue', 'problem', 'concern', 'urgent', 'delay', 'fail', 'risk', 'pressure']

# Per-level share of e-mails matching each category, one column per category.
# Each call also adds the matching has_<category> flag column to df.
keyword_groups = [(Courtesy, 'courtesy'), (Orders, 'orders'), (Crisis, 'crisis')]
results_grouped = pd.concat(
    [keyword_group_rate(df, keywords, label) for keywords, label in keyword_groups],
    axis=1)

display(results_grouped.T)

In [None]:
# Grouped bar chart: one cluster per word category, one bar per level

results_plot = results_grouped.T[level_order]

category_ax = results_plot.plot(kind='bar', figsize=(10, 6))
category_ax.set_title('Proportion of Emails per Word Category by Hierarchical Level')
category_ax.set_xlabel('Word Category')
category_ax.set_ylabel('Proportion of Emails')
plt.xticks(rotation=0)
category_ax.legend(title='Hierarchical Level')
plt.tight_layout()
plt.show()

###### Lexical Diversity

In [None]:
# Compute MTLD
def compute_mtld(tokens):
    """Return the MTLD lexical-diversity score of a token list.

    The tokens are joined with spaces and handed to LexicalRichness.
    Returns 0 when there is nothing to measure.
    """
    if not tokens:
        return 0
    lex = LexicalRichness(' '.join(tokens))
    # LexicalRichness tokenizes the joined text again with its own
    # preprocessing, so its word list can be empty even though `tokens` is not.
    # NOTE(review): presumably happens for digit-only tokens — confirm
    # against the installed lexicalrichness version.
    if lex.words == 0:
        return 0
    return lex.mtld()

df['mtld'] = df['tokens'].apply(compute_mtld)

# Mean MTLD per level, rounded to two decimals and put in display order
mean_mtld_by_level = df.groupby('Level')['mtld'].mean()
mtld_stats = mean_mtld_by_level.round(2).reindex(level_order)

In [None]:
# Bar chart of the mean MTLD of each level
title = 'Average MTLD by Hierarchical Level'
mtld_bar_colors = [level_colors[level] for level in level_order]

mtld_fig, mtld_ax = plt.subplots(figsize=(6, 4))
mtld_ax.bar(mtld_stats.index, mtld_stats.values, color=mtld_bar_colors)
mtld_ax.set_title(title)
mtld_ax.set_xlabel('Level')
mtld_ax.set_ylabel('MTLD')
# 10% headroom above the tallest bar
mtld_ax.set_ylim(0, mtld_stats.max() * 1.1)
plt.tight_layout()
plt.show()

In [None]:
# Compute MATTR
def compute_mattr(tokens, window=25):
    """Return the moving-average type-token ratio (MATTR) of a token list.

    Args:
        tokens: list of word tokens of one e-mail.
        window: size of the moving window.

    Returns:
        The MATTR value, or None when the e-mail has fewer words than
        ``window`` and the measure cannot be computed.
    """
    if not tokens or len(tokens) < window:
        return None
    lex = LexicalRichness(' '.join(tokens))
    # LexicalRichness tokenizes the joined text again, so the number of words
    # it works with can be lower than len(tokens); mattr() rejects a window
    # larger than that count, hence the second check on lex.words.
    if lex.words < window:
        return None
    return lex.mattr(window_size=window)

# MATTR of every e-mail (default window)
df['mattr'] = df['tokens'].apply(compute_mattr)

# Mean MATTR per level, rounded to two decimals and put in display order
mean_mattr_by_level = df.groupby('Level')['mattr'].mean()
mattr_stats = mean_mattr_by_level.round(2).reindex(level_order)

In [None]:
# Bar chart of the mean MATTR of each level
mattr_bar_colors = [level_colors[level] for level in level_order]

mattr_fig, mattr_ax = plt.subplots(figsize=(6, 4))
mattr_ax.bar(level_order, mattr_stats.values, color=mattr_bar_colors)
mattr_ax.set_title('Average MATTR by Hierarchical Level')
mattr_ax.set_xlabel('Level')
mattr_ax.set_ylabel('MATTR')
# 10% headroom above the tallest bar
mattr_ax.set_ylim(0, mattr_stats.max() * 1.1)
plt.tight_layout()
plt.show()

#### NER

In [None]:
def extract_entities(texts, batch_size=512, n_process=8):
    """Extract named entities from a collection of texts with nlp_ner.

    Args:
        texts: iterable of texts to process.
        batch_size: batch size forwarded to nlp_ner.pipe.
        n_process: number of processes forwarded to nlp_ner.pipe; the
            default keeps the value that used to be hard-coded.

    Returns:
        One list per input text, holding an ``(entity text, entity label)``
        tuple for every entity found in that text.
    """
    return [
        [(ent.text, ent.label_) for ent in doc.ents]
        for doc in nlp_ner.pipe(texts, batch_size=batch_size, n_process=n_process)]

In [None]:
# Missing bodies are replaced by empty strings in df itself, so every row
# gets an entry in all_entities. `texts` is rebound to a plain list here.
df["Cleaned_Body_n"] = df["Cleaned_Body_n"].fillna("")
texts = df["Cleaned_Body_n"].astype(str).tolist()

chunk_size = 10000
all_entities = []
n_texts = len(texts)

# Run NER slice by slice and report the time spent on each slice
for start in range(0, n_texts, chunk_size):
    end = min(start + chunk_size, n_texts)

    print(f"Processing chunk: {start}–{end}...")
    t0 = time.time()
    chunk_entities = extract_entities(texts[start:end])
    print(f"Done in {time.time() - t0:.2f}s")

    all_entities.extend(chunk_entities)


In [None]:
# All (text, label) pairs of the corpus in a single flat list
flat_entities = [entity for doc_entities in all_entities for entity in doc_entities]

# How often each entity label occurs
entity_counts = Counter([label for _text, label in flat_entities])

print(entity_counts.most_common())

In [None]:
# Frequency of every raw (text, "ORG") entity pair
org_entities = [entity for entity in flat_entities if entity[1] == "ORG"]
org_counts = Counter(org_entities)

print(org_counts.most_common(10))

Since several organization names refer to Enron, we normalize the raw organization names extracted by the NER step, collapsing variants such as “Enron Corp.” or “ENRON Direct” into a single form (“enron”). We then recount the mentions of each organization and plot the top 10 by frequency, which gives a more accurate view of organization mentions in our corpus.


In [None]:
# Whole-word corporate suffixes that carry no identifying information
_ORG_SUFFIX_RE = re.compile(r"\b(?:inc|corp|company|co|ltd|online)\b")
# Possessive ending, with a straight or a typographic apostrophe
_ORG_POSSESSIVE_RE = re.compile(r"['’]s\b")


def normalize_org(name: str) -> str:
    """Reduce a raw organization name to a canonical lowercase form.

    Steps, in order: lowercase and trim; drop possessive endings; remove
    backslashes and quote characters; remove corporate suffixes; collapse
    whitespace and trim dangling spaces, dots and commas; finally map every
    name that starts with the word "enron" to plain "enron".

    Args:
        name: organization name as extracted by NER.

    Returns:
        The normalized name; may be an empty string.
    """
    name = name.lower().strip()
    # The possessive must go before the apostrophes are stripped below,
    # otherwise "enron's" turns into "enrons" and is never recognised.
    name = _ORG_POSSESSIVE_RE.sub("", name)
    name = name.replace("\\", "").replace('"', "").replace("'", "")
    name = _ORG_SUFFIX_RE.sub("", name)
    # Removing "inc" from "dynegy inc." leaves "dynegy ." behind, so the
    # leftover punctuation is trimmed together with the whitespace.
    name = re.sub(r"\s+", " ", name).strip(" .,")
    name = re.sub(r"^enron\b.*", "enron", name)
    return name

In [None]:
# Raw text of every entity labelled as an organization
raw_names = [text for text, label in flat_entities if label == "ORG"]

# Normalize the names and drop those that end up empty
normalized = []
for raw_name in raw_names:
    cleaned_name = normalize_org(raw_name)
    if cleaned_name.strip():
        normalized.append(cleaned_name)

# E-mail boilerplate that is wrongly tagged as an organization
blacklist = {"fyi", "log", "re", "fw", "hi", "please", "thanks", "en"}

# Keep names longer than two characters that are not blacklisted
filtered = [n for n in normalized if not (len(n) <= 2 or n in blacklist)]
org_counts = Counter(filtered)

In [None]:
# Ten most mentioned normalized organizations, most frequent first
top_orgs = org_counts.most_common(10)
plot_df = pd.DataFrame(top_orgs, columns=["Organization","Mentions"])
plot_df = plot_df.sort_values("Mentions", ascending=False)

org_fig, org_ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=plot_df, x="Mentions", y="Organization", color="steelblue", ax=org_ax)
org_ax.set_title("Top 10 Named Organizations")
org_ax.set_xlabel("Mentions")
org_ax.set_ylabel("Organization")
plt.tight_layout()
plt.show()

### Email verbosity

In [None]:
# First and third quartile of the e-mail length in words
q_low, q_high = df['word_length'].quantile([0.25, 0.75])


def verbosity_label(n):
    """Classify a word count as "Concise", "Verbose" or "Normal".

    Counts at or below the first quartile (q_low) are "Concise", counts at
    or above the third quartile (q_high) are "Verbose", the rest "Normal".
    The lower bound is tested first.
    """
    if n <= q_low:
        return "Concise"
    if n >= q_high:
        return "Verbose"
    return "Normal"


df['verbosity'] = df['word_length'].apply(verbosity_label)

In [None]:
# Inspect the columns available in df at this point
print(df.columns.tolist())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

verbosity_order = ['Verbose', 'Concise', 'Normal']

# E-mail counts per (level, verbosity), as a level x verbosity table;
# combinations that never occur are filled with 0
level_verbosity_sizes = df.groupby(['Level', 'verbosity']).size()
counts_lvl = level_verbosity_sizes.unstack(fill_value=0).reindex(
    index=level_order,
    columns=verbosity_order,
    fill_value=0
)

# Turn each level's counts into proportions of that level's e-mails
level_totals = counts_lvl.sum(axis=1)
props_lvl = counts_lvl.div(level_totals, axis=0)

# Verbosity categories on the x axis, one bar per level
props_transposed = props_lvl.T.reindex(
    index=verbosity_order,
    columns=level_order
)

fig, ax = plt.subplots(figsize=(10, 6))
props_transposed.plot(kind='bar', ax=ax, width=0.8)

ax.set(
    xlabel="Verbosity Category",
    ylabel="Proportion of Emails",
    title="Proportion of Emails per Level\nacross Verbosity Categories",
)
# Legend placed outside the axes, to the right
ax.legend(title="Hierarchical Level", bbox_to_anchor=(1.02, 1), loc='upper left')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

### Temporal Analysis

In [None]:
# Distinct senders per month and level
active_users = (df.groupby(['year_month', 'Level'])['From'].nunique().unstack())

# E-mails sent per month and level
emails_by_month_and_level = df.groupby(['year_month', 'Level']).size().unstack()

# E-mails divided by the number of active senders, columns in display order
normalized_per_active_user = emails_by_month_and_level.divide(active_users)

normalized_per_active_user = normalized_per_active_user.reindex(columns=level_order)

colors = [level_colors[level] for level in level_order]

# Plot
ax = normalized_per_active_user.plot(
    kind='line',
    figsize=(12, 5),
    marker='o',
    color=colors
)
# The plotted series is a ratio (e-mails / active senders), not a raw count,
# so the title and the y label say so.
plt.title("Number of emails per active sender over time")
plt.xlabel("Month")
plt.ylabel("Emails per active sender")
plt.grid(True)
plt.legend(title='Level')
plt.tight_layout()
plt.show()

In [None]:
# Mean e-mail length in words per month, one column per level in display order
monthly_word_means = df.groupby(['year_month', 'Level'])['word_length'].mean()
avg_length_by_level = monthly_word_means.unstack().reindex(columns=level_order)

colors = [level_colors[level] for level in level_order]

ax = avg_length_by_level.plot(kind='line', figsize=(12, 5), marker='o', color=colors)

ax.set_title("Average email length (in words) by level over time")
ax.set_xlabel("Month")
ax.set_ylabel("Average number of words")
ax.grid(True)
plt.tight_layout()
ax.legend(title='Level')
plt.show()

In [None]:
# Monthly share of e-mails flagged by the has_crisis column, per level;
# (month, level) combinations without e-mails are filled with 0
monthly_crisis_share = df.groupby(['year_month', 'Level'])['has_crisis'].mean()
crisis_by_level = monthly_crisis_share.unstack(fill_value=0)

crisis_by_level = crisis_by_level.reindex(columns=level_order)

colors = [level_colors[level] for level in level_order]

ax = crisis_by_level.plot(kind='line', figsize=(14, 6), marker='o', color=colors)

ax.set_title("Proportion of emails containing crisis-related words by level over time")
ax.set_xlabel("Month")
ax.set_ylabel("Proportion of crisis emails")
ax.grid(True)
ax.legend(title='Level')
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# E-mail counts per month and verbosity category, months in chronological order
monthly_verbosity_sizes = df.groupby(['year_month', 'verbosity']).size()
counts = monthly_verbosity_sizes.unstack(fill_value=0).sort_index()

# One line per verbosity category
ax = counts.plot(figsize=(14, 6), marker='o')
# NOTE(review): the axis labels are in Italian while the title is in English;
# left unchanged here — confirm which language the figures should use.
ax.set(
    title="Number of emails by verbosity level over time",
    xlabel="Mese",
    ylabel="Numero di email",
)
ax.grid(True)
ax.legend(title='Verbosity')
plt.tight_layout()
plt.show()

### Pre Topic Modelling Analysis

In [None]:
raw_docs = df['Cleaned_Body_n'].dropna().astype(str).tolist()

# One space-joined string of lemmas per e-mail that has a lemma list
valid_lemmas = df["tokens_lemmatized"].dropna()
documents_str = [" ".join(lemma_list) for lemma_list in valid_lemmas]

# Bag-of-words model over the lemmatized documents
vectorizer = CountVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=5,
    max_df=0.6,
    max_features=10000  # limit to most frequent 10k terms
)
dtm = vectorizer.fit_transform(documents_str)

# Vocabulary and total count of each term, most frequent first
vocab = vectorizer.get_feature_names_out()
print(f"Vocabulary size after lemmatization: {len(vocab)}")

word_counts = np.asarray(dtm.sum(axis=0)).ravel()
freq_df = pd.DataFrame({'term': vocab, 'count': word_counts})
freq_df = freq_df.sort_values('count', ascending=False)

print("\nTop 10 most frequent lemmatized words:")
print(freq_df.head(10))

print("\nTop 10 least frequent lemmatized words (but ≥ min_df):")
print(freq_df.tail(10))

# Sparsity: share of zero cells in the document-term matrix
n_docs, n_terms = dtm.shape
total_elements   = n_docs * n_terms
nonzero_elements = dtm.nnz
sparsity         = 1.0 - (nonzero_elements / total_elements)
print(f"\nSparsity of the lemmatized document-term matrix: {sparsity:.4f}")

In [None]:
# Space-joined string version of each token list and each lemma list.
# NOTE(review): rows without a lemma list (NaN in tokens_lemmatized)
# presumably stay NaN in tokens_lemmatized_str — confirm.
df['tokens_str'] = df['tokens'].str.join(' ')
df['tokens_lemmatized_str'] = df['tokens_lemmatized'].str.join(' ')

### Saving Dataset

In [None]:
# Write the enriched DataFrame to Drive, with '§' as field separator.
# NOTE(review): no index argument is passed, so the DataFrame index is
# presumably written as an extra first column, and the list-valued columns
# are presumably stored as their text representation — confirm that this is
# what the notebooks reading data.csv expect.
drive_path = '/content/drive/MyDrive/NLP_Project/data.csv'

df.to_csv(drive_path,sep='§')