In [6]:
# The input dataset should already be the email dataset. 
# We only need three columns "id", "user", and "content" to analyse email content
# We are also using the chunks that we generated using the email_chunk_generator file

# The output dataset has six columns, 2 of which are user and day.
# This script first drops all columns except the three. In this script, we clean the content of the email (i.e removing irrelevant characters, punctuation, and stop words (common words that don't add much meaning to the text) )
# Then we divide the preprocessed text into individual tokens, which are meaningful units of text, typically words or phrases. This step breaks down the text into a format suitable for LDA processing
# Then we construct a document-term matrix (DTM), where each row represents an email and each column represents a term from the vocabulary. The cells of the matrix contain the frequency of each term in each email.
# We then create a vocabulary of unique tokens extracted from the entire email dataset. This vocabulary will be used to represent the words in each email.
# Next, we apply LDA to the DTM to identify hidden topics within the email corpus. LDA assumes that each email is a mixture of these topics, and the model learns the probability distribution of topics for each email.
# For each email, we extract the probability distribution of topics, representing the likelihood of each topic's presence in that email. This topic distribution vector will serve as a numerical representation of the email's content.


# For this dataset calculation, I used HPC Cluster (Magnolia) from University of Southern Mississippi
# In HPC clusters, I used Slrum Workload Manager, the script for which is also discussed somewhere in the repo

import pandas as pd
from multiprocessing import Pool
import os

import ssl




import re
import string
import nltk
# nltk.data.path.append("./packages")

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [7]:
temp_folder_results = 'temp_results'  # Temporary folder to store intermediate result files
temp_folder_chunks= 'temp_chunks'  # Folder that stores intermediate chunked files
output_file = 'with_content_probability_distribution.csv'

In [9]:
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /homes/01/bxbhusal/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /homes/01/bxbhusal/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
# Text cleaning function
def clean_text(text):
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_text = [word for word in tokens if word not in stop_words]
    
    # Apply stemming (using Porter Stemmer)
    stemmer = PorterStemmer()
    stemmed_text = [stemmer.stem(word) for word in filtered_text]
    
    return ' '.join(stemmed_text)

def process_chunk(chunk_filename):
    chunk = pd.read_csv(f'{temp_folder_chunks}/{chunk_filename}')
    
    # Clean text
    
    chunk['clean_content'] = chunk['content'].apply(clean_text)
    
    # Create document-term matrix
    # dtm_chunk = create_dtm_chunk(cleaned_chunk)
    return chunk

    # return dtm_chunk



In [12]:
# First we have to get the list of chunks that we have in the chunks folder
file_names = os.listdir(temp_folder_chunks)
# Filter only files (not directories)
# chunk_filenames = [file for file in file_names if os.path.isfile(os.path.join(temp_folder_chunks, file))]

chunk_filenames = ['temp_chunk_2010-01-04.csv','temp_chunk_2010-01-05.csv']

# Calculate the number of emails sent and received during day and night for each chunk
with Pool() as pool:
    result_filenames = pool.map(process_chunk, chunk_filenames)

# Since our results are divided into different files for each day, we have to combine them
combined_result = pd.concat([pd.read_csv(filename) for filename in result_filenames])

# Save the final result to a CSV file
combined_result.to_csv(output_file, index=False)

KeyError: 'content'