DATA FETCHING AND PRE-PROCESSING FOR THE EMAIL-SIG MAILING LIST


In [None]:
import requests
import re
import os
import nltk
from nltk.corpus import stopwords
import time
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import pipeline

# Download the NLTK resources used below: the tokenizer models behind
# nltk.word_tokenize and the English stop-word list.
# Recent NLTK releases load the 'punkt_tab' resource for tokenization while
# older ones load 'punkt', so both are requested to keep the notebook
# runnable on either. NOTE(review): on an old NLTK the unknown 'punkt_tab'
# request is expected to just report an error message and continue - confirm.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Function to fetch archive links based on the provided URL structure
def fetch_archive_links(base_url, start_year=2015, end_year=2024):
    """Build the monthly archive URLs for a range of years.

    Every URL has the form ``{base_url}{year}-{MonthName}.txt``. Nothing is
    downloaded here; the URLs are only constructed.

    Args:
        base_url: Prefix prepended verbatim to each archive file name.
        start_year: First year to include.
        end_year: Last year to include (inclusive).

    Returns:
        A list of URL strings ordered by year and then by calendar month.
        The list is empty when ``start_year`` is greater than ``end_year``.
    """
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    return [
        f"{base_url}{year}-{month}.txt"
        for year in range(start_year, end_year + 1)
        for month in month_names
    ]

# Function to download the file and return its content
def download_file(url, timeout=30):
    """Download ``url`` with an HTTP GET and return the body as text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection can block the crawl indefinitely.

    Returns:
        The response text when the server answers with status 200, otherwise
        ``None`` (any other status, a timeout, or a connection error).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # One unreachable archive should not abort the whole crawl; report
        # it and let the caller skip this URL like any other missing month.
        print(f"Failed to download {url}: {error}")
        return None

    if response.status_code == 200:
        return response.text
    return None

# Main processing function
def fetch_and_save_mailing_list(base_url, output_file, start_year=2015, end_year=2024):
    """Download the monthly archives of a mailing list into one raw text file.

    Each archive URL produced by ``fetch_archive_links`` is downloaded with
    ``download_file``; archives that cannot be fetched are skipped. The text
    is split in front of every line starting with ``"From "`` (assumed to be
    the mbox-style message separator - TODO confirm against the archive
    format) and every non-blank message is written to ``output_file``.

    Args:
        base_url: Base URL of the list archive, passed to ``fetch_archive_links``.
        output_file: Path of the raw text file to (over)write.
        start_year: First archive year to fetch.
        end_year: Last archive year to fetch (inclusive).
    """
    archive_links = fetch_archive_links(base_url, start_year, end_year)
    all_emails = []

    for link in archive_links:
        print(f"Processing {link}")
        content = download_file(link)
        if content:
            # Split *before* each "From " line using a lookahead so the
            # marker stays attached to its message. Consuming it (as a plain
            # r'\nFrom ' split does) removes the "\nFrom " delimiter from the
            # saved file, and the preprocessing step that splits on that
            # delimiter can then no longer tell messages apart.
            emails = re.split(r'\n(?=From )', content)
            all_emails.extend(email for email in emails if email.strip())

    # Ensure the output directory exists. dirname() is '' for a bare file
    # name, and os.makedirs('') raises, so only create a real directory.
    output_directory = os.path.dirname(output_file)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    # Save the raw data to a text file for preprocessing
    with open(output_file, 'w', encoding='utf-8') as f:
        for email in all_emails:
            f.write(email + '\n')
    print(f"Data saved to {output_file}")

# Function to clean and normalize text
def clean_text_chunk(text_chunk):
    """Flatten whitespace in a piece of text and blank out header runs.

    The substitutions are applied in this order:
      1. each run of carriage returns / newlines becomes one space;
      2. each run of whitespace becomes one space;
      3. each span matching ``From: ... Subject: ... Date: ... <4 digits>``
         (shortest match) becomes one space.

    Args:
        text_chunk: Text to clean.

    Returns:
        The cleaned text. Because step 3 runs after whitespace has been
        collapsed, a removed header span can leave adjacent spaces behind.
    """
    substitutions = (
        (r'[\r\n]+', ' '),
        (r'\s+', ' '),
        (r'From:.+?Subject:.+?Date:.+?\d{4}', ' '),
    )
    cleaned = text_chunk
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def tokenize_and_normalize_chunk(text_chunk, stop_words):
    """Lower-case and tokenize text, keeping only useful alphanumeric tokens.

    Args:
        text_chunk: Text to tokenize.
        stop_words: Collection of tokens to discard (membership is tested
            against the lower-cased tokens).

    Returns:
        A list of tokens, in order of appearance, that are purely
        alphanumeric and not contained in ``stop_words``.
    """
    tokens = nltk.word_tokenize(text_chunk.lower())

    kept_tokens = []
    for token in tokens:
        if not token.isalnum():
            continue  # punctuation or mixed-symbol token
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Function to preprocess the text file in chunks
def preprocess_text_file_in_chunks(input_file_path, output_file_path, chunk_size=1024*1024):
    """Clean and tokenize a raw archive file, writing one document per line.

    The input is read ``chunk_size`` characters at a time and split into
    documents on the ``"\\nFrom "`` delimiter. Each non-blank document is
    passed through ``clean_text_chunk`` and ``tokenize_and_normalize_chunk``
    and written to the output as a single space-joined line.

    Args:
        input_file_path: Path of the raw UTF-8 text file to read.
        output_file_path: Path of the preprocessed file to (over)write.
        chunk_size: Number of characters to read per iteration.
    """
    start_time = time.time()
    stop_words = set(stopwords.words('english'))
    delimiter = '\nFrom '

    def write_document(document, outfile):
        # Clean, tokenize and emit one document; blank documents are skipped.
        if document.strip():
            cleaned_chunk = clean_text_chunk(document)
            processed_words = tokenize_and_normalize_chunk(cleaned_chunk, stop_words)
            outfile.write(' '.join(processed_words) + '\n')

    with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
        # Text after the last delimiter seen so far. It may be an incomplete
        # document (or end in half a delimiter), so it is held back and
        # prepended to the next chunk instead of being processed right away.
        # Processing every chunk independently would cut documents and words
        # in two wherever a read boundary happens to fall.
        pending = ''
        while True:
            text_chunk = infile.read(chunk_size)
            if not text_chunk:
                break
            documents = (pending + text_chunk).split(delimiter)
            pending = documents.pop()
            for document in documents:
                write_document(document, outfile)

        # Whatever is left once the file is exhausted is the final document.
        write_document(pending, outfile)

    print(f"Preprocessed text saved to '{output_file_path}'.")
    print(f"Total time taken: {time.time() - start_time:.2f} seconds")

# Fetch, preprocess, and save data for each mailing list
# Mailing lists to harvest: short list name -> base URL of its archive.
mailing_lists = {'email-sig': 'https://mail.python.org/pipermail/email-sig/'}

# For every list: download the raw archive, then derive the preprocessed
# one-document-per-line file from it.
for list_name in mailing_lists:
    base_url = mailing_lists[list_name]
    raw_output_file = f'/content/Dissertation_project/{list_name}_emails_raw.txt'
    preprocessed_output_file = f'/content/Dissertation_project/{list_name}_emails_preprocessed.txt'

    fetch_and_save_mailing_list(base_url, raw_output_file)
    preprocess_text_file_in_chunks(raw_output_file, preprocessed_output_file)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Processing https://mail.python.org/pipermail/email-sig/2015-January.txt
Processing https://mail.python.org/pipermail/email-sig/2015-February.txt
Processing https://mail.python.org/pipermail/email-sig/2015-March.txt
Processing https://mail.python.org/pipermail/email-sig/2015-April.txt
Processing https://mail.python.org/pipermail/email-sig/2015-May.txt
Processing https://mail.python.org/pipermail/email-sig/2015-June.txt
Processing https://mail.python.org/pipermail/email-sig/2015-July.txt
Processing https://mail.python.org/pipermail/email-sig/2015-August.txt
Processing https://mail.python.org/pipermail/email-sig/2015-September.txt
Processing https://mail.python.org/pipermail/email-sig/2015-October.txt
Processing https://mail.python.org/pipermail/email-sig/2015-November.txt
Processing https://mail.python.org/pipermail/email-sig/2015-December.txt
Processing https://mail.python.org/pipermail/email-sig/2016-January.txt
Processing https://mail.python.org/pipermail/email-sig/2016-February.txt
P

APPLICATION OF LDA

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Load the preprocessed data (the file holds one cleaned document per line)
file_path = '/content/Dissertation_project/email-sig_emails_preprocessed.txt'
with open(file_path, 'r', encoding='utf-8') as f:
    preprocessed_emails = f.readlines()

# Convert to DataFrame for further processing
df = pd.DataFrame({'Cleaned_Content': preprocessed_emails})

# Drop documents that are empty after preprocessing. The explicit copy makes
# df an independent frame rather than a slice of the unfiltered one, so the
# column assignments made below and in the following cells do not trigger
# pandas' SettingWithCopyWarning.
df = df[df['Cleaned_Content'].str.strip() != ''].copy()

# Build the document-term count matrix, ignoring terms that occur in more
# than 95% of the documents or in fewer than 2 documents
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(df['Cleaned_Content'])

# Fit a 10-topic LDA model (fixed seed so runs are repeatable)
lda = LDA(n_components=10, random_state=42)
lda.fit(dtm)

# Display the top words in each topic
words = vectorizer.get_feature_names_out()
topic_words = lda.components_

for i, topic in enumerate(topic_words):
    print(f"Top words in topic #{i}:")
    # argsort() is ascending, so the last ten indices are the ten
    # highest-weighted words and the strongest word is printed last.
    print(" ".join([words[j] for j in topic.argsort()[-10:]]))

# Assign to each document the topic with the highest probability
doc_topic_dist = lda.transform(dtm)
df['Dominant_Topic'] = doc_topic_dist.argmax(axis=1)


Top words in topic #0:
different exception thing issue worth simply want needs point date
Top words in topic #1:
different exception thing issue worth simply want needs point date
Top words in topic #2:
object comments simple raised work uses like msg message line
Top words in topic #3:
different exception thing issue worth simply want needs point date
Top words in topic #4:
different exception thing issue worth simply want needs point date
Top words in topic #5:
different exception thing issue worth simply want needs point date
Top words in topic #6:
different exception thing issue worth simply want needs point date
Top words in topic #7:
different exception thing issue worth simply want needs point date
Top words in topic #8:
great creating actually docs libraries read barry post think module
Top words in topic #9:
encoding right use working way module wed header using issues


SUBTOPIC MAPPING

In [None]:
# Hand-assigned subtopic labels, keyed by LDA topic id.
# NOTE(review): these labels are chosen manually from the keywords of
# interest; check them against the top words printed for each topic.
subtopics = {
    0: 'attachments',
    1: 'compat32',
    2: 'email',
    3: 'python',
    4: 'feedparser',
    5: 'headers',
    6: 'memory',
    7: 'parsing',
    8: 'policy'
}

# Map the subtopics to the DataFrame. The LDA model is fitted with 10 topics
# (ids 0-9) but only ids 0-8 are labelled above; a plain .map(subtopics)
# would give NaN for the rest, and groupby('Subtopic') silently drops NaN
# groups. Unlabelled ids therefore fall back to a generic 'topic_<id>' name.
df['Subtopic'] = df['Dominant_Topic'].map(
    lambda topic_id: subtopics.get(topic_id, f'topic_{topic_id}')
)

# Report topic ids that had to use the fallback so they can be labelled.
unlabelled_topics = sorted(
    {int(topic_id) for topic_id in df['Dominant_Topic']} - set(subtopics)
)
if unlabelled_topics:
    print(f"Warning: no subtopic label for topic id(s) {unlabelled_topics}; "
          "using generic 'topic_<id>' names.")

# Verify that the 'Subtopic' column exists
if 'Subtopic' in df.columns:
    print("Subtopic column created successfully.")
else:
    print("Failed to create Subtopic column.")


Subtopic column created successfully.


SENTIMENT ANALYSIS USING VADER, BERT AND DISTILBERT

In [None]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import pipeline
import nltk

# Fetch the lexicon that the VADER analyzer needs
nltk.download('vader_lexicon')

# Lexicon-based scorer (VADER)
sia = SentimentIntensityAnalyzer()

# Options shared by both transformer pipelines.
# NOTE(review): presumably these cap inputs at the models' 512-token limit - confirm.
_pipeline_options = {'max_length': 512, 'truncation': True}

# Transformer-based classifiers: a multilingual BERT model and an
# SST-2 fine-tuned DistilBERT model
bert_classifier = pipeline(
    'sentiment-analysis',
    model='nlptown/bert-base-multilingual-uncased-sentiment',
    **_pipeline_options
)
distilbert_classifier = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    **_pipeline_options
)

# Function to calculate sentiment for a given text
def calculate_sentiments(text):
    """Score one text with the VADER, BERT and DistilBERT analyzers.

    Args:
        text: The text to analyse.

    Returns:
        A 3-tuple ``(vader_score, bert_label, distilbert_label)``: the VADER
        ``'compound'`` score followed by the ``'label'`` of the first result
        returned by each of the two classifier pipelines.
    """
    vader_scores = sia.polarity_scores(text)
    bert_results = bert_classifier(text)
    distilbert_results = distilbert_classifier(text)
    return (
        vader_scores['compound'],
        bert_results[0]['label'],
        distilbert_results[0]['label'],
    )

# Apply sentiment analysis to the DataFrame.
# The per-document results are collected first and then assigned column by
# column: unpacking zip(*results) straight into three columns raises
# ValueError when the DataFrame has no rows, because zip() then yields nothing.
_sentiment_rows = [calculate_sentiments(text) for text in df['Cleaned_Content']]
df['VADER Sentiment'] = [row[0] for row in _sentiment_rows]
df['BERT Sentiment'] = [row[1] for row in _sentiment_rows]
df['DistilBERT Sentiment'] = [row[2] for row in _sentiment_rows]

# Adjust sentiment scoring
def adjust_sentiment(vader_score, bert_label, distilbert_label):
    """Combine the three sentiment outputs into one categorical label.

    The BERT star rating and the DistilBERT polarity label are converted to
    numbers in [-1, 1] (unknown labels count as 0) and blended with the
    VADER score using weights 0.6 / 0.2 / 0.2.

    Args:
        vader_score: Numeric VADER score.
        bert_label: Star label such as ``'1 star'`` ... ``'5 stars'``.
        distilbert_label: ``'NEGATIVE'``, ``'POSITIVE'`` or ``'NEUTRAL'``.

    Returns:
        ``'POSITIVE'`` when the blended score is above 0.2, ``'NEGATIVE'``
        when it is below -0.2, and ``'NEUTRAL'`` otherwise.
    """
    star_to_score = {'1 star': -1, '2 stars': -0.5, '3 stars': 0, '4 stars': 0.5, '5 stars': 1}
    polarity_to_score = {'NEGATIVE': -1, 'POSITIVE': 1, 'NEUTRAL': 0}

    bert_score = star_to_score.get(bert_label, 0)
    distilbert_score = polarity_to_score.get(distilbert_label, 0)

    # VADER carries most of the weight; the two transformer models share the rest.
    combined_score = (vader_score * 0.6 + bert_score * 0.2 + distilbert_score * 0.2) / 1.0

    if combined_score > 0.2:
        return 'POSITIVE'
    if combined_score < -0.2:
        return 'NEGATIVE'
    return 'NEUTRAL'

# Apply the adjusted sentiment analysis
df['Adjusted Sentiment'] = df.apply(lambda row: adjust_sentiment(row['VADER Sentiment'], row['BERT Sentiment'], row['DistilBERT Sentiment']), axis=1)


def _most_common(values):
    # Most frequent value of a categorical column (first one on ties).
    return values.mode()[0]


# Group by subtopic to analyze the sentiment distribution.
# The VADER compound score is continuous, so its mode says little (when all
# scores in a group differ, mode() returns every score and [0] merely picks
# one of them); it is summarised by the group mean instead. The three label
# columns are categorical and keep their most frequent value.
sentiment_analysis = df.groupby('Subtopic').agg({
    'VADER Sentiment': 'mean',
    'BERT Sentiment': _most_common,
    'DistilBERT Sentiment': _most_common,
    'Adjusted Sentiment': _most_common
})

# Save to a text file (explicit encoding so the result does not depend on
# the platform default)
output_file = '/content/sentiment_analysis_results.txt'
with open(output_file, 'w', encoding='utf-8') as f:
    f.write("Sentiment analysis across subtopics:\n")
    f.write("====================================\n")
    for subtopic, row in sentiment_analysis.iterrows():
        f.write(f"Subtopic: {subtopic}\n")
        f.write(f"  VADER Sentiment (mean): {row['VADER Sentiment']:.4f}\n")
        f.write(f"  BERT Sentiment: {row['BERT Sentiment']}\n")
        f.write(f"  DistilBERT Sentiment: {row['DistilBERT Sentiment']}\n")
        f.write(f"  Adjusted Sentiment: {row['Adjusted Sentiment']}\n")
        f.write("\n")

print(f"Results saved to {output_file}")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Results saved to /content/sentiment_analysis_results.txt


DOWNLOAD THE RESULT FILE

In [None]:
from google.colab import files

# Path to the plain-text sentiment results file written by the previous cell
file_path = '/content/sentiment_analysis_results.txt'

# Hand the file to the Colab download helper
files.download(file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>