### Instructions

1. **Create a project directory**:
   - First, create a directory where all your project files will be stored. In our case, it is named `project_root`.
   
2. **Copy-paste the code into the Python file**:
   - Inside the project directory, copy the provided code and paste it into the corresponding Python file.

3. **Run the Python file**:
   - After pasting the code, run the Python file to execute the project.


In [None]:
import os

# Root directory of the project (relative to the current working directory)
project_root = 'project_root'

# Maps each sub-directory (relative to project_root) to the files it should
# contain; the empty key stands for project_root itself.
structure = {
    'mailing_list_analysis': [
        '__init__.py',
        'data_fetcher.py',
        'preprocessing.py',
        'topic_modeling.py',
        'sentiment_analysis.py',
        'utils.py',
    ],
    '': [
        'main.py',
    ],
}

# Create the directory and file structure
for folder, file_names in structure.items():
    folder_path = os.path.join(project_root, folder)
    os.makedirs(folder_path, exist_ok=True)
    for file_name in file_names:
        # Append mode creates a missing file but leaves an existing one
        # untouched, so re-running this cell does not wipe code that was
        # already pasted into the project files.
        with open(os.path.join(folder_path, file_name), 'a', encoding='utf-8'):
            pass

print(f"Project structure created under '{project_root}'")


Project structure created under 'project_root'


In [None]:
import os

# Switch the working directory to the project root
target_directory = '/content/project_root'
os.chdir(target_directory)

# Report the directory we ended up in
current_directory = os.getcwd()
print("Current working directory:", current_directory)

Current working directory: /content/project_root


In [None]:
# mailing_list_analysis/data_fetcher.py

import requests
import re
import os

def fetch_archive_links(base_url, start_year=2015, end_year=2024):
    """Build the list of monthly archive URLs for an inclusive year range.

    Args:
        base_url: Prefix that each "<year>-<Month>.txt" file name is appended to.
        start_year: First year to include.
        end_year: Last year to include.

    Returns:
        A list of URLs ordered by year, then by calendar month.
    """
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    return [
        f"{base_url}{year}-{month_name}.txt"
        for year in range(start_year, end_year + 1)
        for month_name in month_names
    ]

def download_file(url, timeout=30):
    """Download *url* and return its body as text.

    Args:
        url: Address of the file to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (including when the request itself fails).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable archive is reported as "no content" rather
        # than aborting the download of every other month.
        print(f"Failed to download {url}: {exc}")
        return None

    if response.status_code == 200:
        return response.text
    return None

def fetch_and_save_mailing_list(base_url, output_file, start_year=2015, end_year=2024):
    """Download every monthly archive of a mailing list into one text file.

    Args:
        base_url: Archive URL prefix passed to ``fetch_archive_links``.
        output_file: Path of the combined output file; its parent directory
            is created when it does not exist yet.
        start_year: First year to download.
        end_year: Last year to download (inclusive).
    """
    archive_links = fetch_archive_links(base_url, start_year, end_year)
    all_emails = []

    for link in archive_links:
        print(f"Processing {link}")
        content = download_file(link)
        if content:
            # Split *before* each "From " line (lookahead) so every message
            # keeps its leading "From ". Splitting on '\nFrom ' itself would
            # drop the delimiter, and the saved file could then no longer be
            # separated into individual messages.
            all_emails.extend(re.split(r'\n(?=From )', content))

    output_directory = os.path.dirname(output_file)
    # os.makedirs('') raises, so skip it when output_file has no directory part
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(email + '\n' for email in all_emails)
    print(f"Data saved to {output_file}")




In [None]:
# mailing_list_analysis/preprocessing.py

import re
import nltk
from nltk.corpus import stopwords
import time

# Fetch the NLTK resources requested by this module at import time
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

def clean_text_chunk(text_chunk):
    """Flatten whitespace in *text_chunk* and blank out header runs.

    Line breaks and repeated whitespace are collapsed to single spaces, then
    any "From: ... Subject: ... Date: ... <4 digits>" span is replaced by a
    single space.

    Args:
        text_chunk: Raw text to clean.

    Returns:
        The cleaned text.
    """
    # Applied in order; every match is replaced by one space.
    patterns = (
        r'[\r\n]+',
        r'\s+',
        r'From:.+?Subject:.+?Date:.+?\d{4}',
    )
    cleaned = text_chunk
    for pattern in patterns:
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned

def tokenize_and_normalize_chunk(text_chunk, stop_words):
    """Lower-case and tokenize *text_chunk*, keeping only useful tokens.

    Args:
        text_chunk: Text to tokenize.
        stop_words: Collection of tokens to discard.

    Returns:
        A list of alphanumeric tokens that are not in *stop_words*, in their
        original order.
    """
    lowered = text_chunk.lower()
    kept_tokens = []
    for token in nltk.word_tokenize(lowered):
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

def _iter_delimited_documents(infile, chunk_size, delimiter='\nFrom '):
    """Yield *delimiter*-separated documents from *infile*, read in chunks."""
    pending = ''
    while True:
        text_chunk = infile.read(chunk_size)
        if not text_chunk:
            break
        pieces = (pending + text_chunk).split(delimiter)
        # The last piece may be cut off by the chunk boundary (possibly in the
        # middle of the delimiter itself), so hold it back until more text
        # arrives instead of emitting a truncated document.
        pending = pieces.pop()
        yield from pieces
    if pending:
        yield pending


def preprocess_text_file_in_chunks(input_file_path, output_file_path, chunk_size=1024*1024):
    """Clean and tokenize a mailing-list dump, writing one document per line.

    The input is read *chunk_size* characters at a time and split into
    documents on '\\nFrom '. A document that straddles a chunk boundary is
    reassembled before processing, so no document or word is cut in two.

    Args:
        input_file_path: Path of the raw text file to read.
        output_file_path: Path of the preprocessed file to write.
        chunk_size: Number of characters to read per iteration.
    """
    start_time = time.time()
    stop_words = set(stopwords.words('english'))

    with open(input_file_path, 'r', encoding='utf-8') as infile, \
            open(output_file_path, 'w', encoding='utf-8') as outfile:
        for document in _iter_delimited_documents(infile, chunk_size):
            if not document.strip():
                continue
            cleaned_chunk = clean_text_chunk(document)
            processed_words = tokenize_and_normalize_chunk(cleaned_chunk, stop_words)
            outfile.write(' '.join(processed_words) + '\n')

    print(f"Preprocessed text saved to '{output_file_path}'.")
    print(f"Total time taken: {time.time() - start_time:.2f} seconds")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# mailing_list_analysis/topic_modeling.py

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from transformers import pipeline

def create_dtm(documents, ngram_range=(1, 1)):
    """Build a document-term matrix from *documents*.

    Args:
        documents: Iterable of text documents.
        ngram_range: ``(min_n, max_n)`` n-gram sizes to extract. It is
            forwarded to the vectorizer; the default ``(1, 1)`` keeps
            unigrams only.

    Returns:
        A ``(dtm, vectorizer)`` tuple: the fitted document-term matrix and
        the ``CountVectorizer`` that produced it.
    """
    vectorizer = CountVectorizer(
        max_df=0.95,
        min_df=2,
        stop_words='english',
        ngram_range=ngram_range,
    )
    dtm = vectorizer.fit_transform(documents)
    return dtm, vectorizer

def run_lda(dtm, n_topics=10, doc_topic_prior=None, topic_word_prior=None, max_iter=1000):
    """Fit a Latent Dirichlet Allocation model on a document-term matrix.

    Args:
        dtm: Document-term matrix to fit on.
        n_topics: Number of topics (passed as ``n_components``).
        doc_topic_prior: Passed through to the model unchanged.
        topic_word_prior: Passed through to the model unchanged.
        max_iter: Passed through to the model unchanged.

    Returns:
        The fitted model. The random state is fixed at 42.
    """
    lda_settings = {
        'n_components': n_topics,
        'random_state': 42,
        'doc_topic_prior': doc_topic_prior,
        'topic_word_prior': topic_word_prior,
        'max_iter': max_iter,
    }
    model = LDA(**lda_settings)
    model.fit(dtm)
    return model

def generate_subtopic_names(lda, vectorizer, n_topics, top_n_words=10):
    """Derive a short name for every LDA topic.

    For each topic the highest-weighted words are run through a named-entity
    recognizer. Recognized ORG / PRODUCT / LOC / MISC entities form the name;
    when none are found the top words themselves are used.

    Args:
        lda: Fitted topic model exposing ``components_``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        n_topics: Not read by this function; kept so existing callers that
            pass it keep working.
        top_n_words: Number of highest-weighted words considered per topic.

    Returns:
        A dict mapping topic index to its name.
    """
    # NOTE(review): per the Transformers docs, un-aggregated output tags each
    # token with an IOB-prefixed label (e.g. 'I-ORG') under 'entity', which
    # never equals a bare 'ORG'. Aggregation merges word pieces and reports
    # the bare label under 'entity_group' -- confirm against the installed
    # transformers version.
    ner_pipeline = pipeline(
        'ner',
        model='dbmdz/bert-large-cased-finetuned-conll03-english',
        aggregation_strategy='simple',
    )
    words = vectorizer.get_feature_names_out()
    wanted_labels = {'ORG', 'PRODUCT', 'LOC', 'MISC'}

    subtopics = {}
    for i, topic in enumerate(lda.components_):
        top_words = [words[j] for j in topic.argsort()[-top_n_words:]]
        entities = ner_pipeline(" ".join(top_words))

        entity_words = []
        for entity in entities:
            label = entity.get('entity_group') or entity.get('entity', '')
            # Tolerate prefixed labels ('B-ORG') as well as bare ones ('ORG')
            if label.split('-')[-1] in wanted_labels:
                entity_words.append(entity['word'])

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # generated name is the same on every run (a set would not be).
        name_parts = list(dict.fromkeys(entity_words)) or top_words
        subtopics[i] = " ".join(name_parts)

    return subtopics

def assign_topics_and_subtopics(df, subtopics):
    """Add a 'Subtopic' column by looking up each row's 'Dominant_Topic'.

    Args:
        df: DataFrame with a 'Dominant_Topic' column; modified in place.
        subtopics: Mapping from topic id to subtopic name.

    Returns:
        The same DataFrame object, now carrying the 'Subtopic' column.
    """
    subtopic_labels = df['Dominant_Topic'].map(subtopics)
    df['Subtopic'] = subtopic_labels
    return df



In [None]:
# mailing_list_analysis/sentiment_analysis.py

import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import pipeline
import nltk

# Fetch the 'vader_lexicon' NLTK resource at import time; presumably required
# by the SentimentIntensityAnalyzer used in this module.
nltk.download('vader_lexicon')

# Lazily-filled cache of the sentiment models. Building them is expensive, so
# they are created once and shared by every call.
_SENTIMENT_MODELS = {}


def _get_sentiment_models():
    """Return the cached sentiment models, building them on first use."""
    if not _SENTIMENT_MODELS:
        # Build into a local dict first so a failure part-way through does
        # not leave a half-populated cache behind.
        models = {
            'vader': SentimentIntensityAnalyzer(),
            'bert': pipeline(
                'sentiment-analysis',
                model='nlptown/bert-base-multilingual-uncased-sentiment',
                max_length=512,
                truncation=True,
            ),
            'distilbert': pipeline(
                'sentiment-analysis',
                model='distilbert-base-uncased-finetuned-sst-2-english',
                max_length=512,
                truncation=True,
            ),
        }
        _SENTIMENT_MODELS.update(models)
    return _SENTIMENT_MODELS


def calculate_sentiments(text):
    """Score *text* with VADER and two transformer classifiers.

    Args:
        text: The document to analyze.

    Returns:
        A ``(vader_score, bert_label, distilbert_label)`` tuple: the VADER
        compound score and the label of the first prediction from each
        classifier.
    """
    models = _get_sentiment_models()
    vader_score = models['vader'].polarity_scores(text)['compound']
    bert_label = models['bert'](text)[0]['label']
    distilbert_label = models['distilbert'](text)[0]['label']
    return vader_score, bert_label, distilbert_label

def adjust_sentiment(vader_score, bert_label, distilbert_label):
    """Blend three sentiment signals into one overall label.

    The star-rating label and the polarity label are mapped to numbers
    (unknown labels count as 0) and combined with the VADER score using
    weights 0.4 / 0.3 / 0.3.

    Args:
        vader_score: Numeric VADER score.
        bert_label: Star-rating label such as '4 stars'.
        distilbert_label: Polarity label such as 'POSITIVE'.

    Returns:
        'POSITIVE' when the blended score is above 0.2, 'NEGATIVE' when it is
        below -0.2, and 'NEUTRAL' otherwise.
    """
    star_scores = {'1 star': -1, '2 stars': -0.5, '3 stars': 0, '4 stars': 0.5, '5 stars': 1}
    polarity_scores = {'NEGATIVE': -1, 'POSITIVE': 1, 'NEUTRAL': 0}

    bert_score = star_scores.get(bert_label, 0)
    distilbert_score = polarity_scores.get(distilbert_label, 0)

    combined_score = vader_score * 0.4 + bert_score * 0.3 + distilbert_score * 0.3

    if combined_score < -0.2:
        return 'NEGATIVE'
    if combined_score > 0.2:
        return 'POSITIVE'
    return 'NEUTRAL'

def analyze_sentiments(df):
    """Add per-document sentiment columns to *df*.

    Each entry of 'Cleaned_Content' is scored once; the results fill the
    'VADER Sentiment', 'BERT Sentiment', 'DistilBERT Sentiment' and
    'Adjusted Sentiment' columns.

    Args:
        df: DataFrame with a 'Cleaned_Content' column; modified in place.

    Returns:
        The same DataFrame object with the four sentiment columns added.
    """
    # Collect the score triples first. Unpacking zip(*...) directly fails on
    # an empty frame because there is nothing to unpack.
    scores = [calculate_sentiments(text) for text in df['Cleaned_Content']]

    df['VADER Sentiment'] = [vader for vader, _, _ in scores]
    df['BERT Sentiment'] = [bert for _, bert, _ in scores]
    df['DistilBERT Sentiment'] = [distilbert for _, _, distilbert in scores]
    df['Adjusted Sentiment'] = [
        adjust_sentiment(vader, bert, distilbert)
        for vader, bert, distilbert in scores
    ]
    return df


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
# mailing_list_analysis/utils.py

def print_sentiment_distribution(df):
    """Print the most frequent 'Adjusted Sentiment' of every subtopic.

    Args:
        df: DataFrame with 'Subtopic' and 'Adjusted Sentiment' columns.
    """
    def most_frequent(values):
        # mode() may return several values; take the first one
        return values.mode()[0]

    sentiments_by_subtopic = df.groupby('Subtopic')['Adjusted Sentiment']
    subtopic_sentiment = sentiments_by_subtopic.apply(most_frequent)

    print("\nSentiment distribution across subtopics:")
    print(subtopic_sentiment)


In [None]:
# mailing_list_analysis/main.py

import sys
import os
import pandas as pd

# Add the project root to the Python path
sys.path.append('/content/project_root')

# Import the necessary modules from your project
from mailing_list_analysis.data_fetcher import fetch_and_save_mailing_list
from mailing_list_analysis.preprocessing import preprocess_text_file_in_chunks
from mailing_list_analysis.topic_modeling import create_dtm, run_lda, generate_subtopic_names, assign_topics_and_subtopics
from mailing_list_analysis.sentiment_analysis import analyze_sentiments

def process_mailing_list(list_name, base_url, output_dir='/content/Dissertation_project'):
    """Run the full analysis pipeline for one mailing list.

    Downloads the archives, preprocesses them, fits the topic model, names
    the topics, scores sentiment per document and writes a text report.

    Args:
        list_name: Name used as the prefix of every generated file.
        base_url: Archive URL prefix of the mailing list.
        output_dir: Directory that receives the raw, preprocessed and result
            files. Defaults to the location used so far.
    """
    raw_output_file = os.path.join(output_dir, f'{list_name}_emails_raw.txt')
    preprocessed_output_file = os.path.join(output_dir, f'{list_name}_emails_preprocessed.txt')
    output_results_file = os.path.join(output_dir, f'{list_name}_results.txt')

    # Step 1: Fetch and preprocess data
    fetch_and_save_mailing_list(base_url, raw_output_file)
    preprocess_text_file_in_chunks(raw_output_file, preprocessed_output_file)

    # Step 2: Load the preprocessed data (one document per line)
    with open(preprocessed_output_file, 'r', encoding='utf-8') as f:
        preprocessed_emails = f.readlines()

    df = pd.DataFrame({'Cleaned_Content': preprocessed_emails})
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a filtered view of the original.
    df = df[df['Cleaned_Content'].str.strip() != ''].copy()

    if df.empty:
        # Nothing was downloaded for this list. Skip it so one empty list
        # does not abort the lists processed after it.
        print(f"No documents found for {list_name}; skipping analysis.")
        return

    # Step 3: Topic Modeling
    dtm, vectorizer = create_dtm(df['Cleaned_Content'])
    lda_model = run_lda(dtm, n_topics=10, doc_topic_prior=0.1, topic_word_prior=0.01, max_iter=1500)
    subtopics = generate_subtopic_names(lda_model, vectorizer, n_topics=10, top_n_words=15)

    # Assign each document its highest-weighted topic and the topic's name
    df['Dominant_Topic'] = lda_model.transform(dtm).argmax(axis=1)
    df = assign_topics_and_subtopics(df, subtopics)

    # Step 4: Sentiment Analysis
    df = analyze_sentiments(df)

    # Step 5: Write results to a text file
    # Looked up once here instead of once per word inside the loop below.
    feature_names = vectorizer.get_feature_names_out()

    with open(output_results_file, 'w', encoding='utf-8') as result_file:
        result_file.write(f"Sentiment distribution for {list_name}:\n")

        # List every topic's top words and its generated subtopic name
        result_file.write("Dominant Topics and Associated Subtopics:\n")
        for topic_idx, topic_words in enumerate(lda_model.components_):
            topic_name = " ".join(feature_names[i] for i in topic_words.argsort()[-10:])
            subtopic_name = subtopics[topic_idx]
            result_file.write(f"  Dominant Topic {topic_idx}: {topic_name}\n")
            result_file.write(f"  Subtopic: {subtopic_name}\n\n")

        result_file.write("\nSentiment Analysis by Subtopic:\n")
        for subtopic, group in df.groupby('Subtopic'):
            # NOTE(review): the VADER column holds continuous scores, so its
            # mode is rarely representative -- consider reporting the mean.
            vader_sentiment = group['VADER Sentiment'].mode()[0]
            bert_sentiment = group['BERT Sentiment'].mode()[0]
            distilbert_sentiment = group['DistilBERT Sentiment'].mode()[0]
            adjusted_sentiment = group['Adjusted Sentiment'].mode()[0]

            result_file.write(f"Subtopic: {subtopic}\n")
            result_file.write(f"  VADER Sentiment: {vader_sentiment}\n")
            result_file.write(f"  BERT Sentiment: {bert_sentiment}\n")
            result_file.write(f"  DistilBERT Sentiment: {distilbert_sentiment}\n")
            result_file.write(f"  Adjusted Sentiment: {adjusted_sentiment}\n\n")

    print(f"Results saved to {output_results_file}")

def main():
    """Run the analysis pipeline for every configured mailing list, in order."""
    # (list name, archive base URL) pairs to process
    mailing_lists = (
        ('mobile-sig', 'https://mail.python.org/pipermail/mobile-sig/'),
        ('email-sig', 'https://mail.python.org/pipermail/email-sig/'),
        ('education-sig', 'https://mail.python.org/pipermail/edu-sig/'),
        ('database-sig', 'https://mail.python.org/pipermail/db-sig/'),
    )

    for list_name, base_url in mailing_lists:
        print(f"\nProcessing mailing list: {list_name}")
        process_mailing_list(list_name, base_url)

if __name__ == "__main__":
    main()



Processing mailing list: mobile-sig
Processing https://mail.python.org/pipermail/mobile-sig/2015-January.txt
Processing https://mail.python.org/pipermail/mobile-sig/2015-February.txt
Processing https://mail.python.org/pipermail/mobile-sig/2015-March.txt
Processing https://mail.python.org/pipermail/mobile-sig/2015-April.txt
Processing https://mail.python.org/pipermail/mobile-sig/2015-May.txt
Processing https://mail.python.org/pipermail/mobile-sig/2015-June.txt
Processing https://mail.python.org/pipermail/mobile-sig/2015-July.txt
Processing https://mail.python.org/pipermail/mobile-sig/2015-August.txt
Processing https://mail.python.org/pipermail/mobile-sig/2015-September.txt
Processing https://mail.python.org/pipermail/mobile-sig/2015-October.txt
Processing https://mail.python.org/pipermail/mobile-sig/2015-November.txt
Processing https://mail.python.org/pipermail/mobile-sig/2015-December.txt
Processing https://mail.python.org/pipermail/mobile-sig/2016-January.txt
Processing https://mail.

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Results saved to /content/Dissertation_project/mobile-sig_results.txt


DOWNLOAD THE RESULT FILE

In [None]:
from google.colab import files

# Path of the results text file to download. Result files are written as
# '<list_name>_results.txt', where <list_name> is a key of the mailing_lists
# dict in main(); uncomment one of the alternatives below to pick another list.
#file_path = '/content/Dissertation_project/email-sig_results.txt'
#file_path = '/content/Dissertation_project/education-sig_results.txt'
file_path = '/content/Dissertation_project/mobile-sig_results.txt'
#file_path = '/content/Dissertation_project/database-sig_results.txt'

# Download the file
files.download(file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>