In [None]:
import requests
import re
import os

# Build the list of monthly archive URLs for a mailing list
def fetch_archive_links(base_url, start_year=2015, end_year=2024):
    """Return one ``<base_url><year>-<Month>.txt`` URL per month, oldest first.

    Args:
        base_url: Prefix that the ``<year>-<Month>.txt`` file name is appended
            to verbatim (no separator is inserted).
        start_year: First year to include.
        end_year: Last year to include (inclusive).

    Returns:
        A list of URL strings, twelve per year; empty when ``end_year`` is
        smaller than ``start_year``.
    """
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    return [
        f"{base_url}{year}-{month}.txt"
        for year in range(start_year, end_year + 1)
        for month in month_names
    ]

# Function to download the file and return its content
def download_file(url, timeout=30):
    """Fetch ``url`` over HTTP and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded response text when the server answers with status 200;
        ``None`` for any other status or when the request itself fails
        (connection error, timeout, malformed URL, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable archive should not abort a multi-month crawl;
        # report it and let the caller treat it like a missing month.
        print(f"Failed to download {url}: {exc}")
        return None

    if response.status_code == 200:
        return response.text
    return None

# Main processing function
def fetch_and_save_mailing_list(base_url, output_file, start_year=2015, end_year=2024):
    """Download every monthly archive and write all messages to one text file.

    Every message is written as a ``========== EMAIL ==========`` line followed
    by the message text. Lines starting with ``From `` are treated as message
    boundaries (simple regex, not a full mailbox parser).

    Args:
        base_url: Archive URL prefix passed to ``fetch_archive_links``.
        output_file: Path of the UTF-8 text file to (over)write.
        start_year: First year to fetch.
        end_year: Last year to fetch (inclusive).

    Returns:
        None. Progress and the final path are printed.
    """
    separator = "========== EMAIL ==========\n"
    all_emails = []

    for link in fetch_archive_links(base_url, start_year, end_year):
        print(f"Processing {link}")
        content = download_file(link)
        if not content:
            continue  # month missing or download failed

        # Split in front of each "From " line. The lookahead keeps that line
        # attached to its message, so every record has the same shape instead
        # of only the first message of each archive retaining the prefix.
        messages = re.split(r'\n(?=From )', content)
        all_emails.extend(message for message in messages if message.strip())

    # Save the raw email content to a text file
    with open(output_file, 'w', encoding='utf-8') as f:
        for email in all_emails:
            f.write(separator)
            f.write(email)
            # The split consumes the newline before the next message; restore
            # it so the following separator always starts on its own line.
            if not email.endswith("\n"):
                f.write("\n")
    print(f"Data saved to {output_file}")

# Source archive and destination file for the crawl
base_url = "https://mail.python.org/pipermail/db-sig/"
raw_output_file = "/content/db_emails_raw.txt"

# Make sure the destination folder is present before anything is written
os.makedirs(os.path.split(raw_output_file)[0], exist_ok=True)

# Crawl the archives (default year range) and store every message in one file
fetch_and_save_mailing_list(base_url=base_url, output_file=raw_output_file)


Processing https://mail.python.org/pipermail/db-sig/2015-January.txt
Processing https://mail.python.org/pipermail/db-sig/2015-February.txt
Processing https://mail.python.org/pipermail/db-sig/2015-March.txt
Processing https://mail.python.org/pipermail/db-sig/2015-April.txt
Processing https://mail.python.org/pipermail/db-sig/2015-May.txt
Processing https://mail.python.org/pipermail/db-sig/2015-June.txt
Processing https://mail.python.org/pipermail/db-sig/2015-July.txt
Processing https://mail.python.org/pipermail/db-sig/2015-August.txt
Processing https://mail.python.org/pipermail/db-sig/2015-September.txt
Processing https://mail.python.org/pipermail/db-sig/2015-October.txt
Processing https://mail.python.org/pipermail/db-sig/2015-November.txt
Processing https://mail.python.org/pipermail/db-sig/2015-December.txt
Processing https://mail.python.org/pipermail/db-sig/2016-January.txt
Processing https://mail.python.org/pipermail/db-sig/2016-February.txt
Processing https://mail.python.org/pipermai

SENTIMENT ANALYSIS USING TEXTBLOB

In [None]:
import re
from textblob import TextBlob
import pandas as pd

# Load the raw email dump produced by the download step.
# The dump is written as UTF-8, so it is read back as UTF-8 explicitly; without
# `encoding=` Python uses a locale-dependent default, which can raise
# UnicodeDecodeError or garble non-ASCII text on some machines.
file_path_new_mobile = '/content/db_emails_raw.txt'
with open(file_path_new_mobile, 'r', encoding='utf-8') as file:
    email_data_new_mobile = file.read()

# Split the data into individual emails (the first element is the empty text
# that precedes the first separator)
emails_new_mobile = re.split(r"========== EMAIL ==========\n", email_data_new_mobile)

# Keywords/topics to look for in each email; they are compared against the
# lower-cased email text, so keep them lower-case
important_keywords_mobile = ['database']

# Extract date, topic, and sentiment details from the new dataset
def extract_date_topic_sentiment_mobile(emails, keywords):
    """Tag keyword-bearing emails with their date and TextBlob sentiment class.

    Args:
        emails: Iterable of raw email texts (headers and body in one string).
        keywords: Iterable of lower-case keywords; each is looked up as a
            substring of the lower-cased email text.

    Returns:
        A list of dicts with keys ``'Date'`` (UTC timestamp parsed from the
        first ``Date:`` header line), ``'Topic'`` (the matched keyword) and
        ``'Sentiment'`` (``'positive'``, ``'negative'`` or ``'neutral'``,
        from the sign of the TextBlob polarity of the whole email text).
        One dict is produced per (email, matched keyword) pair. Emails
        without a ``Date:`` line, with an unparseable date, or without any
        keyword are skipped.
    """
    discussion_details = []

    for email in emails:
        # Extract the date of the email
        date_match = re.search(r'\nDate: (.+)\n', email)
        if not date_match:
            continue

        try:
            email_date = pd.to_datetime(date_match.group(1).strip(), utc=True)
        except ValueError:
            continue  # Skip email if date format is not recognized

        # Find the mentioned topics first: emails without any keyword produce
        # no output, so scoring them would be wasted work.
        email_lower = email.lower()
        matched_keywords = [keyword for keyword in keywords if keyword in email_lower]
        if not matched_keywords:
            continue

        # Analyze sentiment of the email
        sentiment = TextBlob(email).sentiment.polarity
        if sentiment > 0:
            sentiment_type = 'positive'
        elif sentiment < 0:
            sentiment_type = 'negative'
        else:
            sentiment_type = 'neutral'

        for keyword in matched_keywords:
            discussion_details.append({
                'Date': email_date,
                'Topic': keyword,
                'Sentiment': sentiment_type
            })

    return discussion_details

# Run the keyword/sentiment extraction over every email
discussion_details_new_mobile = extract_date_topic_sentiment_mobile(
    emails=emails_new_mobile,
    keywords=important_keywords_mobile,
)

# Tabulate the records and derive the calendar year of each message
df_discussion_details_new_mobile = pd.DataFrame(discussion_details_new_mobile).assign(
    Year=lambda frame: frame['Date'].dt.year
)

# Mentions per year (rows) and topic (columns)
discussion_by_year_new_mobile = (
    df_discussion_details_new_mobile
    .groupby(['Year', 'Topic'])
    .size()
    .unstack(fill_value=0)
)

# Sentiment class counts per topic
overall_sentiment_mobile = (
    df_discussion_details_new_mobile
    .groupby('Topic')['Sentiment']
    .value_counts()
    .unstack(fill_value=0)
)

# Per-topic totals side by side with the per-topic sentiment counts
combined_data_mobile = pd.concat(
    [discussion_by_year_new_mobile.sum(axis=0), overall_sentiment_mobile],
    axis=1,
).fillna(0)

# First column is the total; the rest keep the sentiment names actually present
combined_data_mobile.columns = ['Total Mentions', *overall_sentiment_mobile.columns]

# Write the plain-text report
output_file = '/content/db_discussion_details.txt'
with open(output_file, 'w') as f:
    f.write(
        "Combined Topic Analysis for DB Mailing List\n"
        "===============================================\n\n"
        "Overall Combined Data:\n"
    )
    for topic, data in combined_data_mobile.iterrows():
        f.write(f"Topic: {topic}\n")
        f.write(f"  Total Mentions: {int(data['Total Mentions'])}\n")
        for sentiment in combined_data_mobile.columns[1:]:
            f.write(f"  {sentiment.capitalize()} Sentiment: {int(data[sentiment])}\n")
        f.write("\n")
print(f"Results saved to {output_file}")


Results saved to /content/db_discussion_details.txt


DOWNLOAD THE RESULT FILE

In [None]:
# Colab-only: hand the report at `output_file` to the browser as a download.
# NOTE(review): relies on `output_file` still holding the path assigned by the
# cell that wrote the report - run that cell immediately before this one.
from google.colab import files
files.download(output_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

SENTIMENT ANALYSIS USING VADER

In [None]:
import re
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
import nltk

# Download the VADER lexicon
nltk.download('vader_lexicon')

# Load the raw email dump produced by the download step.
# The dump is written as UTF-8, so it is read back as UTF-8 explicitly; without
# `encoding=` Python uses a locale-dependent default, which can raise
# UnicodeDecodeError or garble non-ASCII text on some machines.
file_path_new_mobile = '/content/db_emails_raw.txt'
with open(file_path_new_mobile, 'r', encoding='utf-8') as file:
    email_data_new_mobile = file.read()

# Split the data into individual emails (the first element is the empty text
# that precedes the first separator)
emails_new_mobile = re.split(r"========== EMAIL ==========\n", email_data_new_mobile)

# Keywords/topics to look for in each email; they are compared against the
# lower-cased email text, so keep them lower-case
important_keywords_mobile = ['database']


# Initialize VADER sentiment intensity analyzer
sia = SentimentIntensityAnalyzer()

# Extract date, topic, and sentiment details from the new dataset
def extract_date_topic_sentiment_mobile(emails, keywords):
    """Tag keyword-bearing emails with their date and VADER sentiment class.

    Uses the module-level ``sia`` analyzer.

    Args:
        emails: Iterable of raw email texts (headers and body in one string).
        keywords: Iterable of lower-case keywords; each is looked up as a
            substring of the lower-cased email text.

    Returns:
        A list of dicts with keys ``'Date'`` (UTC timestamp parsed from the
        first ``Date:`` header line), ``'Topic'`` (the matched keyword) and
        ``'Sentiment'`` (``'positive'``, ``'negative'`` or ``'neutral'``,
        from the sign of the VADER ``compound`` score of the whole email
        text). One dict is produced per (email, matched keyword) pair.
        Emails without a ``Date:`` line, with an unparseable date, or
        without any keyword are skipped.
    """
    discussion_details = []

    for email in emails:
        # Extract the date of the email
        date_match = re.search(r'\nDate: (.+)\n', email)
        if not date_match:
            continue

        try:
            email_date = pd.to_datetime(date_match.group(1).strip(), utc=True)
        except ValueError:
            continue  # Skip email if date format is not recognized

        # Find the mentioned topics first: emails without any keyword produce
        # no output, so scoring them would be wasted work.
        email_lower = email.lower()
        matched_keywords = [keyword for keyword in keywords if keyword in email_lower]
        if not matched_keywords:
            continue

        # Analyze sentiment of the email using VADER
        compound = sia.polarity_scores(email)['compound']
        if compound > 0:
            sentiment_type = 'positive'
        elif compound < 0:
            sentiment_type = 'negative'
        else:
            sentiment_type = 'neutral'

        for keyword in matched_keywords:
            discussion_details.append({
                'Date': email_date,
                'Topic': keyword,
                'Sentiment': sentiment_type
            })

    return discussion_details

# Perform the extraction
discussion_details_new_mobile = extract_date_topic_sentiment_mobile(emails_new_mobile, important_keywords_mobile)

# Convert to DataFrame for further analysis
df_discussion_details_new_mobile = pd.DataFrame(discussion_details_new_mobile)

# Group the data by year and topic to analyze the discussion distribution
df_discussion_details_new_mobile['Year'] = df_discussion_details_new_mobile['Date'].dt.year
discussion_by_year_new_mobile = df_discussion_details_new_mobile.groupby(['Year', 'Topic']).size().unstack(fill_value=0)

# Overall sentiment analysis
overall_sentiment_mobile = df_discussion_details_new_mobile.groupby('Topic')['Sentiment'].value_counts().unstack(fill_value=0)

# Combine the number of mentions and sentiment data for each topic
combined_data_mobile = pd.concat([discussion_by_year_new_mobile.sum(axis=0), overall_sentiment_mobile], axis=1).fillna(0)

# Inspect combined_data_mobile to understand its structure
print("Columns in combined_data_mobile:")
print(combined_data_mobile.columns)

# Label every column from the sentiment class it actually holds. Guessing the
# labels from the column count mislabels the data whenever the classes present
# are not exactly negative/positive (e.g. negative + neutral), and leaves the
# columns unnamed when only one class occurs.
combined_data_mobile.columns = ['Total Mentions'] + [
    f"{str(sentiment).capitalize()} Sentiment" for sentiment in overall_sentiment_mobile.columns
]

# Save the results to a text file
output_file = '/content/db_discussion_details_vader.txt'
with open(output_file, 'w') as f:
    f.write("Combined Topic Analysis for DB Development Mailing List\n")
    f.write("===============================================\n\n")

    f.write("Overall Combined Data:\n")
    for topic, data in combined_data_mobile.iterrows():
        f.write(f"Topic: {topic}\n")
        f.write(f"  Total Mentions: {int(data['Total Mentions'])}\n")
        # Only the sentiment classes that occurred have a column; write those
        # in a fixed negative / neutral / positive order.
        for label in ('Negative Sentiment', 'Neutral Sentiment', 'Positive Sentiment'):
            if label in data:
                f.write(f"  {label}: {int(data[label])}\n")
        f.write("\n")
print(f"Results saved to {output_file}")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Columns in combined_data_mobile:
Index([0, 'negative', 'positive'], dtype='object')
Results saved to /content/db_discussion_details_vader.txt


DOWNLOAD THE RESULT FILE

In [None]:
# Colab-only: hand the report at `output_file` to the browser as a download.
# NOTE(review): relies on `output_file` still holding the path assigned by the
# cell that wrote the report - run that cell immediately before this one.
from google.colab import files
files.download(output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

SENTIMENT ANALYSIS USING BERT AND DISTILBERT

In [None]:
import re
from transformers import pipeline
import pandas as pd
import torch

# Detect if GPU is available (0 is passed to the pipelines when CUDA is
# available, -1 otherwise)
device = 0 if torch.cuda.is_available() else -1

# Load the raw email dump produced by the download step.
# The dump is written as UTF-8, so it is read back as UTF-8 explicitly; without
# `encoding=` Python uses a locale-dependent default, which can raise
# UnicodeDecodeError or garble non-ASCII text on some machines.
file_path_new_mobile = '/content/db_emails_raw.txt'
with open(file_path_new_mobile, 'r', encoding='utf-8') as file:
    email_data_new_mobile = file.read()

# Split the data into individual emails (the first element is the empty text
# that precedes the first separator)
emails_new_mobile = re.split(r"========== EMAIL ==========\n", email_data_new_mobile)

# Keywords/topics to look for in each email; they are compared against the
# lower-cased email text, so keep them lower-case
important_keywords_mobile = ['database']


# Initialize BERT and DistilBERT sentiment analysis pipelines with truncation
bert_classifier = pipeline('sentiment-analysis', model='nlptown/bert-base-multilingual-uncased-sentiment', device=device, max_length=512, truncation=True)
distilbert_classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english', device=device, max_length=512, truncation=True)

# Extract date, topic, and sentiment details from the new dataset using BERT and DistilBERT
def extract_date_topic_sentiment_mobile(emails, keywords):
    """Tag keyword-bearing emails with their date and two model sentiment labels.

    Uses the module-level ``bert_classifier`` and ``distilbert_classifier``
    pipelines; the label of the first result each returns is recorded.

    Args:
        emails: Iterable of raw email texts (headers and body in one string).
        keywords: Iterable of lower-case keywords; each is looked up as a
            substring of the lower-cased email text.

    Returns:
        A list of dicts with keys ``'Date'`` (UTC timestamp parsed from the
        first ``Date:`` header line), ``'Topic'`` (the matched keyword),
        ``'BERT Sentiment'`` and ``'DistilBERT Sentiment'``. One dict is
        produced per (email, matched keyword) pair. Emails without a
        ``Date:`` line, with an unparseable date, or without any keyword are
        skipped.
    """
    discussion_details = []

    for email in emails:
        # Extract the date of the email
        date_match = re.search(r'\nDate: (.+)\n', email)
        if not date_match:
            continue

        try:
            email_date = pd.to_datetime(date_match.group(1).strip(), utc=True)
        except ValueError:
            continue  # Skip email if date format is not recognized

        # Find the mentioned topics before running the models: emails without
        # any keyword produce no output, and two transformer inferences per
        # discarded email are by far the most expensive step of this loop.
        email_lower = email.lower()
        matched_keywords = [keyword for keyword in keywords if keyword in email_lower]
        if not matched_keywords:
            continue

        # Analyze sentiment of the email using BERT and DistilBERT - Truncation is handled within the pipelines
        bert_sentiment = bert_classifier(email)[0]['label']
        distilbert_sentiment = distilbert_classifier(email)[0]['label']

        for keyword in matched_keywords:
            discussion_details.append({
                'Date': email_date,
                'Topic': keyword,
                'BERT Sentiment': bert_sentiment,
                'DistilBERT Sentiment': distilbert_sentiment
            })

    return discussion_details

# Run the keyword/sentiment extraction over every email
discussion_details_new_mobile = extract_date_topic_sentiment_mobile(
    emails=emails_new_mobile,
    keywords=important_keywords_mobile,
)

# Tabulate the records and derive the calendar year of each message
df_discussion_details_new_mobile = pd.DataFrame(discussion_details_new_mobile).assign(
    Year=lambda frame: frame['Date'].dt.year
)

# Mentions per year (rows) and topic (columns)
discussion_by_year_new_mobile = (
    df_discussion_details_new_mobile
    .groupby(['Year', 'Topic'])
    .size()
    .unstack(fill_value=0)
)

# Label counts per topic, one table per model
overall_sentiment_bert = (
    df_discussion_details_new_mobile
    .groupby('Topic')['BERT Sentiment']
    .value_counts()
    .unstack(fill_value=0)
)
overall_sentiment_distilbert = (
    df_discussion_details_new_mobile
    .groupby('Topic')['DistilBERT Sentiment']
    .value_counts()
    .unstack(fill_value=0)
)

# Per-topic totals side by side with each model's label counts
combined_data_mobile_bert = pd.concat(
    [discussion_by_year_new_mobile.sum(axis=0), overall_sentiment_bert],
    axis=1,
).fillna(0)
combined_data_mobile_distilbert = pd.concat(
    [discussion_by_year_new_mobile.sum(axis=0), overall_sentiment_distilbert],
    axis=1,
).fillna(0)

# First column is the total; the rest keep whatever labels each model produced
columns_bert = ['Total Mentions', *overall_sentiment_bert.columns]
columns_distilbert = ['Total Mentions', *overall_sentiment_distilbert.columns]
combined_data_mobile_bert.columns = columns_bert
combined_data_mobile_distilbert.columns = columns_distilbert

# Write the BERT report
output_file_bert = '/content/db_discussion_details_bert.txt'
with open(output_file_bert, 'w') as f_bert:
    f_bert.write(
        "Combined Topic Analysis for DB Mailing List (BERT)\n"
        "===============================================================\n\n"
        "Overall Combined Data:\n"
    )
    for topic, data in combined_data_mobile_bert.iterrows():
        f_bert.write(f"Topic: {topic}\n")
        for column in columns_bert:
            f_bert.write(f"  {column}: {int(data[column])}\n")
        f_bert.write("\n")

print(f"BERT results saved to {output_file_bert}")

# Write the DistilBERT report
output_file_distilbert = '/content/db_discussion_details_distilbert.txt'
with open(output_file_distilbert, 'w') as f_distilbert:
    f_distilbert.write(
        "Combined Topic Analysis for DB Mailing List (DistilBERT)\n"
        "======================================================================\n\n"
        "Overall Combined Data:\n"
    )
    for topic, data in combined_data_mobile_distilbert.iterrows():
        f_distilbert.write(f"Topic: {topic}\n")
        for column in columns_distilbert:
            f_distilbert.write(f"  {column}: {int(data[column])}\n")
        f_distilbert.write("\n")

print(f"DistilBERT results saved to {output_file_distilbert}")


BERT results saved to /content/db_discussion_details_bert.txt
DistilBERT results saved to /content/db_discussion_details_distilbert.txt


DOWNLOAD THE RESULT FILE

In [None]:
# Colab-only: hand both reports written by the BERT / DistilBERT cell to the
# browser as downloads. `output_file` is not used here because the BERT cell
# never assigns it - it would still point at a report from an earlier cell
# (or be undefined on a fresh kernel).
from google.colab import files
files.download(output_file_bert)
files.download(output_file_distilbert)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>