In [16]:
SCHOOL = "McGill"
data_path = f"bias_processing/data/1/{SCHOOL.lower()}_dataset.csv"
output_path = f"bias_processing/data/3/{SCHOOL.lower()}_dataset_summarizer.csv"

In [None]:
"""
Load in a csv from Sentiment_Dataset_Maker and add 4x3x3 columns
4 topics ("Israel", "Palestine", "India", "China")
3 hypotheses for sentiment (Positive, Negative, Neutral)
3 levels of granularity
'sentence':Compute sentiment for entire article
'paragraph':Summarizes each paragraph using an ML summarizing model, and join those summaries to one body of text. Compute sentiment for this new article version.
'article':Summarize the entire article in one go using the same ML model. Compute sentiment for this new article version
Save a new csv with these added columns

"""

In [None]:
%pip install transformers nltk textblob


In [None]:
### THIS CODE BLOCK GENERATES THE APPROPRIATE RESULTS FOR ONE 
### TEST CASE. THE RESULTS CAN BE FOUND IN `sample_mcgill_dataset_summarizer.csv`

import pandas as pd
from transformers import pipeline
from textblob import TextBlob
import numpy as np
import csv

# Text summarization model
summarizer = pipeline('summarization', model='t5-base')

# Sentiment calculation
def calculate_sentiment(text):
    return TextBlob(text).sentiment.polarity

# Paragraph-wise summarization and sentiment calculation
def summarize_paragraphs(text):
    paragraphs = text.split('\n')
    summarized_text = ' '.join([summarizer(para, max_length=min(50, len(para)))[0]['summary_text'] 
                                for para in paragraphs if para])
    return summarized_text, calculate_sentiment(summarized_text)

# Full text summarization and sentiment calculation
def summarize_full_text(text):
    summarized_text = summarizer(text, max_length=min(100, len(text)))[0]['summary_text']
    return summarized_text, calculate_sentiment(summarized_text)

# Load data
df = pd.read_csv("sample_mcgill_article.csv")
df = df.head(1)

# Open CSV file for writing
with open('sample_mcgill_dataset_summarizer.csv', 'w', newline='') as csvfile:
    fieldnames = ['Date', 'School', 'Keyword', 'Type', 'Full Article', 'Article Summary', 'Paragraph Summary']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    
    # Loop through rows in DataFrame
    for _, row in df.iterrows():
        date = row['date']
        school = row['school']
        keyword = row['keyword']
        article = row['article']

        # Calculate sentiment for entire article
        article_sentiment = calculate_sentiment(article)

        # Summarize each paragraph, join summaries, and compute sentiment
        summary_paragraphs, summary_paragraphs_sentiment = summarize_paragraphs(article)

        # Summarize entire article in one go and compute sentiment
        summary_full_text, summary_full_text_sentiment = summarize_full_text(article)

        # Write sentiment results to CSV
        writer.writerow({'Date': date, 'School': school, 'Keyword': keyword, 
                         'Type': 'Positive', 
                         'Full Article': max(0, article_sentiment), 
                         'Article Summary': max(0, summary_full_text_sentiment), 
                         'Paragraph Summary': max(0, summary_paragraphs_sentiment)})
        writer.writerow({'Date': date, 'School': school, 'Keyword': keyword, 
                         'Type': 'Negative', 
                         'Full Article': min(0, article_sentiment), 
                         'Article Summary': min(0, summary_full_text_sentiment), 
                         'Paragraph Summary': min(0, summary_paragraphs_sentiment)})
        writer.writerow({'Date': date, 'School': school, 'Keyword': keyword, 
                         'Type': 'Neutral', 
                         'Full Article': 1 - abs(article_sentiment), 
                         'Article Summary': 1 - abs(summary_full_text_sentiment), 
                         'Paragraph Summary': 1 - abs(summary_paragraphs_sentiment)})


In [32]:
### THIS BLOCK SHOULD BE ALTERED SUCH THAT THE SUMMARIZER 
### PARSES THROUGH THE WHOLE DATASET. THIS BLOCK IS A WORK
### IN PROGRESS

import pandas as pd
from transformers import pipeline
from textblob import TextBlob
import numpy as np
import csv

SCHOOL = "McGill"
data_path = f"bias_processing/data/1/{SCHOOL.lower()}_dataset.csv"
output_path = f"bias_processing/data/3/{SCHOOL.lower()}_dataset_summarizer.csv"

# Text summarization model
summarizer = pipeline('summarization', model='t5-base')

# Sentiment calculation
def calculate_sentiment(text):
    return TextBlob(text).sentiment.polarity

# Summarize each paragraph, join summaries, and compute sentiment
def summarize_paragraphs(text):
    paragraphs = text.split('\n')
    summarized_text = ' '.join([summarizer(para, max_length=512, min_length=50)[0]['summary_text'] 
                                for para in paragraphs if para and len(para.split()) > 50])
    return summarized_text, calculate_sentiment(summarized_text)

# Full text summarization and sentiment calculation
def summarize_full_text(text):
    summarized_text = summarizer(text, max_length=512, min_length=50)[0]['summary_text']
    return summarized_text, calculate_sentiment(summarized_text)

# Load data
df = pd.read_csv(data_path)

# Open CSV file for writing
with open(output_path, 'w', newline='') as csvfile:
    fieldnames = ['Date', 'School', 'Keyword', 'Type', 'Full Article', 'Article Summary', 'Paragraph Summary']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    
    # Loop through rows in DataFrame
    for _, row in df.iterrows():
        date = row['date']
        school = row['school']
        keyword = row['keyword']
        article = row['article']

        # Calculate sentiment for entire article
        article_sentiment = calculate_sentiment(article)

        # Summarize each paragraph, join summaries, and compute sentiment
        summary_paragraphs, summary_paragraphs_sentiment = summarize_paragraphs(article)

        # Summarize entire article in one go and compute sentiment
        summary_full_text, summary_full_text_sentiment = summarize_full_text(article)

        # Write sentiment results to CSV
        writer.writerow({'Date': date, 'School': school, 'Keyword': keyword, 
                         'Type': 'Positive', 
                         'Full Article': max(0, article_sentiment), 
                         'Article Summary': max(0, summary_full_text_sentiment), 
                         'Paragraph Summary': max(0, summary_paragraphs_sentiment)})
        writer.writerow({'Date': date, 'School': school, 'Keyword': keyword, 
                         'Type': 'Negative', 
                         'Full Article': min(0, article_sentiment), 
                         'Article Summary': min(0, summary_full_text_sentiment), 
                         'Paragraph Summary': min(0, summary_paragraphs_sentiment)})
        writer.writerow({'Date': date, 'School': school, 'Keyword': keyword, 
                         'Type': 'Neutral', 
                         'Full Article': 1 - abs(article_sentiment), 
                         'Article Summary': 1 - abs(summary_full_text_sentiment), 
                         'Paragraph Summary': 1 - abs(summary_paragraphs_sentiment)})

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Your max_length is set to 512, but your input_length is only 95. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)
Your max_length is set to 512, but your input_length is only 104. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 512, but your input_length is only 90. Since this is a summarization task