In [16]:
SCHOOL = "McGill"
data_path = f"bias_processing/data/1/{SCHOOL.lower()}_dataset.csv"
output_path = f"bias_processing/data/3/{SCHOOL.lower()}_dataset_summarizer.csv"
model = "nltk_sia"

In [None]:
"""
Load in a csv from Sentiment_Dataset_Maker and add 4x3x3 columns
4 topics ("Israel", "Palestine", "India", "China")
3 hypotheses for sentiment (Positive, Negative, Neutral)
3 levels of granularity
'sentence':Compute sentiment for entire article
'paragraph':Summarizes each paragraph using an ML summarizing model, and join those summaries to one body of text. Compute sentiment for this new article version.
'article':Summarize the entire article in one go using the same ML model. Compute sentiment for this new article version
Save a new csv with these added columns

"""

In [None]:
%pip install transformers nltk textblob


In [None]:
import nltk
nltk.download('vader_lexicon')

In [None]:
import pandas as pd
import csv
import os
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from statistics import mean
from transformers import pipeline
from textblob import TextBlob

summarizer = pipeline('summarization', model='t5-base')

# Summarize each paragraph, join summaries, and compute sentiment
def summarize_paragraphs(text):
    paragraphs = text.split('\n')
    summarized_text = '\n'.join([summarizer(para, max_length=int(len(para.split())/2) - 1, min_length=15)[0]['summary_text']
                                for para in paragraphs if para and len(para.split()) > 40])
    return summarized_text

# Full text summarization and sentiment calculation
def summarize_full_text(text):
    summarized_text = summarizer(text, max_length=512, min_length=50)[0]['summary_text']
    return summarized_text


# Function to return the sentiment of a text
def get_sentiment(text, granularity, keyword, model=model, method='avg'):
    if model == "nltk_sia":
        # Instantiate the sentiment analyzer
        sia = SentimentIntensityAnalyzer()
        # Output is a dict containing {'neg','pos','neu','composition'}. First three are needed for all future functionality
        def get_model_scores(text):
            scores = sia.polarity_scores(text)
            return scores
        def get_keys(text):
            return sia.polarity_scores(text).keys()

    if granularity in ['paragraph','article']:
        if granularity == 'paragraph':
            # Calculate the polarity scores for each paragraph and store them in a list
            # TODO: Revise and check paragraph splitting, may have issues with article splitting
            text = summarize_paragraphs(text)
        elif granularity == 'article':
            text = summarize_full_text(text)

    scores = get_model_scores(text)
    return scores['neg'], scores['pos'], scores['neu']

# Load the csv
df = pd.read_csv(data_path)

granularities = ['article','paragraph','sentence']

output_file_exists = os.path.exists(output_path)

if output_file_exists:
    # If it exists, read the output file and create a list of existing dates
    with open(output_path, 'r', newline='\n') as existing_file:
        existing_dates = set()
        reader = csv.reader(existing_file)
        next(reader)  # Skip the header row
        for row in reader:
            existing_dates.add(row[0])  # Assuming the date is in the first column

    # Filter the DataFrame to skip rows with existing dates
    df = df[~df['date'].isin(existing_dates)]


# Open the output CSV file for writing
with open(output_path, 'a', newline='\n', encoding='utf-8') as output_file:
    csv_writer = csv.writer(output_file)

    # Write the header row
    if not output_file_exists:
      header = list(df.columns)
      for granularity in granularities:
          header.extend([f'{granularity}_neg', f'{granularity}_pos', f'{granularity}_neu'])
      csv_writer.writerow(header)

    # Process sentiment analysis and write to the CSV file line by line
    for index, row in df.iterrows():
        article = row['article']
        keyword = row['keyword']

        output_row = [row['date'], row['school'], keyword, article]
        # Perform sentiment analysis for each granularity
        for granularity in granularities:
            neg, pos, neu = get_sentiment(article, granularity, keyword)

            # Prepare the row to write to the CSV file
            output_row.extend([neg, pos, neu])

            # Print the line with sentiment analysis results
            print(f"Article: {article[:30]}... | Keyword: {keyword} | Granularity: {granularity} | Neg: {neg} | Pos: {pos} | Neu: {neu}")

        # Write the row to the CSV file
        csv_writer.writerow(output_row)

# The output CSV file will be automatically closed after the 'with' block

In [1]:
### THIS BLOCK SHOULD BE ALTERED SUCH THAT THE SUMMARIZER 
### PARSES THROUGH THE WHOLE DATASET. THIS BLOCK IS A WORK
### IN PROGRESS


import pandas as pd
from transformers import pipeline
from textblob import TextBlob
import numpy as np
import csv
import concurrent.futures
import os
import time

SCHOOL = "McGill"
data_path = f"bias_processing/data/1/{SCHOOL.lower()}_dataset.csv"
output_path = f"bias_processing/data/3/{SCHOOL.lower()}_dataset_summarizer.csv"

print('Starting the script...')

# Load model once
print('Loading summarization model...')
summarizer = pipeline('summarization', model='t5-base', device=-1)  # Use CPU
print('Summarization model loaded.')

# Sentiment calculation
def calculate_sentiment(text):
    return TextBlob(text).sentiment.polarity

# Function to process a single row
def process_row(summarizer, row):
    try:
        start_time = time.time()
        
        date = row['date']
        school = row['school']
        keyword = row['keyword']
        article = row['article']

        # Calculate sentiment for the entire article
        print('Calculating sentiment for the entire article...')
        article_sentiment = calculate_sentiment(article)

        # Summarize each paragraph, join summaries, and compute sentiment
        print('Summarizing each paragraph...')
        paragraphs = article.split('\n')
        summarized_text = ' '.join([summarizer(para, max_length=512, min_length=50)[0]['summary_text']
                                    for para in paragraphs if para and len(para.split()) > 50])
        summary_paragraphs_sentiment = calculate_sentiment(summarized_text)

        # Summarize the entire article in one go and compute sentiment
        print('Summarizing the entire article...')
        summarized_text = summarizer(article, max_length=512, min_length=50)[0]['summary_text']
        summary_full_text_sentiment = calculate_sentiment(summarized_text)

        # Print that the article has been processed
        print(f'Article processed for {date} - {school} - {keyword}')
        print(f'Time taken: {time.time() - start_time} seconds')

        return {
            'Date': date, 'School': school, 'Keyword': keyword,
            'Type': 'Positive',
            'Full Article': max(0, article_sentiment),
            'Article Summary': max(0, summary_full_text_sentiment),
            'Paragraph Summary': max(0, summary_paragraphs_sentiment)
        }, {
            'Date': date, 'School': school, 'Keyword': keyword,
            'Type': 'Negative',
            'Full Article': min(0, article_sentiment),
            'Article Summary': min(0, summary_full_text_sentiment),
            'Paragraph Summary': min(0, summary_paragraphs_sentiment)
        }, {
            'Date': date, 'School': school, 'Keyword': keyword,
            'Type': 'Neutral',
            'Full Article': 1 - abs(article_sentiment),
            'Article Summary': 1 - abs(summary_full_text_sentiment),
            'Paragraph Summary': 1 - abs(summary_paragraphs_sentiment)
        }
    except Exception as e:
        print(f'Error processing row: {e}')
        return None

# Load data
print('Loading data...')
df = pd.read_csv(data_path)
print('Data loaded.')

# Limit analysis to the first 20 rows
df = df.head(20)

# Open CSV file for writing
print('Opening CSV file for writing...')
with open(output_path, 'w', newline='') as csvfile:
    fieldnames = ['Date', 'School', 'Keyword', 'Type', 'Full Article', 'Article Summary', 'Paragraph Summary']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    print('CSV file opened for writing.')

    # Process rows in parallel
    print('Processing rows in parallel...')
    with concurrent.futures.ProcessPoolExecutor(os.cpu_count()) as executor:
        futures = [executor.submit(process_row, summarizer, row) for _, row in df.iterrows()]
        for future in concurrent.futures.as_completed(futures):
            print('Processing completed row...')
            result = future.result()
            if result is not None:
                positive, negative, neutral = result
                writer.writerow(positive)
                writer.writerow(negative)
                writer.writerow(neutral)

print('Script completed.')

  from .autonotebook import tqdm as notebook_tqdm


Starting the script...
Loading summarization model...


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Summarization model loaded.
Loading data...
Data loaded.
Opening CSV file for writing...
CSV file opened for writing.
Processing rows in parallel...
Processing completed row...
