In [None]:
# Run configuration for this notebook; later cells read these names.
# School identifier; its lower-cased form is used to build the file names below.
SCHOOL = "USF"
# Input CSV (per the notebook description, produced by Sentiment_Dataset_Maker).
data_path = f"../bias_processing/data/1/{SCHOOL.lower()}_dataset.poli.csv"
# Output CSV that will hold the input rows plus the added sentiment columns.
output_path = f"../bias_processing/data/3/{SCHOOL.lower()}_dataset_summarizer.poli.csv"
# Sentiment backend selector; used as the default `model` argument of get_sentiment.
model = "nltk_sia"

In [None]:
"""
Load in a csv from Sentiment_Dataset_Maker and add 4x3x3 columns
4 topics ("Israel", "Palestine", "India", "China")
3 hypotheses for sentiment (Positive, Negative, Neutral)
3 levels of granularity
'sentence': Compute sentiment for the entire, unsummarized article.
'paragraph': Summarize each paragraph (only paragraphs longer than 40 words are kept) using an ML summarization model, join those summaries into one body of text, and compute sentiment for this new version of the article.
'article': Summarize the entire article in one pass using the same ML model, and compute sentiment for this new version of the article.
Save a new csv with these added columns

"""

In [None]:
# Notebook magic: install the third-party packages imported by the cells below.
%pip install transformers nltk

In [None]:
import nltk
# Fetch the VADER lexicon resource; the SentimentIntensityAnalyzer imported from
# nltk.sentiment.vader in the next cell needs it at instantiation time.
nltk.download('vader_lexicon')

In [None]:
import pandas as pd
import csv
import os
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from statistics import mean
from transformers import pipeline

# Shared summarization pipeline (t5-base), created once at cell execution and
# reused by summarize_paragraphs and summarize_full_text below.
summarizer = pipeline('summarization', model='t5-base')

def summarize_paragraphs(text):
    """Summarize every long paragraph of `text` and join the summaries.

    The text is split on newline characters. Paragraphs with 40 words or
    fewer (including empty lines) are dropped; each remaining paragraph is
    passed to the shared `summarizer` with `min_length=15` and a
    `max_length` of half its word count minus one.

    Args:
        text: Article text with paragraphs separated by '\\n'.

    Returns:
        The paragraph summaries joined with '\\n'. Empty string when no
        paragraph exceeds 40 words. Sentiment is NOT computed here.
    """
    summaries = []
    for paragraph in text.split('\n'):
        word_count = len(paragraph.split())
        # Short paragraphs are skipped entirely rather than kept verbatim.
        if word_count <= 40:
            continue
        result = summarizer(
            paragraph,
            max_length=int(word_count / 2) - 1,
            min_length=15,
        )
        summaries.append(result[0]['summary_text'])
    return '\n'.join(summaries)

def summarize_full_text(text):
    """Summarize the whole article in a single call to the shared `summarizer`.

    Args:
        text: Full article text.

    Returns:
        The 'summary_text' of the first result, generated with
        `min_length=50` and `max_length=512`. Sentiment is NOT computed here.
    """
    # NOTE(review): the text is passed without truncation; confirm that very
    # long articles are acceptable input for the configured model.
    outputs = summarizer(text, max_length=512, min_length=50)
    first_result = outputs[0]
    return first_result['summary_text']

# Sentiment analyzers keyed by model name. Building a SentimentIntensityAnalyzer
# loads the VADER lexicon, so it is done once and reused across calls.
_ANALYZERS = {}


def _get_analyzer(model):
    """Return the cached analyzer for `model`; raise ValueError if unsupported."""
    if model != "nltk_sia":
        raise ValueError(
            f"Unsupported sentiment model: {model!r} (expected 'nltk_sia')"
        )
    if model not in _ANALYZERS:
        _ANALYZERS[model] = SentimentIntensityAnalyzer()
    return _ANALYZERS[model]


def get_sentiment(text, granularity, keyword, model=model, method='avg'):
    """Return the sentiment scores of `text` at the requested granularity.

    Args:
        text: Article text to score.
        granularity: 'paragraph' scores the joined per-paragraph summaries
            (see summarize_paragraphs); 'article' scores a single summary of
            the whole text (see summarize_full_text); any other value
            (e.g. 'sentence') scores the text as given.
        keyword: Accepted for interface compatibility; not used here.
        model: Sentiment backend name. Only "nltk_sia" is supported.
        method: Accepted for interface compatibility; not used here.

    Returns:
        A tuple `(neg, pos, neu)` taken from the analyzer's polarity scores.

    Raises:
        ValueError: If `model` is not a supported backend. This is checked
            before any summarization so a bad configuration fails fast.
    """
    analyzer = _get_analyzer(model)

    # TODO: Revise and check paragraph splitting, may have issues with article splitting
    if granularity == 'paragraph':
        text = summarize_paragraphs(text)
    elif granularity == 'article':
        text = summarize_full_text(text)

    # polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound';
    # only the first three are returned to the caller.
    scores = analyzer.polarity_scores(text)
    return scores['neg'], scores['pos'], scores['neu']

In [None]:
# Project-local helper (module name is spelled "sentiment_calculater" in the repo).
from sentiment_calculater import build_csv

# Hand the input path, output path and the scoring callback to build_csv.
# NOTE(review): presumably build_csv reads data_path, calls get_sentiment for each
# row/topic/granularity and writes output_path — verify in sentiment_calculater.
build_csv(data_path, output_path, get_sentiment)