In [4]:
SCHOOL = "USC"
data_path = f"../bias_processing/data/1/{SCHOOL.lower()}_dataset.csv"
output_path = f"../bias_processing/data/3/{SCHOOL.lower()}_dataset_summarizer.csv"
model = "nltk_sia"

In [None]:
"""
Load in a csv from Sentiment_Dataset_Maker and add 4x3x3 columns
4 topics ("Israel", "Palestine", "India", "China")
3 hypotheses for sentiment (Positive, Negative, Neutral)
3 levels of granularity
'sentence':Compute sentiment for entire article
'paragraph':Summarizes each paragraph using an ML summarizing model, and join those summaries to one body of text. Compute sentiment for this new article version.
'article':Summarize the entire article in one go using the same ML model. Compute sentiment for this new article version
Save a new csv with these added columns

"""

In [None]:
%pip install transformers nltk

In [5]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\adaml\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
import pandas as pd
import csv
import os
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from statistics import mean
from transformers import pipeline

summarizer = pipeline('summarization', model='t5-base')

# Summarize each paragraph, join summaries, and compute sentiment
def summarize_paragraphs(text):
    paragraphs = text.split('\n')
    summarized_text = '\n'.join([summarizer(para, max_length=int(len(para.split())/2) - 1, min_length=15)[0]['summary_text']
                                for para in paragraphs if para and len(para.split()) > 40])
    return summarized_text

# Full text summarization and sentiment calculation
def summarize_full_text(text):
    summarized_text = summarizer(text, max_length=512, min_length=50)[0]['summary_text']
    return summarized_text

# Function to return the sentiment of a text
def get_sentiment(text, granularity, keyword, model=model, method='avg'):
    if model == "nltk_sia":
        # Instantiate the sentiment analyzer
        sia = SentimentIntensityAnalyzer()
        # Output is a dict containing {'neg','pos','neu','composition'}. First three are needed for all future functionality
        def get_model_scores(text):
            scores = sia.polarity_scores(text)
            return scores
        def get_keys(text):
            return sia.polarity_scores(text).keys()

    if granularity in ['paragraph','article']:
        if granularity == 'paragraph':
            # Calculate the polarity scores for each paragraph and store them in a list
            # TODO: Revise and check paragraph splitting, may have issues with article splitting
            text = summarize_paragraphs(text)
        elif granularity == 'article':
            text = summarize_full_text(text)

    scores = get_model_scores(text)
    return scores['neg'], scores['pos'], scores['neu']

  from .autonotebook import tqdm as notebook_tqdm
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [6]:
from sentiment_calculater import build_csv

build_csv(data_path, output_path, get_sentiment)

Article: May 18, 2023


Cristina Foglie... | Keyword: China | Granularity: article | Neg: 0.051 | Pos: 0.0 | Neu: 0.949
Article: May 18, 2023


Cristina Foglie... | Keyword: China | Granularity: paragraph | Neg: 0.034 | Pos: 0.071 | Neu: 0.895
Article: May 18, 2023


Cristina Foglie... | Keyword: China | Granularity: sentence | Neg: 0.013 | Pos: 0.101 | Neu: 0.887
Article: May 18, 2023


Erin Mei
Staff ... | Keyword: China | Granularity: article | Neg: 0.143 | Pos: 0.058 | Neu: 0.799
Article: May 18, 2023


Erin Mei
Staff ... | Keyword: China | Granularity: paragraph | Neg: 0.046 | Pos: 0.082 | Neu: 0.872
Article: May 18, 2023


Erin Mei
Staff ... | Keyword: China | Granularity: sentence | Neg: 0.042 | Pos: 0.123 | Neu: 0.835
Article: May 18, 2023


Amanda Bernocco... | Keyword: China | Granularity: article | Neg: 0.0 | Pos: 0.048 | Neu: 0.952
Article: May 18, 2023


Amanda Bernocco... | Keyword: China | Granularity: paragraph | Neg: 0.069 | Pos: 0.0 | Neu: 0.931
Article: May 18, 2023


Your max_length is set to 512, but your input_length is only 369. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=184)


Article: May 18, 2023

By Zach Taber, S... | Keyword: India | Granularity: paragraph | Neg: 0.0 | Pos: 0.099 | Neu: 0.901
Article: May 18, 2023

By Zach Taber, S... | Keyword: India | Granularity: sentence | Neg: 0.01 | Pos: 0.122 | Neu: 0.867
Article: May 18, 2023

Last updated on ... | Keyword: India | Granularity: article | Neg: 0.089 | Pos: 0.141 | Neu: 0.769
Article: May 18, 2023

Last updated on ... | Keyword: India | Granularity: paragraph | Neg: 0.036 | Pos: 0.115 | Neu: 0.849
Article: May 18, 2023

Last updated on ... | Keyword: India | Granularity: sentence | Neg: 0.062 | Pos: 0.142 | Neu: 0.797
Article: May 18, 2023

By Shelley Dean,... | Keyword: India | Granularity: article | Neg: 0.021 | Pos: 0.083 | Neu: 0.896
Article: May 18, 2023

By Shelley Dean,... | Keyword: India | Granularity: paragraph | Neg: 0.0 | Pos: 0.21 | Neu: 0.79
Article: May 18, 2023

By Shelley Dean,... | Keyword: India | Granularity: sentence | Neg: 0.005 | Pos: 0.17 | Neu: 0.824
Article: May 18, 2023



Your max_length is set to 512, but your input_length is only 10. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


Article: May 18, 2023

By Alex Espinosa... | Keyword: Israel | Granularity: paragraph | Neg: 0.087 | Pos: 0.185 | Neu: 0.728
Article: May 18, 2023

By Alex Espinosa... | Keyword: Israel | Granularity: sentence | Neg: 0.134 | Pos: 0.219 | Neu: 0.647
Article: May 18, 2023

    
Published i... | Keyword: Israel | Granularity: article | Neg: 0.0 | Pos: 0.0 | Neu: 1.0
Article: May 18, 2023

    
Published i... | Keyword: Israel | Granularity: paragraph | Neg: 0.0 | Pos: 0.0 | Neu: 0.0
Article: May 18, 2023

    
Published i... | Keyword: Israel | Granularity: sentence | Neg: 0.0 | Pos: 0.0 | Neu: 1.0
Article: May 18, 2023

By Margaret Pepe... | Keyword: Palestine | Granularity: article | Neg: 0.0 | Pos: 0.048 | Neu: 0.952
Article: May 18, 2023

By Margaret Pepe... | Keyword: Palestine | Granularity: paragraph | Neg: 0.0 | Pos: 0.042 | Neu: 0.958
Article: May 18, 2023

By Margaret Pepe... | Keyword: Palestine | Granularity: sentence | Neg: 0.01 | Pos: 0.065 | Neu: 0.924
