In [5]:
# Run configuration: the school whose dataset is scored, the sentiment backend,
# and the input/output CSV locations derived from the school code.
SCHOOL = "LIU"
# Accepted values: "nltk_sia" or "bart".
model = "nltk_sia"
# Both CSV names are prefixed with the lower-cased school code.
data_path = "{}_dataset.csv".format(SCHOOL.lower())
output_path = "{}_dataset_granularity.csv".format(SCHOOL.lower())

In [2]:
"""
Load in a csv from Sentiment_Dataset_Maker and add 4x3x3 columns
4 topics ("Israel", "Palestine", "India", "China")
3 hypotheses for sentiment (Positive, Negative, Neutral)
3 levels of granularity
Compute sentiment for entire article
Compute sentiment for each paragraph in an article, take maximum for Positive and Negative, average for neutral
Make 'average' and 'max' options to a function call so we can change if need be
Compute sentiment for each sentence in an article, similar to paragraph approach
Save a new csv with these added columns
"""

'\nLoad in a csv from Sentiment_Dataset_Maker and add 4x3x3 columns\n4 topics ("Israel", "Palestine", "India", "China")\n3 hypotheses for sentiment (Positive, Negative, Neutral)\n3 levels of granularity\nCompute sentiment for entire article\nCompute sentiment for each paragraph in an article, take maximum for Positive and Negative, average for neutral\nMake \'average\' and \'max\' options to a function call so we can change if need be\nCompute sentiment for each sentence in an article, similar to paragraph approach\nSave a new csv with these added columns\n'

In [3]:
# Install the notebook's dependencies into the kernel's environment.
# NOTE(review): no versions are pinned, so a fresh run resolves whatever is current.
%pip install nltk pandas

# transformers provides the `pipeline` factory used by the "bart" model option.
%pip install transformers


Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m100.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.9 MB/s[0m eta [36m0:00:

In [6]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import pipeline
# Fetch the NLTK resources used below: the VADER lexicon for SentimentIntensityAnalyzer
# and the Punkt sentence-tokenizer data for nltk.sent_tokenize. Recent NLTK releases
# load Punkt from 'punkt_tab' rather than the older 'punkt' package, so both are
# requested to keep the notebook runnable with whichever NLTK version %pip resolves.
for nltk_resource in ('vader_lexicon', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Scoring backends are built lazily and cached here. Constructing an analyzer, and
# especially loading the BART pipeline, is far too slow to repeat for every row.
_SCORER_CACHE = {}

# Candidate labels handed to the zero-shot classifier, mapped to the short score
# keys ('pos', 'neg', 'neu') that get_sentiment reads.
_BART_LABEL_TO_KEY = {'positive': 'pos', 'negative': 'neg', 'neutral': 'neu'}


def _get_scorer(model):
    """Return (building it on first use) a function mapping text to a dict with 'neg', 'pos' and 'neu' scores."""
    if model in _SCORER_CACHE:
        return _SCORER_CACHE[model]

    if model == "nltk_sia":
        sia = SentimentIntensityAnalyzer()
        # polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound';
        # only the first three are used downstream.
        scorer = sia.polarity_scores
    elif model == "bart":
        bart_analyzer = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
        labels = list(_BART_LABEL_TO_KEY)

        def scorer(text):
            result = bart_analyzer(text, labels)
            # The pipeline reports labels sorted by descending score, NOT in the order
            # they were passed in, so each score must be paired with its own label.
            return {
                _BART_LABEL_TO_KEY[label]: score
                for label, score in zip(result['labels'], result['scores'])
            }
    else:
        raise ValueError(f"Unknown model {model!r}; expected 'nltk_sia' or 'bart'")

    _SCORER_CACHE[model] = scorer
    return scorer


def get_sentiment(text, granularity, keyword, model=model, method='avg'):
    """Score the sentiment of ``text`` at the requested granularity.

    Args:
        text: The article text to score.
        granularity: 'article' scores the whole text once; 'paragraph' scores each
            non-empty newline-separated chunk; 'sentence' scores each sentence
            found by ``nltk.sent_tokenize``.
        keyword: Accepted for interface compatibility; not used by the scoring.
        model: 'nltk_sia' or 'bart'. The default is the value of the notebook-level
            ``model`` variable at the time this function is defined.
        method: Accepted for interface compatibility; currently NOT applied.
            Every score is aggregated with ``max`` across segments.
            NOTE(review): the notebook's design note asks for an average for the
            neutral score -- confirm the intended semantics before wiring this up,
            since it changes the default output.

    Returns:
        A ``(neg, pos, neu)`` tuple. For 'paragraph' and 'sentence' each entry is
        the maximum of that score over all segments. If the text yields no
        segments at all, all three entries are NaN (no score available).

    Raises:
        ValueError: If ``model`` or ``granularity`` is not one of the supported values.
    """
    score_text = _get_scorer(model)

    if granularity == 'article':
        segments = [text]
    elif granularity == 'paragraph':
        # TODO: Revise and check paragraph splitting, may have issues with article splitting
        segments = [paragraph for paragraph in text.split('\n') if paragraph]
    elif granularity == 'sentence':
        segments = nltk.sent_tokenize(text)
    else:
        raise ValueError(
            f"Unknown granularity {granularity!r}; expected 'article', 'paragraph' or 'sentence'"
        )

    if not segments:
        # Nothing to score (e.g. empty text): report missing values instead of
        # failing on an empty aggregation.
        missing = float('nan')
        return missing, missing, missing

    segment_scores = [score_text(segment) for segment in segments]

    # Aggregate by key rather than by position so the result does not depend on
    # the order in which a backend happens to list its scores.
    return tuple(
        max(scores[key] for scores in segment_scores)
        for key in ('neg', 'pos', 'neu')
    )

# Load the csv
df = pd.read_csv(data_path)

# Process sentiment analysis: each granularity level adds a
# '<granularity>_neg' / '_pos' / '_neu' column triple to df.
for granularity in ['article', 'paragraph', 'sentence']:
    print(granularity)
    # Walk the two needed columns directly instead of df.apply(axis=1), which
    # builds a Series per row and does not return per-row tuples for an empty frame.
    row_scores = [
        get_sentiment(article, granularity, keyword, model=model)
        for article, keyword in zip(df['article'], df['keyword'])
    ]
    # zip(*[]) yields nothing to unpack, so an empty dataset gets three empty columns.
    neg_scores, pos_scores, neu_scores = zip(*row_scores) if row_scores else ((), (), ())
    df[f'{granularity}_neg'] = list(neg_scores)
    df[f'{granularity}_pos'] = list(pos_scores)
    df[f'{granularity}_neu'] = list(neu_scores)

# Save the output DataFrame into a new CSV file
df.to_csv(output_path, index=False)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


article
paragraph
sentence
