In [15]:
"""TEST BLOCK"""

from transformers import pipeline

# Smoke test: load the sentiment model and score one sample sentence.
# return_all_scores=True requests a score for every label rather than a single
# label (the cell output lists negative / neutral / positive).
# NOTE(review): newer transformers releases reportedly deprecate
# return_all_scores in favour of top_k — confirm against the installed version.
classifier = pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    return_all_scores=True,
)
# Bare call as the last expression so the notebook displays the raw result.
classifier(
    "Make ready to slaughter his sons for the guilt of their fathers; Lest they rise and posses the earth, and fill the breadth of the world with tyrants."
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[[{'label': 'negative', 'score': 0.8345967531204224},
  {'label': 'neutral', 'score': 0.1521468460559845},
  {'label': 'positive', 'score': 0.013256409205496311}]]

In [1]:
import csv
from transformers import pipeline

# Module-level text-classification pipeline shared by process() below.
# With return_all_scores=False, process() reads a single label/score pair from
# result[0] — presumably the top-scoring label; confirm against the pipeline docs.
classifier = pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    return_all_scores=False,
)


def label_to_sentiment(label):
    """Translate a classifier label into the numeric code of the manual annotations.

    Mapping: "neutral" -> 1, "positive" -> 2, "negative" -> 3.
    Any other value yields None.
    """
    codes = (("neutral", 1), ("positive", 2), ("negative", 3))
    # Compare with == (not a dict lookup) so non-hashable inputs still fall
    # through to None instead of raising.
    for name, code in codes:
        if label == name:
            return code
    return None


def process(row, max_length=512):
    """Classify the text of one CSV row and build the annotated output record.

    Args:
        row: Mapping with at least the keys "chunk", "start_citation" and "text".
        max_length: Maximum number of tokens passed to the model; longer texts
            are truncated. Defaults to 512.

    Returns:
        dict with "chunk", "start_citation" and "text" copied from ``row``,
        "sentiment" (numeric code from ``label_to_sentiment``; None for an
        unrecognised label) and "confidence" (the score reported by the
        classifier for that label).
    """
    text = row["text"]
    # Over-long chunks previously made the model raise a RuntimeError, which
    # left those rows with a placeholder 0 / 0.0 annotation. Truncating at the
    # tokenizer keeps them classifiable. max_length is passed explicitly rather
    # than relying on the tokenizer's configured limit.
    result = classifier(text, truncation=True, max_length=max_length)
    prediction = result[0]
    return {
        "chunk": row["chunk"],
        "start_citation": row["start_citation"],
        "text": text,
        "sentiment": label_to_sentiment(prediction["label"]),
        "confidence": prediction["score"],
    }


# def annotate(infile):
# '''testing function on one file instead of a dictionary'''
#     outfile = f"{infile[:-4]}_out.csv"  # output files are named "<infile>_out.csv"
#     with open(infile, mode='r', newline='', encoding='utf-8') as file:
#         reader = csv.DictReader(file)
#         with open(outfile, mode='w', newline='', encoding='utf-8') as out_csv:
#             fields = ['chunk', 'start_citation', 'text', 'sentiment', 'confidence']
#             writer = csv.DictWriter(out_csv, fieldnames=fields)
#             writer.writeheader()
#             for row in reader:
#                 writer.writerow(process(row))

#     print(f"Sentiment analysis completed for {infile}. Results exported to {outfile}")


def annotate(bible_paths):
    """Annotate every CSV listed in ``bible_paths`` with sentiment results.

    ``bible_paths`` maps a short name to an input CSV path. Each input row is
    passed to ``process`` and the returned record is written to
    "<name>_out.csv" in the current working directory. If a RuntimeError is
    raised while handling a row, the row and the error are printed and the row
    is written with sentiment 0 and confidence 0.0 instead.
    """
    columns = ["chunk", "start_citation", "text", "sentiment", "confidence"]
    for name, source_path in bible_paths.items():
        target_path = f"{name}_out.csv"  # output files are named "<bible>_out.csv"
        # The input is opened before the output, so a missing input file never
        # leaves an empty output file behind.
        with open(source_path, mode="r", newline="", encoding="utf-8") as src, open(
            target_path, mode="w", newline="", encoding="utf-8"
        ) as dst:
            sink = csv.DictWriter(dst, fieldnames=columns)
            sink.writeheader()
            for row in csv.DictReader(src):
                try:
                    sink.writerow(process(row))
                except RuntimeError as err:
                    print(f"RuntimeError occurred while processing row: {row}")
                    print(f"Error message: {err}")
                    # Placeholder record: keep the row, zero out the annotation.
                    placeholder = {
                        key: row[key] for key in ("chunk", "start_citation", "text")
                    }
                    placeholder["sentiment"] = 0
                    placeholder["confidence"] = 0.0
                    sink.writerow(placeholder)

        print(f"Sentiment analysis completed for {source_path}. Results exported to {target_path}")

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



In [2]:
# Uncomment the paths to annotate; each key becomes the "<key>_out.csv" output prefix.
bible_paths = {
    # "asv": "../data/bibles_chunked/asv.csv",
    # "fbv": "../data/bibles_chunked/fbv.csv",
    # "web": "../data/bibles_chunked/web.csv",  # TODO: chunk 7909 appears to trigger a RuntimeError because it is too long; needs a fix
    # "wmb": "../data/bibles_chunked/wmb.csv",
    # "kjv": "../data/bibles_chunked/kjv.csv"
}

# With every entry above commented out the mapping is empty, so this call does nothing.
annotate(bible_paths)

Sentiment analysis completed for ../data/bibles_chunked/kjv.csv. Results exported to kjv_out.csv
