This python notebook performs sentiment analysis using LLM Models from LLM Models.ipynb to generate sentiments at a multi-threaded level.

In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, BertForSequenceClassification
import csv
import numpy as np
import glob
from multiprocessing.pool import ThreadPool

In [2]:
# UDF for BERT-based sentiment classification
def sentimentWithBert(input_text):

    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    model = BertForSequenceClassification.from_pretrained("ProsusAI/finbert")

    inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors='pt')
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # index_of_largest = torch.argmax(predictions).item()
    # sentiments = ['positive', 'negative', 'neutral']
    
    # sentiment = sentiments[index_of_largest]
    
    return predictions.detach().numpy()[0].tolist()

In [4]:
files = glob.glob("us_2024_news_sampled/*.csv")
fileInfoObjectList = []
idx = 0
for file in files:
    fileName = file.split("/")[-1]
    fileName = fileName.split(".")[0]
    fileInfoObjectList.append((idx,fileName,file))
    idx+=1


fileInfoObjectList[1]

(1, 'sampled_file_20', 'us_2024_news_sampled/sampled_file_20.csv')

In [10]:
def generateSentiment(fileInfo):
    idx = fileInfo[0]
    fileName = fileInfo[1]
    fileDir = fileInfo[2]

    try:
        raw_df = pd.read_csv(fileDir,header=None,on_bad_lines='skip',low_memory=False)
        df = raw_df.copy()
        df.columns = ["DateTime","URL","Title","SharingImage","LangCode","DocTone","DomainCountryCode","Location","Lat","Lon","CountryCode","Adm1Code","Adm2Code","GeoType","ContextualText","the_geom","date"]
        df["prediction"] = df.apply(lambda row : sentimentWithBert(row["ContextualText"]),axis=1)
        df["positive_fb"] = df.apply(lambda row : row["prediction"][0],axis=1)
        df["negative_fb"] = df.apply(lambda row : row["prediction"][1],axis=1)
        df["neutral_fb"] = df.apply(lambda row : row["prediction"][2],axis=1)
        df.to_csv(f"sentiment_sampled_processed/{fileName}_processed.csv",index=False)
        failure_msg = "Succeeded"
        print(f"{idx} {fileName} Succeeded")
        
    except Exception as e:
        failure_msg = f"{idx} file: {fileDir} failed with error: {e}"
        print(failure_msg)
    
    return (idx,fileDir,failure_msg)

In [11]:
with ThreadPool(10) as pool:
    results = pool.map(generateSentiment, fileInfoObjectList)
    pool.close()
    pool.join()
    results_df = pd.DataFrame(data = results)

24 sampled_file_81 Succeeded
15 sampled_file_8 Succeeded
18 sampled_file_30 Succeeded
21 sampled_file_43 Succeeded
12 sampled_file_19 Succeeded
0 sampled_file_34 Succeeded
27 sampled_file_56 Succeeded
9 sampled_file_32 Succeeded
6 sampled_file_36 Succeeded
3 sampled_file_35 Succeeded
25 sampled_file_95 Succeeded
16 sampled_file_9 Succeeded
28 sampled_file_40 Succeeded
19 sampled_file_18 Succeeded
22 sampled_file_94 Succeeded
13 sampled_file_31 Succeeded
4 sampled_file_23 Succeeded
7 sampled_file_22 Succeeded
1 sampled_file_20 Succeeded
10 sampled_file_33 Succeeded
26 sampled_file_42 Succeeded
29 sampled_file_54 Succeeded
23 sampled_file_80 Succeeded
17 sampled_file_24 Succeeded
20 sampled_file_57 Succeeded
14 sampled_file_25 Succeeded
5 sampled_file_37 Succeeded
2 sampled_file_21 Succeeded
11 sampled_file_27 Succeeded
8 sampled_file_26 Succeeded
39 file: us_2024_news_sampled/sampled_file_45.csv failed with error: text input must be of type `str` (single example), `List[str]` (batch or 

In [25]:
fileInfoObjectList[39]

(39, 'sampled_file_45', 'us_2024_news_sampled/sampled_file_45.csv')

In [26]:
generateSentiment(fileInfoObjectList[39])

39 file: us_2024_news_sampled/sampled_file_45.csv failed with error: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).


(39,
 'us_2024_news_sampled/sampled_file_45.csv',
 '39 file: us_2024_news_sampled/sampled_file_45.csv failed with error: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).')