This python notebook performs sentiment analysis using LLM Models from LLM Models.ipynb to generate sentiments at a multi-threaded level.

In [3]:
import pandas as pd
import torch
from transformers import AutoTokenizer, BertForSequenceClassification
import csv
import numpy as np
import glob
from multiprocessing.pool import ThreadPool

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# UDF for BERT-based sentiment classification
def sentimentWithBert(input_text):

    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    model = BertForSequenceClassification.from_pretrained("ProsusAI/finbert")

    inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors='pt')
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # index_of_largest = torch.argmax(predictions).item()
    # sentiments = ['positive', 'negative', 'neutral']
    
    # sentiment = sentiments[index_of_largest]
    
    return predictions.detach().numpy()[0].tolist()

In [5]:
files = glob.glob("us_2024_news_sampled/*.csv")
fileInfoObjectList = []
idx = 0
for file in files:
    fileName = file.split("/")[-1]
    fileName = fileName.split(".")[0]
    fileInfoObjectList.append((idx,fileName,file))
    idx+=1


fileInfoObjectList[1]

(1, 'sampled_file_20', 'us_2024_news_sampled/sampled_file_20.csv')

In [16]:
def generateSentiment(fileInfo):
    idx = fileInfo[0]
    fileName = fileInfo[1]
    fileDir = fileInfo[2]

    try:
        raw_df = pd.read_csv(fileDir,header=None,on_bad_lines='skip',low_memory=False)
        df = raw_df.copy()
        df.columns = ["DateTime","URL","Title","SharingImage","LangCode","DocTone","DomainCountryCode","Location","Lat","Lon","CountryCode","Adm1Code","Adm2Code","GeoType","ContextualText","the_geom","date"]
        df = df.dropna(subset=["ContextualText"])
        df["prediction"] = df.apply(lambda row : sentimentWithBert(row["ContextualText"]),axis=1)
        df["positive_fb"] = df.apply(lambda row : row["prediction"][0],axis=1)
        df["negative_fb"] = df.apply(lambda row : row["prediction"][1],axis=1)
        df["neutral_fb"] = df.apply(lambda row : row["prediction"][2],axis=1)
        df.to_csv(f"sentiment_sampled_processed/sampled_file_45_processed/{fileName}_processed.csv",index=False)
        failure_msg = "Succeeded"
        print(f"{idx} {fileName} Succeeded")
        
    except Exception as e:
        failure_msg = f"{idx} file: {fileDir} failed with error: {e}"
        print(failure_msg)
    
    return (idx,fileDir,failure_msg)

In [11]:
with ThreadPool(10) as pool:
    results = pool.map(generateSentiment, fileInfoObjectList)
    pool.close()
    pool.join()
    results_df = pd.DataFrame(data = results)

24 sampled_file_81 Succeeded
15 sampled_file_8 Succeeded
18 sampled_file_30 Succeeded
21 sampled_file_43 Succeeded
12 sampled_file_19 Succeeded
0 sampled_file_34 Succeeded
27 sampled_file_56 Succeeded
9 sampled_file_32 Succeeded
6 sampled_file_36 Succeeded
3 sampled_file_35 Succeeded
25 sampled_file_95 Succeeded
16 sampled_file_9 Succeeded
28 sampled_file_40 Succeeded
19 sampled_file_18 Succeeded
22 sampled_file_94 Succeeded
13 sampled_file_31 Succeeded
4 sampled_file_23 Succeeded
7 sampled_file_22 Succeeded
1 sampled_file_20 Succeeded
10 sampled_file_33 Succeeded
26 sampled_file_42 Succeeded
29 sampled_file_54 Succeeded
23 sampled_file_80 Succeeded
17 sampled_file_24 Succeeded
20 sampled_file_57 Succeeded
14 sampled_file_25 Succeeded
5 sampled_file_37 Succeeded
2 sampled_file_21 Succeeded
11 sampled_file_27 Succeeded
8 sampled_file_26 Succeeded
39 file: us_2024_news_sampled/sampled_file_45.csv failed with error: text input must be of type `str` (single example), `List[str]` (batch or 

## 2. Fixing incorrect sample file

In [6]:
fileInfoObjectList[39]

(39, 'sampled_file_45', 'us_2024_news_sampled/sampled_file_45.csv')

In [7]:
raw_df = pd.read_csv("us_2024_news_sampled/sampled_file_45.csv",header=None,on_bad_lines='skip',low_memory=False)
raw_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,2024-05-14 12:30:00+08:00,https://www.mdjonline.com/state/georgia-prison...,Georgia prison inmate pleads guilty in meth co...,https://bloximages.newyork1.vip.townnews.com/m...,eng,-8.415842,US,"Georgia, United States",32.9866,-83.6487,US,USGA,,2.0,meth he is facing a minimum sentence of 15 yea...,POINT(-83.6487 32.9866),2024-05-14
1,2024-05-14 12:30:00+08:00,https://www.mdjonline.com/news/local/events-an...,Events and Happenings : The Week of May 13,https://bloximages.newyork1.vip.townnews.com/m...,eng,2.450593,US,"Mableton, Georgia, United States",33.8187,-84.5824,US,USGA,GA067,3.0,and tips on bicycles and a test run around the...,POINT(-84.5824 33.8187),2024-05-14
2,2024-05-14 12:30:00+08:00,https://www.latimes.com/environment/story/2024...,Is there bird flu in California wastewater ? -...,https://california-times-brightspot.s3.amazona...,eng,-2.624309,US,"San Diego, California, United States",32.7153,-117.1570,US,USCA,CA073,3.0,an out of season spike in influenza a flu viru...,POINT(-117.157 32.7153),2024-05-14
3,2024-05-14 12:30:00+08:00,https://www.rfglobalnet.com/doc/mobix-labs-ann...,Mobix Labs Announces Strategic Acquisition Of ...,,eng,1.923077,US,"Irvine, California, United States",33.6695,-117.8230,US,USCA,CA059,3.0,consideration consisted of approximately 2 mil...,POINT(-117.823 33.6695),2024-05-14
4,2024-05-14 12:30:00+08:00,https://www.rutlandherald.com/news/community/r...,Rutland Herald Community News,https://bloximages.chicago2.vip.townnews.com/r...,eng,1.944107,US,"Vermont, United States",44.0407,-72.7093,US,USVT,,2.0,recovery retreat in ludlow founded by phish tr...,POINT(-72.7093 44.0407),2024-05-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1963,2024-05-18 00:00:00+08:00,https://carsonnow.org/?page=8&search_theme_for...,Carson City Nevada News - Carson Now | Your on...,,eng,1.801201,US,"Carson City, Nevada, United States",39.1638,-119.7670,US,USNV,NV510,3.0,historical society hosts adam michalski who wi...,POINT(-119.767 39.1638),2024-05-18
1964,2024-05-18 00:00:00+08:00,http://www.elwakt.com/%20https:/www.elwakt.com...,【 demo slot pragmatic sweet bonanza xmas 】- to...,,eng,4.769231,YM,"Oklahoma, United States",35.5376,-96.9247,US,USOK,,2.0,gas costs battling health challenges living on...,POINT(-96.9247 35.5376),2024-05-18
1965,2024-05-18 00:00:00+08:00,https://nbc16.com/newsletter-daily/oregon-food...,Oregon Food Bank claims US House Farm bill wou...,https://nbc16.com/resources/media/3cfbcd41-b22...,eng,-1.438849,US,"Oregon, United States",44.5672,-122.1270,US,USOR,,2.0,on health dietary cost studies we know that hu...,POINT(-122.127 44.5672),2024-05-18
1966,2024-05-18 00:00:00+08:00,https://www.bignewsnetwork.com/news/274376606/...,Caitlin Clark show comes to Big Apple as Fever...,https://cdn.bignewsnetwork.com/flm1715960409.jpg,eng,0.705882,US,"Indiana, United States",39.8647,-86.2604,US,USIN,,2.0,the responsibility that comes with it im very ...,POINT(-86.2604 39.8647),2024-05-18


In [8]:
num_chunks = 8
# Calculate the number of rows per chunk
rows_per_chunk = len(raw_df) // num_chunks

# Loop through and create each chunk
for i in range(num_chunks):
    start_index = i * rows_per_chunk
    # For the last chunk, take the remainder of rows
    end_index = (i + 1) * rows_per_chunk if i < num_chunks - 1 else len(raw_df)
    
    chunk = raw_df.iloc[start_index:end_index]
    chunk.to_csv(f"us_2024_news_sampled/sampled_file_45/sampled_file_45_chunk_{i + 1}.csv", index=False)

In [36]:
chunk_files = glob.glob("us_2024_news_sampled/sampled_file_45/*.csv")
chunkFileInfoObjectList = []
idx = 0
for file in chunk_files:
    fileName = file.split("/")[-1]
    fileName = fileName.split(".")[0]
    chunkFileInfoObjectList.append((idx,fileName,file))
    idx+=1


In [17]:
with ThreadPool(10) as pool:
    results = pool.map(generateSentiment, chunkFileInfoObjectList)
    pool.close()
    pool.join()
    results_df = pd.DataFrame(data = results)

1 sampled_file_45_chunk_7 Succeeded
6 sampled_file_45_chunk_2 Succeeded
4 sampled_file_45_chunk_1 Succeeded
3 sampled_file_45_chunk_4 Succeeded
0 sampled_file_45_chunk_6 Succeeded
7 sampled_file_45_chunk_8 Succeeded
5 sampled_file_45_chunk_3 Succeeded
2 sampled_file_45_chunk_5 Succeeded


In [42]:
#generating sample_file_45:
chunk_processed_files = glob.glob("sentiment_sampled_processed/sampled_file_45_processed/*.csv")

sample_processed_file_45_df = pd.DataFrame()
for file_dir in chunk_processed_files:
    temp = pd.read_csv(file_dir,header=1)
    sample_processed_file_45_df = pd.concat([sample_processed_file_45_df,temp],axis=0)


In [47]:
sample_processed_file_45_df.columns = ['DateTime', 'URL', 'Title', 'SharingImage', 'LangCode', 'DocTone',
       'DomainCountryCode', 'Location', 'Lat', 'Lon', 'CountryCode',
       'Adm1Code', 'Adm2Code', 'GeoType', 'ContextualText', 'the_geom', 'date',
       'prediction', 'positive_fb', 'negative_fb', 'neutral_fb']

In [50]:
sample_processed_file_45_df.to_csv("sentiment_sampled_processed/sampled_file_45_processed.csv",index=False)

## 3. combining sentiment with pagerank

In [80]:
# combining all processed_sample_files into one df
processed_sample_files = glob.glob("sentiment_sampled_processed/*.csv")
combined_processed_df = pd.DataFrame()

for fileDir in processed_sample_files:
    temp = pd.read_csv(fileDir)
    combined_processed_df = pd.concat([combined_processed_df,temp],axis=0)

In [81]:
combined_processed_df["website"] = combined_processed_df["URL"].apply(lambda x:x.split("/")[2])
combined_processed_df["website"] = combined_processed_df["website"].apply(lambda x:x.split(":")[0])
sample_domain_list = list(set(list(combined_processed_df["website"])))
pd.DataFrame(sample_domain_list).to_csv("sample_domain_list.csv",header=False,index=False)

In [96]:
page_rank_df = pd.read_csv("sample_page_rank_domains.csv",index_col=0)
page_rank_df = page_rank_df[["website","page_rank_decimal"]]

page_rank_df


Unnamed: 0_level_0,website,page_rank_decimal
status_code,Unnamed: 1_level_1,Unnamed: 2_level_1
200,rightwingnews.com,4.91
200,www.swishappeal.com,4.89
200,www.tsln.com,5.11
200,1035kissfmboise.com,4.48
200,myradioplace.com,3.41
...,...,...
200,www.piquenewsmagazine.com,4.89
200,adventuremomblog.com,3.77
200,www.980waav.com,4.90
200,www.maqnews.com,2.94


In [105]:
combined_sentiment_page_rank_df[["DateTime","website","page_rank_decimal","Date_hourly","positive_fb","negative_fb","neutral_fb"]]

Unnamed: 0,DateTime,website,page_rank_decimal,Date_hourly,positive_fb,negative_fb,neutral_fb
0,2024-06-14 02:45:00+08:00,www.beckershospitalreview.com,5.72,2024061402,0.040248,0.022995,0.936758
1,2024-06-14 02:45:00+08:00,www.finsmes.com,5.48,2024061402,0.441696,0.009332,0.548972
2,2024-06-14 02:45:00+08:00,www.yahoo.com,7.63,2024061402,0.418814,0.028759,0.552427
3,2024-06-14 02:45:00+08:00,www.goshennews.com,5.16,2024061402,0.026196,0.674003,0.299801
4,2024-06-14 02:45:00+08:00,www.beckershospitalreview.com,5.72,2024061402,0.043051,0.021397,0.935552
...,...,...,...,...,...,...,...
196844,2024-04-02 09:00:00+08:00,www.ktep.org,4.88,2024040209,0.110620,0.033546,0.855833
196845,2024-04-02 09:00:00+08:00,963theblaze.com,4.78,2024040209,0.023493,0.032889,0.943618
196846,2024-04-02 09:30:00+08:00,www.yahoo.com,7.63,2024040209,0.072654,0.136838,0.790508
196847,2024-04-02 09:30:00+08:00,www.washingtonpost.com,8.18,2024040209,0.034147,0.814993,0.150860


In [97]:
combined_sentiment_page_rank_df = combined_processed_df.merge(page_rank_df,left_on="website",right_on="website",how="left")
combined_sentiment_page_rank_df.shape[0]

196849

In [98]:
combined_sentiment_page_rank_df['DateTime'] = pd.to_datetime(combined_sentiment_page_rank_df['DateTime'])
combined_sentiment_page_rank_df['Date_hourly'] = combined_sentiment_page_rank_df['DateTime'].dt.strftime('%Y%m%d%H')
combined_sentiment_page_rank_df.to_csv("final_sample_processed_with_pageRank.csv",index=False)

## 4. Aggregating

In [102]:
combined_sentiment_page_rank_df[["DateTime","ContextualText","website","positive_fb","negative_fb","neutral_fb"]]

Unnamed: 0,DateTime,ContextualText,website,positive_fb,negative_fb,neutral_fb
0,2024-06-14 02:45:00+08:00,and offer support that something were always l...,www.beckershospitalreview.com,0.040248,0.022995,0.936758
1,2024-06-14 02:45:00+08:00,data centers a denver co based global provider...,www.finsmes.com,0.441696,0.009332,0.548972
2,2024-06-14 02:45:00+08:00,up in groveport madison in fiscal year 2023 31...,www.yahoo.com,0.418814,0.028759,0.552427
3,2024-06-14 02:45:00+08:00,the felony arrest of an elkhart man who was al...,www.goshennews.com,0.026196,0.674003,0.299801
4,2024-06-14 02:45:00+08:00,for them and offer support that something were...,www.beckershospitalreview.com,0.043051,0.021397,0.935552
...,...,...,...,...,...,...
196844,2024-04-02 09:00:00+08:00,said he is both concerned by rhetoric of preju...,www.ktep.org,0.110620,0.033546,0.855833
196845,2024-04-02 09:00:00+08:00,as a typical home using the zillow home values...,963theblaze.com,0.023493,0.032889,0.943618
196846,2024-04-02 09:30:00+08:00,and speed bumps on missouri avenue to slow dow...,www.yahoo.com,0.072654,0.136838,0.790508
196847,2024-04-02 09:30:00+08:00,to start april 15 should be immediately sancti...,www.washingtonpost.com,0.034147,0.814993,0.150860
