In [98]:
import csv
import glob
import multiprocessing
import os
import threading
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, BertForSequenceClassification

## 1. Reading CSV files

In [90]:
# UDF for BERT-based sentiment classification

# Building the tokenizer/model pair is by far the most expensive step, so it is
# done once and the pair is shared by every later call.
_FINBERT_CHECKPOINT = "ProsusAI/finbert"
_finbert_lock = threading.Lock()
_finbert_state = {}


def _getFinbert():
    """Return the shared ``(tokenizer, model)`` pair, loading it on first use."""
    # The lock keeps concurrent first callers from each loading their own copy.
    with _finbert_lock:
        if "pair" not in _finbert_state:
            tokenizer = AutoTokenizer.from_pretrained(_FINBERT_CHECKPOINT)
            model = BertForSequenceClassification.from_pretrained(_FINBERT_CHECKPOINT)
            model.eval()
            _finbert_state["pair"] = (tokenizer, model)
        return _finbert_state["pair"]


def sentimentWithBert(input_text):
    """Return the softmax class probabilities FinBERT assigns to ``input_text``.

    Parameters
    ----------
    input_text : str or list of str
        Text handed straight to the tokenizer (padded and truncated). When a
        list is given, only the probabilities of its first item are returned.

    Returns
    -------
    list of float
        One probability per output class of the checkpoint. Callers in this
        notebook read indices 0, 1 and 2 as positive, negative and neutral.

    Raises
    ------
    Exception
        Whatever the tokenizer or the model raises for unusable input is
        propagated unchanged.
    """
    tokenizer, model = _getFinbert()

    # NOTE(review): Hugging Face fast tokenizers have been reported to raise
    # "Already borrowed" when one instance is used from several threads at once.
    # This function is mapped over a ThreadPool, so the call is serialised as a
    # precaution.
    with _finbert_lock:
        inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors='pt')

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

    return predictions[0].tolist()

In [91]:
# Index every CSV shard as (position, file name up to its first dot, path as returned by glob)
files = glob.glob("us_2024_news.csv/*.csv")
fileInfoObjectList = [
    (position, path.rsplit("/", 1)[-1].partition(".")[0], path)
    for position, path in enumerate(files)
]

# Peek at the first entry to check the tuple layout
fileInfoObjectList[0]

(0,
 'part-00296-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000',
 'us_2024_news.csv/part-00296-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000.csv')

In [95]:
def generateSentiment(fileInfo):
    """Score the first 1000 rows of one news CSV with FinBERT and save the result.

    Parameters
    ----------
    fileInfo : tuple
        ``(idx, fileName, fileDir)``: a running index, the file name used to
        build the output name, and the path of the CSV to read.

    Returns
    -------
    tuple
        ``(idx, fileDir, message)`` where ``message`` is ``"Succeeded"`` or a
        description of the failure. Errors raised while reading, scoring or
        writing are caught, printed and reported through ``message`` instead of
        being raised.

    Notes
    -----
    The scored frame is written to
    ``sentiment_processed/{fileName}_processed.csv``; the directory is created
    if it does not exist. Rows whose ``ContextualText`` is not a string (for
    example a missing value) get NaN scores instead of failing the whole file.
    """
    idx = fileInfo[0]
    fileName = fileInfo[1]
    fileDir = fileInfo[2]

    score_columns = ["positive_fb", "negative_fb", "neutral_fb"]

    try:
        raw_df = pd.read_csv(fileDir,header=None,on_bad_lines='skip',low_memory=False)
        # Slice first so only the rows that are kept get copied.
        df = raw_df.head(1000).copy()
        df.columns = ["DateTime","URL","Title","SharingImage","LangCode","DocTone","DomainCountryCode","Location","Lat","Lon","CountryCode","Adm1Code","Adm2Code","GeoType","ContextualText","the_geom","date"]

        # Iterating the text column directly also behaves on an empty frame,
        # where a row-wise apply does not return a Series.
        predictions = [
            sentimentWithBert(text) if isinstance(text, str) else [float("nan")] * len(score_columns)
            for text in df["ContextualText"]
        ]
        # dtype=object keeps each per-row list as a single cell.
        df["prediction"] = pd.Series(predictions, index=df.index, dtype=object)
        for position, column in enumerate(score_columns):
            df[column] = [scores[position] for scores in predictions]

        # exist_ok makes this safe when several worker threads reach it together.
        os.makedirs("sentiment_processed", exist_ok=True)
        df.to_csv(f"sentiment_processed/{fileName}_processed.csv",index=False)
        failure_msg = "Succeeded"
        print(f"{idx} {fileName} Succeeded")

    except Exception as e:
        failure_msg = f"{idx} file: {fileDir} failed with error: {e}"
        print(failure_msg)

    return (idx,fileDir,failure_msg)

In [96]:
generateSentiment(fileInfoObjectList[0])

0 part-00296-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded


(0,
 'us_2024_news.csv/part-00296-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000.csv',
 'Succeeded')

In [93]:
# Fan the per-file jobs out over ten worker threads; map() blocks until every
# file has been processed and keeps the results in input order.
with ThreadPool(processes=10) as pool:
    results = pool.map(generateSentiment, fileInfoObjectList)
    pool.close()
    pool.join()

# One row per input file: the tuple returned by generateSentiment
results_df = pd.DataFrame(data=results)

30 part-00738-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded
40 part-00095-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded
0 part-00296-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded
10 part-00015-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded
20 part-00957-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded
35 part-00235-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded
15 part-00925-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded
5 part-00768-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded
25 part-00717-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded
41 part-00673-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded
45 part-00772-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded
31 part-00502-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded
1 part-00246-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded
36 part-00264-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded
11 part-00750-57f7270d-f9cc-406c-8b8c-af8163fc3683-c000 Succeeded
21 part-00195

In [99]:
# Number of CPUs reported for this machine
multiprocessing.cpu_count()

8

In [None]:
# Collect every CSV file in the sampled_file_45 directory
chunk_pattern = "/Users/yining/Desktop/Big_Data_Project/sampling/us_2024_news_sampled/sampled_file_45/*.csv"
chunk_files = glob.glob(chunk_pattern)
chunk_files

In [110]:
# Row count of the first chunk file
len(pd.read_csv(chunk_files[0]).index)

246

In [None]:
# Path of the first chunk file returned by the glob
chunk_files[0]