In [1]:
from pytube import YouTube, Channel
import whisper
import os
os.environ["IMAGEIO_FFMPEG_EXE"] = "/usr/bin/ffmpeg"
from moviepy.editor import AudioFileClip
from googleapiclient.discovery import build
from datetime import datetime, timedelta
from dotenv import load_dotenv
from transformers import AutoTokenizer 
from transformers import AutoModelForSequenceClassification
from transformers import pipeline  

In [2]:
from pytube import YouTube, Channel
import whisper
import os
os.environ["IMAGEIO_FFMPEG_EXE"] = "/usr/bin/ffmpeg"
from moviepy.editor import AudioFileClip
from googleapiclient.discovery import build
from datetime import datetime, timedelta
from dotenv import load_dotenv
from transformers import AutoTokenizer 
from transformers import AutoModelForSequenceClassification
from transformers import pipeline  

In [3]:
# the above libraries are for the other parts of the project as well. 
# This is how we get our function called final_df

import pandas as pd
import openai
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Initialize OpenAI API
openai.api_key = '' # use your api_key 

#  selected Huggingface models with their corresponding paths. We pick FinancialBert and ProsusAI/finbert
models = {
    "FinancialBERT": "yiyanghkust/finbert-tone",
    #"bert-base-uncased": "bert-base-uncased", we take this model out since it's a binary statement as opposed to
    # multiclass
    "ProsusAI/finbert": "ProsusAI/finbert"
}

def analyze_sentiment_gpt35turbo(text):
    """
    Analyze sentiment of a given text using GPT-3.5-turbo.

    Parameters:
    - text (str): The input text to analyze.

    Returns:
    - sentiment (str): The sentiment label of the text.
    """
    if not text:
        return "We need text"  

    messages = [
        {"role": "system", "content": "You are a sentiment analysis assistant."},
        {"role": "user", "content": f"Please analyze the sentiment of the following text and classify it as Positive, Negative, or Neutral:\n\n{text}"}
    ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )

    sentiment = response.choices[0].message['content'].strip()
    return sentiment

def analyze_sentiment_huggingface(text, model_path):
    """
    Analyze sentiment of a given text using a specified pre-trained model from Hugging Face.

    Parameters:
    - text (str): The input text to analyze.
    - model_path (str): The path to the pre-trained model.

    Returns:
    - label (str): The sentiment label of the text.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
    
    result = sentiment_analyzer(text)[0]
    return result['label']

def analyze_csv_sentiments(input_csv, output_csv):
    """
    Analyze sentiments of text data in a CSV file using multiple pre-trained models and
    store the results in separate columns for each model.

    Parameters:
    - input_csv (str): Path to the input CSV file containing text data.
    - output_csv (str): Path to save the output CSV file with sentiment analysis results.

    Returns:
    - df (DataFrame): The updated DataFrame with sentiment analysis results.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(input_csv)

    # Ensure there is a column to analyze
    if 'text' not in df.columns:
        raise ValueError("The CSV file must contain a 'text' column for analysis.")

    # Analyze sentiment for each row using each model and add results to new columns
    df['gpt-3.5-turbo_sentiment'] = df['text'].apply(analyze_sentiment_gpt35turbo)
    
    for model_name, model_path in models.items():
        df[f'{model_name}_sentiment'] = df['text'].apply(lambda text: analyze_sentiment_huggingface(text, model_path))

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    
    # Return the updated DataFrame
    return df


In [6]:
# This will take a while and this is how we generate our file with the above function
import warnings
warnings.filterwarnings("ignore")
input_csv = "test_file.csv"  # Path to your input CSV file
output_csv = "output_with_sentiments1.csv"  # Path to save the output CSV file with sentiments
result_df = analyze_csv_sentiments(input_csv, output_csv)
print(result_df.head())  # Print the first few rows of the updated DataFrame

In [1]:
# This is for simplifying the answer of gpt 3.5 turboso we only get the positive, neutral and negative from the 
# entire sentiment sentence. 

def find_sentiment_words(input_csv, output_csv, column_name):
    df = pd.read_csv(input_csv)
    
    if column_name not in df.columns:
        raise ValueError(f"The column {column_name} does not in the CSV file.")
        
    sentiment_words = ["Positive","Neutral","Negative"]
    
    df['chatgpt_sentiment'] = None
    
    for index,row in df.iterrows():
        text = row[column_name]
        
        for word in sentiment_words:
            if pd.notnull(text) and word.lower() in text.lower():
                df.at[index, 'chatgpt_sentiment'] = word
                break
    
    df.to_csv(output_csv, index = False)
    
    return df

In [2]:
input_csv = "output_with_sentiments1.csv"
output_csv = "updated_sentiments.csv"
column_name = "gpt-3.5-turbo_sentiment"


In [7]:
# updated version of our file with chatgpt_sentiment column added 
final_df = find_sentiment_words(input_csv,output_csv, column_name)
final_df

In [8]:
# Again just some processing on the columns like doing upper case so all match. 

        
replacements = {"positive":"Positive", 
                "neutral":"Neutral",
               "negative":"Negative"
    }
for old_word, new_word in replacements.items(): 
        final_df['FinancialBERT_sentiment'].replace(old_word, new_word, inplace=True)
        
        # print(final_df)        
replacements = {"positive":"Positive", 
                "neutral":"Neutral",
               "negative":"Negative"
    }
for old_word, new_word in replacements.items(): 
        final_df['ProsusAI/finbert_sentiment'].replace(old_word, new_word, inplace=True) 
        
       # print(final_df)

In [None]:
# (4846, 6) # kaggle row number is checked 
final_df.shape

In [9]:
# saving the file that we have the sentiment model results in, call it "test_models"

final_df.to_csv('test_models.csv')

### We now need to check these sentiment labels against the Kaggle dataframe: 

In [None]:
# Kaggle datafram is stored in test: 

test_file = "/Users/nazanin.komeilizadeh/Desktop/test.csv"
test = pd.read_csv(test_file)

In [None]:
replacements = {"positive":"Positive", 
                "neutral":"Neutral",
               "negative":"Negative"
    }
for old_word, new_word in replacements.items(): 
        test['kaggle_label'].replace(old_word, new_word, inplace=True)
        

In [None]:
test.kaggle_label.unique()

In [None]:
test = test.drop(['all_data'], axis = 1)

In [None]:
# save into a csv file 
test.to_csv('kaggle.csv')

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score

# here, we want to check the sentiment analysis of each model against the Kaggle label 

# columns to compare
columns_final_df = ['FinancialBERT_sentiment','ProsusAI/finbert_sentiment', 'chatgpt_sentiment']
column_test = 'kaggle_label'

# making sure DataFrames have the same length for row-wise comparison
if len(final_df) != len(test):
    raise ValueError("DataFrames must have the same number of rows for comparison.")

# Create new columns in final_df to store the match results
for col in columns_final_df:
    final_df[f'{col}_match'] = final_df[col] == test[column_test]

In [None]:
final_df.to_csv('match.csv')

In [None]:
# slicing the datafram to only the last 3 comparision columns

accuracy = final_df[final_df.columns[-3:]]


In [None]:
accuracy.columns


In [None]:
accuracy['FinancialBERT_sentiment_match'].value_counts()


In [None]:
accuracy['ProsusAI/finbert_sentiment_match'].value_counts()


In [None]:
accuracy['chatgpt_sentiment_match'].value_counts()


In [10]:
accu_finbert = len(accuracy[accuracy['FinancialBERT_sentiment_match']==1])/len(accuracy)
# 0.79

In [11]:
# turns out it's the best model for accuracy
accu_prosus = len(accuracy[accuracy['ProsusAI/finbert_sentiment_match']==True])/len(accuracy)
# 0.8895

In [12]:
accu_chatgpt = len(accuracy[accuracy['chatgpt_sentiment_match']==True])/len(accuracy)
# 0.712

# END 