In [2]:
import json
import tiktoken     # token counting
import numpy as np 
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
from openai_finetune_tools import OpenAIFineTuneTools 
import pandas as pd
import numpy as np
import openai
import os 
import random
import math
import keyring
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

# Data Loading

In [3]:
# Kaggle sentiment analysis for finance
dataset_path = '../data/FinancialPhraseBank/all-data.csv'
df = pd.read_csv(dataset_path, engine='python', encoding='ISO-8859-1')
df

Unnamed: 0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4843,negative,Net sales of the Paper segment decreased to EU...


## Insert prompt

In [4]:
prompt_sys = """You are an sentiment analyzer specialized in classifying sentiment of short financial texts.
Your task is to analyze the sentiment of the provided financial text and convert it into string format. Never include any other information or strings but output formt.

Follow these steps and respond only in the specified output format:

# Step 1: Read the provided financial text carefully.

# Step 2: Assign a sentiment score between 0 and 1 based on financial perspective.

# Step 3: Do a sentimental analysis and classify it into positive, negative or neutral category and get the reason why in the financial perspective.

# Step 4: Convert the classification into the specified output format.

#### output format:
<sentimental analysis>

### Example
# Text : The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported 
# Output : negative
# Text : Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .
# Output : neutral
# Text : 'With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .'
# Output : positive
# Text : Rinkuskiai 's beer sales fell by 6.5 per cent to 4.16 million litres , while Kauno Alus ' beer sales jumped by 6.9 per cent to 2.48 million litres.
# Output : neutral
"""
prompt_user = f"What is the sentiment of this sentence? {df.iloc[0][1]}"
prompt_assistant = f"{df.iloc[0][0]}"
# data format : dictionary
# data_format = f"""{{"messages": [{{"role": "system", "content": "{prompt_sys}"}}, {{"role": "user", "content": "{prompt_user}"}}, {{"role": "assistant", "content": "{prompt_assistant}"}}]}}"""
data_format = {
    "messages": [
        {"role": "system", "content": prompt_sys},
        {"role": "user", "content": prompt_user},
        {"role": "assistant", "content": prompt_assistant}
    ]
}

data_format

{'messages': [{'role': 'system',
   'content': "You are an sentiment analyzer specialized in classifying sentiment of short financial texts.\nYour task is to analyze the sentiment of the provided financial text and convert it into string format. Never include any other information or strings but output formt.\n\nFollow these steps and respond only in the specified output format:\n\n# Step 1: Read the provided financial text carefully.\n\n# Step 2: Assign a sentiment score between 0 and 1 based on financial perspective.\n\n# Step 3: Do a sentimental analysis and classify it into positive, negative or neutral category and get the reason why in the financial perspective.\n\n# Step 4: Convert the classification into the specified output format.\n\n#### output format:\n<sentimental analysis>\n\n### Example\n# Text : The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of 

In [5]:
# function for random n samples / split train and test dataset
def get_random_samples(samples=df, n_random=100, test_size=0.3):
    df_sampled = df.sample(n=100)
    df_sampled_train = df_sampled.sample(round(n_random * (1 -test_size)))
    df_sampled_test = df_sampled.drop(df_sampled_train.index)
    return df_sampled_train, df_sampled_test

In [5]:
a, b = get_random_samples()

# Data Preparation

In [25]:
prompt_sys = """You are an sentiment analyzer specialized in classifying sentiment of short financial texts.
Your task is to analyze the sentiment of the provided financial text and convert it into string format. Never include any other information or strings but output formt.

Follow these steps and respond only in the specified output format:

# Step 1: Read the provided financial text carefully.

# Step 2: Assign a sentiment score between 0 and 1 based on financial perspective.

# Step 3: Do a sentimental analysis and classify it into positive, negative or neutral category and get the reason why in the financial perspective.

# Step 4: Convert the classification into the specified output format.

#### output format:
<sentimental analysis>

### Example
# Text : The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported 
# Output : negative
# Text : Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .
# Output : neutral
# Text : 'With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .'
# Output : positive
# Text : Rinkuskiai 's beer sales fell by 6.5 per cent to 4.16 million litres , while Kauno Alus ' beer sales jumped by 6.9 per cent to 2.48 million litres.
# Output : neutral
"""
prompt_user = f"What is the sentiment of this sentence? {a.iloc[0][1]}"
prompt_assistant = f"{a.iloc[0][0]}"
# data format : dictionary
# data_format = f"""{{"messages": [{{"role": "system", "content": "{prompt_sys}"}}, {{"role": "user", "content": "{prompt_user}"}}, {{"role": "assistant", "content": "{prompt_assistant}"}}]}}"""
data_format = {
    "messages": [
        {"role": "system", "content": prompt_sys},
        {"role": "user", "content": prompt_user},
        {"role": "assistant", "content": prompt_assistant}
    ]
}

data_format

{'messages': [{'role': 'system',
   'content': "You are an sentiment analyzer specialized in classifying sentiment of short financial texts.\nYour task is to analyze the sentiment of the provided financial text and convert it into string format. Never include any other information or strings but output formt.\n\nFollow these steps and respond only in the specified output format:\n\n# Step 1: Read the provided financial text carefully.\n\n# Step 2: Assign a sentiment score between 0 and 1 based on financial perspective.\n\n# Step 3: Do a sentimental analysis and classify it into positive, negative or neutral category and get the reason why in the financial perspective.\n\n# Step 4: Convert the classification into the specified output format.\n\n#### output format:\n<sentimental analysis>\n\n### Example\n# Text : The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of 

In [26]:
jason = json.dumps(data_format)
jason

'{"messages": [{"role": "system", "content": "You are an sentiment analyzer specialized in classifying sentiment of short financial texts.\\nYour task is to analyze the sentiment of the provided financial text and convert it into string format. Never include any other information or strings but output formt.\\n\\nFollow these steps and respond only in the specified output format:\\n\\n# Step 1: Read the provided financial text carefully.\\n\\n# Step 2: Assign a sentiment score between 0 and 1 based on financial perspective.\\n\\n# Step 3: Do a sentimental analysis and classify it into positive, negative or neutral category and get the reason why in the financial perspective.\\n\\n# Step 4: Convert the classification into the specified output format.\\n\\n#### output format:\\n<sentimental analysis>\\n\\n### Example\\n# Text : The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracte

In [6]:
def get_json_from_df(df, prompt=prompt_sys):
    # List to hold individual formatted messages
    formatted_messages = [] 
    
    # Iterate through each row in the DataFrame
    for _, row in df.iterrows():
        prompt_user = f"What is the sentiment of this sentence? {row[1]}"
        prompt_assistant = f"{row[0]}"
        data_format = {
            "messages": [
                {"role": "system", "content": prompt_sys},
                {"role": "user", "content": prompt_user},
                {"role": "assistant", "content": prompt_assistant}
            ]
        }
        formatted_messages.append(data_format)
        
    return formatted_messages

In [7]:
def to_json_file(file_path, formatted_messages):
    with open(file_path, 'w') as f:
        for message in formatted_messages:
            f.write(json.dumps(message) + '\n')

In [29]:
training_df, test_df = get_random_samples(samples=df, n_random=20)

In [30]:
data_formatted_training = get_json_from_df(training_df)
data_formatted_test = get_json_from_df(test_df)

In [31]:
file_path_train = '../data/jsonl/sentimen_analysis_finance_train_20240922.jsonl'
to_json_file(file_path_train, data_formatted_training)

In [32]:
file_path_test = '../data/jsonl/sentimen_analysis_finance_test_20240922.jsonl'
to_json_file(file_path_test, data_formatted_test)

# Explore Data

In [33]:
# format validate
openaitools_train = OpenAIFineTuneTools(file_path_train)
validate_message_train = openaitools_train.format_validate()
validate_message_train

'No error found'

In [34]:
# token count warning
openaitools_train.token_counts_warning()

Num example  missing system message: 0
Num examples missing user message: 0

### Distribution of num_message_per_example:
min / max : 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

### Distribution of num_total_tokens_per_example:
min / max : 376, 418
mean / median: 393.0, 393.0
p5 / p95: 382.3, 406.2

### Distribution of num_assistant_token_per_example:
min / max : 1, 1
mean / median: 1.0, 1.0
p5 / p95: 1.0, 1.0

0 examples may be over the 16,385 token limit, they will be truncated during fine-tuning


In [35]:
# cost estimation
openaitools_train.cost_estimation()

Dataset has ~5502 tokens that will be charged for during training
By default, you'll train for 7 epochs on this dataset
By default, you'll be charged for ~38514 tokens


# Upload a training file

In [36]:
from openai import OpenAI

client = OpenAI(api_key=keyring.get_password('openai', 'key_for_windows'))
client.files.create(
    file=open('../data/jsonl/sentimen_analysis_finance_train_20240922.jsonl', 'rb'),
    purpose="fine-tune"
)

FileObject(id='file-07Go87ONQ8wDtdWrgZl1MdYM', bytes=27463, created_at=1726994724, filename='sentimen_analysis_finance_train_20240922.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [38]:
client.files.create(
    file=open('../data/jsonl/sentimen_analysis_finance_test_20240922.jsonl', 'rb'),
    purpose="fine-tune"
)

FileObject(id='file-jyjVgePzWeZnz8fdrMzh4YcG', bytes=168774, created_at=1726994789, filename='sentimen_analysis_finance_test_20240922.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

# Create a fine-tune model

In [39]:
from openai import OpenAI
client = OpenAI(api_key=keyring.get_password('openai', 'key_for_windows'))

client.fine_tuning.jobs.create(
    training_file='file-07Go87ONQ8wDtdWrgZl1MdYM',
    model='ft:gpt-4o-mini-2024-07-18:personal::AACNmEyN',
    validation_file='file-jyjVgePzWeZnz8fdrMzh4YcG'
)

FineTuningJob(id='ftjob-Nm6R5YMSMQrEKAb9iKjx9HGJ', created_at=1726994839, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='ft:gpt-4o-mini-2024-07-18:personal::AACNmEyN', object='fine_tuning.job', organization_id='org-YfLvFUJwemLm4m8qa44pA6ht', result_files=[], seed=1375213043, status='validating_files', trained_tokens=None, training_file='file-07Go87ONQ8wDtdWrgZl1MdYM', validation_file='file-jyjVgePzWeZnz8fdrMzh4YcG', estimated_finish=None, integrations=[], user_provided_suffix=None)

# Use a fine-tuned model

In [8]:
sample_data, _ = get_random_samples(samples=df, n_random=100, test_size=0)

In [9]:
# prompt for the sentimental analysis
prompt = """You are an sentiment analyzer specialized in classifying sentiment of short financial texts.
Your task is to analyze the sentiment of the provided financial text and convert it into string format. Never include any other information or strings but output formt.

Follow these steps and respond only in the specified output format:

# Step 1: Read the provided financial text carefully.

# Step 2: Assign a sentiment score between 0 and 1 based on financial perspective.

# Step 3: Do a sentimental analysis and classify it into positive, negative or neutral category and get the reason why in the financial perspective.

# Step 4: Convert the classification into the specified output format.

#### output format:
<sentimental analysis>

### Example
# Text : The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported 
# Output : negative
# Text : Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .
# Output : neutral
# Text : 'With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .'
# Output : positive
# Text : Rinkuskiai 's beer sales fell by 6.5 per cent to 4.16 million litres , while Kauno Alus ' beer sales jumped by 6.9 per cent to 2.48 million litres.
# Output : neutral
"""

In [2]:
## llm model
from openai import OpenAI
import keyring
import pandas as pd
# sentimental analysis

def sentiment_analysis(prompt=prompt, content=None, model='fine_tuned'):
    # clent
    client = OpenAI(api_key=keyring.get_password('openai', 'key_for_windows'))
    query = prompt + "\n\n#### Text:\n\n" + content
    # getting model's response
    if model == 'fine_tuned':
        model = 'ft:gpt-4o-mini-2024-07-18:personal::AACh9rWY'
    else:
        model = model
    
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {'role':'system', 'content':'You are a helpful assistant.'},
            {'role':'user', 'content':query}
        ]
    )
    return completion.choices[0].message.content

ModuleNotFoundError: No module named 'keyring'

In [127]:
y_pred = sentiment_analysis(content=sample_data.iloc[3][1])
y_true = sample_data.iloc[3][0]
print(y_true, y_pred)

neutral neutral


In [129]:
y_pred = sentiment_analysis(content=sample_data.iloc[3][1], model='gpt-4o-mini')
y_true = sample_data.iloc[3][0]
print(y_true, y_pred)

neutral neutral


In [135]:
## llama model
from openai import OpenAI
from langchain_community.chat_models import ChatOllama
import keyring
import pandas as pd
# sentimental analysis

def sentiment_analysis_llama(prompt=prompt, content=None):
    # clent
    client = ChatOllama(model='llama3.1')
    query = prompt + "\n\n#### Text:\n\n" + content
    # getting model's response
    
    completion = client.invoke(query)
    return completion.content

In [130]:
# extract random 100 test case with 10 iterations
import random
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')       # warning message not showing


# fine tuned model
accuracy_list = []

for n in tqdm(range(3)):
    y_true_list = []
    y_pred_list = []
    sample_data, _ = get_random_samples(samples=df, n_random=10, test_size=0)
    accuracy = 0
    for i, d in enumerate(sample_data.iterrows()):
        y_true = sample_data.iloc[i][0]
        y_pred = sentiment_analysis(content=sample_data.iloc[i][1])
        y_true_list.append(y_true)
        y_pred_list.append(y_pred)
    accuracy = accuracy_score(y_true_list, y_pred_list)
    print(accuracy)
    accuracy_list.append(accuracy)
accuracy_list   

  0%|          | 0/3 [00:00<?, ?it/s]

0.8
0.9
0.8


[0.8, 0.9, 0.8]

In [132]:
# gpt-4o-mini model
accuracy_list = []

for n in tqdm(range(3)):
    y_true_list = []
    y_pred_list = []
    sample_data, _ = get_random_samples(samples=df, n_random=10, test_size=0)
    accuracy = 0
    for i, d in enumerate(sample_data.iterrows()):
        y_true = sample_data.iloc[i][0]
        y_pred = sentiment_analysis(content=sample_data.iloc[i][1], model='gpt-4o-mini')
        y_true_list.append(y_true)
        y_pred_list.append(y_pred)
    accuracy = accuracy_score(y_true_list, y_pred_list)
    print(accuracy)
    accuracy_list.append(accuracy)
accuracy_list

  0%|          | 0/3 [00:00<?, ?it/s]

0.9
0.8
0.8


[0.9, 0.8, 0.8]

In [136]:
# llama3.1
accuracy_list = []

for n in tqdm(range(3)):
    y_true_list = []
    y_pred_list = []
    sample_data, _ = get_random_samples(samples=df, n_random=10, test_size=0)
    accuracy = 0
    for i, d in enumerate(sample_data.iterrows()):
        y_true = sample_data.iloc[i][0]
        y_pred = sentiment_analysis_llama(content=sample_data.iloc[i][1])
        y_true_list.append(y_true)
        y_pred_list.append(y_pred)
    accuracy = accuracy_score(y_true_list, y_pred_list)
    print(accuracy)
    accuracy_list.append(accuracy)
accuracy_list

  0%|          | 0/3 [00:00<?, ?it/s]

0.6
0.5
0.6


[0.6, 0.5, 0.6]