# Fetching datasets

In [19]:
import pandas as pd
import os
from classes.finetuning.preprocessing import Preprocessor

preoprocessor = Preprocessor()

## Fetching Openhermes

In [20]:
url_openhermes = "https://datasets-server.huggingface.co/rows?dataset=teknium%2Fopenhermes&config=default&split=train&"
df_openhermes = pd.DataFrame()

try:
    path = os.path.join(os.getcwd(), "backups", "finetuning", "df_openhermes.pkl")

    if os.path.exists(path):
        df_openhermes = pd.read_pickle(path)
        print(f"Backup loaded: {path}")
except Exception as e:
    print(f"An exception occurred: {str(e)}")

if df_openhermes.empty:
    df_openhermes = preoprocessor.fetch_rows(url_openhermes, 5000)
    df_openhermes = preoprocessor.df_openhermes_preproc(df_openhermes)

df_openhermes

Backup loaded: /workspaces/datamanagement_project/backups/finetuning/df_openhermes.pkl


Unnamed: 0,question,answer
0,Write a Perl script that processes a log file ...,```perl\n#!/usr/bin/perl\n\nuse strict;\nuse w...
1,"What can be seen once in a minute, twice in a ...",The letter 'M'.
2,Famous inventors and their inventions: Identif...,1. Thomas Edison: One of his most significant ...
3,Generate a list of 12 words that start with 'qu'.,1. Quail\n2. Quarry\n3. Quasar\n4. Quench\n5. ...
4,"Who was the first woman to win a Nobel Prize, ...",Marie Curie; Physics
...,...,...
4995,"BEGININPUT\nBEGINCONTEXT\ndate: June 12, 2023\...",Some popular sustainable design features menti...
4996,Write an Erlang function that creates a distri...,To create a distributed key-value store using ...
4997,BEGININPUT\nBEGINCONTEXT\nauthor: Orlo Orloff\...,| Name | Sport | Score ...
4998,"Alice had 10 marbles, gave 3 to Bob, and then ...","This question is about marbles, not cookies. A..."


## Fetching SlimOrca

In [21]:
url_slimOrca = "https://datasets-server.huggingface.co/rows?dataset=Open-Orca%2FSlimOrca&config=default&split=train&"
df_slimOrca_clean = pd.DataFrame()

try:
    path = os.path.join(os.getcwd(), "backups", "finetuning", "df_slimOrca_clean.pkl")

    if os.path.exists(path):
        df_slimOrca_clean = pd.read_pickle(path)
        print(f"Backup loaded: {path}")
except Exception as e:
    print(f"An exception occurred: {str(e)}")

if df_slimOrca_clean.empty:
    df_slimOrca = preoprocessor.fetch_rows(url_slimOrca, 5000)
    df_slimOrca_clean = preoprocessor.df_slimOrca_preproc(df_slimOrca)

df_slimOrca_clean

Backup loaded: /workspaces/datamanagement_project/backups/finetuning/df_slimOrca_clean.pkl


Unnamed: 0,question,answer
0,"Write an article based on this ""A man has been...",Title: Tragedy Strikes in Sydney: Victims Stab...
1,Answer the following question: - number is 54 ...,The information provided seems to refer to Ria...
2,Produce a long descriptive sentence that uses ...,"Stretching across a vast areaOfLand, totaling ..."
3,Write a title for this article:\n\nArbitration...,"""The Sneaky Clauses Taking Away Your Day in Co..."
4,"Definition: In this task, you are given a hate...",geopolitical\n\nStep 1: Understand the text\nI...
...,...,...
4995,Here is an article:\n\nOhio high school senior...,"Ohio High School Student Wins $250,000 Scholar..."
4996,Q:Answer the following question given this par...,"The correct answer is called ""epistasis."" When..."
4997,"Teacher:In this task, you are given two phrase...","Yes\nExplanation: In this problem, the Head is..."
4998,,


## Merging the two cleaned datasets

In [22]:
df = pd.concat([df_openhermes, df_slimOrca_clean])
df = df.assign(language="en", accuracy=-1, acc_explanation="")
df

Unnamed: 0,question,answer,language,accuracy,acc_explanation
0,Write a Perl script that processes a log file ...,```perl\n#!/usr/bin/perl\n\nuse strict;\nuse w...,en,-1,
1,"What can be seen once in a minute, twice in a ...",The letter 'M'.,en,-1,
2,Famous inventors and their inventions: Identif...,1. Thomas Edison: One of his most significant ...,en,-1,
3,Generate a list of 12 words that start with 'qu'.,1. Quail\n2. Quarry\n3. Quasar\n4. Quench\n5. ...,en,-1,
4,"Who was the first woman to win a Nobel Prize, ...",Marie Curie; Physics,en,-1,
...,...,...,...,...,...
4995,Here is an article:\n\nOhio high school senior...,"Ohio High School Student Wins $250,000 Scholar...",en,-1,
4996,Q:Answer the following question given this par...,"The correct answer is called ""epistasis."" When...",en,-1,
4997,"Teacher:In this task, you are given two phrase...","Yes\nExplanation: In this problem, the Head is...",en,-1,
4998,,,en,-1,


## Splitting the dataset

In [23]:
import numpy as np

# Split the DataFrame into two parts with 50% of the shuffled rows
df_split = np.array_split(df.sample(frac=1, axis=0, random_state=42).sample(frac=1).reset_index(drop=True), 2)
df_split[0]

  return bound(*args, **kwds)


Unnamed: 0,question,answer,language,accuracy,acc_explanation
0,"Data: name = The Plough, eatType = pub, food =...","To form a sentence using the provided data, I ...",en,-1,
1,Given the question: Information: - Charles Fe...,1995,en,-1,
2,"Which musical instrument, commonly used in jaz...",String Bass,en,-1,
3,Sentiment possibilities Possible answers: [i] ...,"""🌞 Rise and shine, beautiful souls! Today's a ...",en,-1,
4,Create a Python script that uses the Azure Blo...,To create a Python script that uses the Azure ...,en,-1,
...,...,...,...,...,...
4995,Write a sentence that about [Bacon Explosion C...,"The Bacon Explosion, a mouthwatering dish orig...",en,-1,
4996,Write a Ruby script that parses an XML documen...,"To parse an XML document in Ruby, you can use ...",en,-1,
4997,The Office Chair Designed to Restore Your Focu...,1. Modern work environment problems: Workers a...,en,-1,
4998,Determine the eigenvalues and eigenvectors of ...,"To find the eigenvalues, we need to solve the ...",en,-1,


## Translating the Split

In [24]:
from classes.finetuning.translator import Translator

translator = Translator()

df_translated = translator.translate(df_split[0])
df_translated

Searching backup in: ['/workspaces/datamanagement_project/backups/finetuning/0122_5000_translated.pkl']
Backup found: /workspaces/datamanagement_project/backups/finetuning/0122_5000_translated.pkl
Backup loaded: /workspaces/datamanagement_project/backups/finetuning/0122_5000_translated.pkl


Unnamed: 0,question,answer,language,accuracy,acc_explanation
5000,"Se getti una pietra rossa nel mare blu, cosa d...",Bagnato o sommerso.,it,-1,
5001,"Dada la definición de la tarea y los aportes, ...","Para llegar a la respuesta, analicé el comenta...",es,-1,
5002,Question : Qui tue Maléfique ? S'il n'y a pas ...,Informations insuffisantes pour apporter une r...,fr,-1,
5003,Ti viene fornita una dichiarazione scritta in ...,ಬಹಾಮಾಸ್,it,-1,
5004,Responda la siguiente pregunta: Estoy haciendo...,C: Mucha gente tiene una idea errónea de la re...,es,-1,
...,...,...,...,...,...
9995,Develop a C++ program that calculates the fact...,Here's a simple C++ program that calculates th...,it,-1,
9996,Describe el proceso creativo detrás del diseño...,"El diseño de la portada del álbum ""Dark Side o...",es,-1,
9997,"au départ, il y avait 32 indicateurs de réussi...","Initialement, il y avait 32 indicateurs de réu...",fr,-1,
9998,Entri in una stanza con un fiammifero. C'è una...,Accendi prima il fiammifero.\n\nEsempio 9:,it,-1,


## Paper implementation

### Evaluator LLM Setup

In [27]:
from classes.llm.evaluator import Evaluator
from classes.llm.gemini import Gemini

evaluator = Evaluator()
gemini = Gemini()

df_to_evaluate = pd.concat([df_split[0], df_translated])

Searching backup in: ['/workspaces/datamanagement_project/backups/finetuning/0122_10000_evaluated.pkl']
Backup found: /workspaces/datamanagement_project/backups/finetuning/0122_10000_evaluated.pkl
Backup loaded: /workspaces/datamanagement_project/backups/finetuning/0122_10000_evaluated.pkl


In [28]:
if evaluator.df_to_evaluate is not None or evaluator.df_to_evaluate.empty:
    df_to_evaluate = evaluator.df_to_evaluate
    
df_to_evaluate

Unnamed: 0,question,answer,language,accuracy,acc_explanation
0,"On the Origin of Species:\n\nBy the mid-1870s,...",Saltationism means that new species were thoug...,en,5,\nExplanation:\nThe response accurately define...
1,Read the text and answer the questions.\n\nWar...,1. The music assets were sold in 2003.\n2. The...,en,5,Explanation: The response accurately answers b...
2,"At a playground, there are four children: Alic...",The children think that a dog ran off with bot...,en,5,\nExplanation:\nThe response accurately reflec...
3,Compile a list of 19 words that commence with ...,1. Reassemble\n2. Reassure\n3. Rebuild\n4. Rec...,en,5,\nExplanation: The AI assistant's response is ...
4,Here is some data about a restaurant: name = A...,Step 1: Identify the given information - In th...,en,5,\nExplanation:\nThe response accurately follow...
...,...,...,...,...,...
9995,Develop a C++ program that calculates the fact...,Here's a simple C++ program that calculates th...,it,5,\nExplanation:\nThe response accurately addres...
9996,Describe el proceso creativo detrás del diseño...,"El diseño de la portada del álbum ""Dark Side o...",es,5,\nExplanation:\nThe response accurately and co...
9997,"au départ, il y avait 32 indicateurs de réussi...","Initialement, il y avait 32 indicateurs de réu...",fr,5,Explanation: The response is accurate and comp...
9998,Entri in una stanza con un fiammifero. C'è una...,Accendi prima il fiammifero.\n\nEsempio 9:,it,5,\nExplanation:\nThe response is accurate and c...


In [29]:
df_to_evaluate =  evaluator.evaluate(df_to_evaluate, gemini, evaluator)
df_to_evaluate

10000it [00:00, 21047.78it/s]


Unnamed: 0,question,answer,language,accuracy,acc_explanation
0,"On the Origin of Species:\n\nBy the mid-1870s,...",Saltationism means that new species were thoug...,en,5,\nExplanation:\nThe response accurately define...
1,Read the text and answer the questions.\n\nWar...,1. The music assets were sold in 2003.\n2. The...,en,5,Explanation: The response accurately answers b...
2,"At a playground, there are four children: Alic...",The children think that a dog ran off with bot...,en,5,\nExplanation:\nThe response accurately reflec...
3,Compile a list of 19 words that commence with ...,1. Reassemble\n2. Reassure\n3. Rebuild\n4. Rec...,en,5,\nExplanation: The AI assistant's response is ...
4,Here is some data about a restaurant: name = A...,Step 1: Identify the given information - In th...,en,5,\nExplanation:\nThe response accurately follow...
...,...,...,...,...,...
9995,Develop a C++ program that calculates the fact...,Here's a simple C++ program that calculates th...,it,5,\nExplanation:\nThe response accurately addres...
9996,Describe el proceso creativo detrás del diseño...,"El diseño de la portada del álbum ""Dark Side o...",es,5,\nExplanation:\nThe response accurately and co...
9997,"au départ, il y avait 32 indicateurs de réussi...","Initialement, il y avait 32 indicateurs de réu...",fr,5,Explanation: The response is accurate and comp...
9998,Entri in una stanza con un fiammifero. C'è una...,Accendi prima il fiammifero.\n\nEsempio 9:,it,5,\nExplanation:\nThe response is accurate and c...


In [30]:
from classes.database import DatabaseHandler

db_handler = DatabaseHandler()

In [None]:
# db_handler.insert_data(to_save_dict)