In [2]:
import pandas as pd
import matplotlib.pyplot as plt

## Fine-Tuning LLaVA on the RMET

This notebook analyses the results of fine-tuning the LLaVA model on the Reading the Mind in the Eyes Test (RMET). We compare five versions of the model: the base model and four fine-tuned models trained for 1, 5, 7, and 10 epochs. Each version was evaluated in several independent runs.

### 1 RMET data

#### 1.1 Loading and checking the data

In [94]:
def _read_run(tag):
    """Read the results file of a single evaluation run, e.g. tag='base-1'."""
    return pd.read_csv(f'rmet_results/rmet_{tag}.txt')


# One results file per run: 5 runs each for the base, 1-epoch and 5-epoch
# models, 3 runs for the 7-epoch model and 2 runs for the 10-epoch model.
base1, base2, base3, base4, base5 = (_read_run(f'base-{run}') for run in range(1, 6))
ep1_1, ep1_2, ep1_3, ep1_4, ep1_5 = (_read_run(f'1ep-{run}') for run in range(1, 6))
ep5_1, ep5_2, ep5_3, ep5_4, ep5_5 = (_read_run(f'5ep-{run}') for run in range(1, 6))
ep7_1, ep7_2, ep7_3 = (_read_run(f'7ep-{run}') for run in range(1, 4))
ep10_1, ep10_2 = (_read_run(f'10ep-{run}') for run in range(1, 3))
# The third 10-epoch run is currently not loaded:
# ep10_3 = pd.read_csv('rmet_results/rmet_10ep-3.txt')

In [95]:
# Put all runs side by side (column-wise concatenation), in the order
# base -> 1 epoch -> 5 epochs -> 7 epochs -> 10 epochs.
run_frames = [
    base1, base2, base3, base4, base5,
    ep1_1, ep1_2, ep1_3, ep1_4, ep1_5,
    ep5_1, ep5_2, ep5_3, ep5_4, ep5_5,
    ep7_1, ep7_2, ep7_3,
    ep10_1, ep10_2,
]
rmet = pd.concat(run_frames, axis='columns')

### 2 Task performance


#### 2.1 Load answers

In [96]:
# Answer key: one entry per line of the file, with the trailing newline and any
# surrounding whitespace stripped.
answers_file = 'rmet_materials/answers.txt'
with open(answers_file, 'r') as file:
    answers = list(map(str.strip, file))

In [97]:
# Sanity check: preview the first ten entries of the answer key.
answers[:10]

['playful',
 'upset',
 'desire',
 'insisting',
 'worried',
 'fantasizing',
 'uneasy',
 'despondent',
 'preoccupied',
 'cautious']

#### 2.2 Check responses

In [98]:
# Start from an independent (deep) copy so that the raw responses in `rmet`
# are never modified by the scoring step.
performance = rmet.copy(deep=True)

In [99]:
# Score every response against the answer key: 1 = exact string match, 0 = otherwise.
#
# The scores are computed from the raw `rmet` responses rather than from
# `performance` itself. Reading from and writing to `performance` in the same
# cell made it non-idempotent: a second run compared the already-scored 0/1
# values with the answer strings and silently set every score to 0.
#
# Building the key on `rmet.index` raises a ValueError if the number of answers
# does not match the number of response rows, instead of failing mid-loop.
answer_key = pd.Series(answers, index=rmet.index)
performance = rmet.eq(answer_key, axis=0).astype('int64')

#### 2.3 Calculate number correct

In [100]:
# Summing the 0/1 scores down each column gives the number of correct answers
# per run.
num_correct = performance.sum(axis='index')

In [101]:
# Display the number of correct answers for each run.
num_correct

llava_base-1    17
llava_base-2    20
llava_base-3    17
llava_base-4    20
llava_base-5    20
llava_1ep-1     22
llava_1ep-2     20
llava_1ep-3     21
llava_1ep-4     21
llava_1ep-5     21
llava_5ep-1     20
llava_5ep-2     23
llava_5ep-3     21
llava_5ep-4     22
llava_5ep-5     21
llava_7ep-1     22
llava_7ep-2     19
llava_7ep-3     22
llava_10ep-1    22
llava_10ep-2    21
dtype: int64

#### 2.4 Calculate Performance Increase (from base model)

In [102]:
# Turn the per-run totals into a one-column table (index = run label) so that
# further per-run columns can be added next to the score.
increase_performance = num_correct.to_frame(name='num_correct')

In [103]:
# Group runs by model type: keep the part of each run label before the first
# '-', e.g. 'llava_1ep-3' -> 'llava_1ep'.
increase_performance['model'] = [
    run_label.partition('-')[0] for run_label in increase_performance.index
]

In [104]:
# Average the number of correct answers over the repeated runs of each model.
scores_by_model = increase_performance.groupby('model')['num_correct']
model_performance = scores_by_model.mean()

In [105]:
# Per-model mean score as a flat table with the columns 'model' and
# 'num_correct'.
#
# The table is rebuilt from `increase_performance` instead of calling
# `reset_index()` on `model_performance` itself: the self-referential version
# only works once, because a second run adds an extra 'index' column and the
# two-name `columns` assignment then fails with a length mismatch.
model_performance = (
    increase_performance
    .groupby('model', as_index=False)['num_correct']
    .mean()
)

In [106]:
# Display the mean number of correct answers per model.
model_performance

Unnamed: 0,model,num_correct
0,llava_10ep,21.5
1,llava_1ep,21.0
2,llava_5ep,21.4
3,llava_7ep,21.0
4,llava_base,18.8


In [92]:
# Convert the mean scores to accuracy and express each model's accuracy
# relative to the base model.
RMET_NUM_ITEMS = 36        # number of scored items the mean score is divided by
BASE_MODEL = 'llava_base'  # label of the baseline row in the 'model' column

# Despite its name, 'percent' is a proportion in [0, 1], not a value in [0, 100].
model_performance['percent'] = model_performance['num_correct'] / RMET_NUM_ITEMS

# Look the baseline up by model name rather than by position (iloc[4, 2]):
# the positional lookup silently picks the wrong cell as soon as a model is
# added or removed or the column order changes. `.iloc[0]` raises an IndexError
# if the baseline row is missing.
is_base = model_performance['model'] == BASE_MODEL
base_percent = model_performance.loc[is_base, 'percent'].iloc[0]
model_performance['improvement'] = model_performance['percent'] - base_percent

In [93]:
# Display the per-model table again, now including the 'percent' and
# 'improvement' columns.
model_performance

Unnamed: 0,model,num_correct,percent,improvement
0,llava_10ep,21.5,0.597222,0.075
1,llava_1ep,21.0,0.583333,0.061111
2,llava_5ep,21.4,0.594444,0.072222
3,llava_7ep,21.666667,0.601852,0.07963
4,llava_base,18.8,0.522222,0.0
