## Processing datasets

This notebook prepares the data for LoRA vs full finetuning analysis.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
import textwrap
import random 
import ast
import re
from tqdm import tqdm
import os
tqdm.pandas()

In [2]:
# Function to extract context length from df 

def context_processing(x):
    context_ids, token = x["in_token_ids"], x["curr_token_id"]
    context = ast.literal_eval(re.split(r'tensor\(|\].*', context_ids)[1] + ']')
    x["context_len"] = len(context)
    x["token_in_context"] = int(token in context)
    x["uniq_ctxt_tkns_count"] = len(set(context))
    return x
    
def get_context(x):
    return ast.literal_eval(re.split(r'tensor\(|\].*', x)[1] + ']')

In [3]:
lora_model_rank = 16

In [5]:
lora_path = f"../results/pythia-1.4b/lora/r_{lora_model_rank}/lr_2e-4/early_stopping/num_train_4096/bsize_128/seed_1/tkn_freq_probs_best.csv"
lora_df = pd.read_csv(lora_path)

full_path = f"../results/pythia-1.4b/full-ft/lr_2e-6/early_stopping/num_train_4096/bsize_128/tkn_freq_probs_best.csv"
full_df = pd.read_csv(full_path)

base_path = f"../results/pythia-1.4b/base_model/num_train_4096/tkn_freq_probs_base.csv"
base_df = pd.read_csv(base_path)

In [8]:
lora_df = lora_df.apply(context_processing, axis=1)
full_df = full_df.apply(context_processing, axis=1)

In [12]:
lora_df = lora_df.rename(columns={
    "curr_token_prob": "lora_prob", 
    "curr_token_rank": "lora_rank", 
    "top_k_pred_tokens": "lora_top_k_pred_tokens", 
    "top_k_pred_probs": "lora_top_k_pred_probs"
    })
full_df = full_df.rename(columns={
    "curr_token_prob": "full_prob", 
    "curr_token_rank": "full_rank", 
    "top_k_pred_tokens": "full_top_k_pred_tokens", 
    "top_k_pred_probs": "full_top_k_pred_probs"
    })
base_df = base_df.rename(columns={
    "curr_token_prob": "base_prob", 
    "curr_token_rank": "base_rank", 
    "top_k_pred_tokens": "base_top_k_pred_tokens", 
    "top_k_pred_probs": "base_top_k_pred_probs"
    })

In [4]:
lora_df_dir = f"results/data/lora_model/rank/r_{lora_model_rank}/finetuning_data"
base_df_dir = f"results/data/base_model/finetuning_data"
full_df_dir = f"results/data/full_model/finetuning_data"

In [7]:
os.makedirs(lora_df_dir, exist_ok=True)
lora_save_path = os.path.join(lora_df_dir, "lora_data_probs.csv")

os.makedirs(base_df_dir, exist_ok=True)
base_save_path = os.path.join(base_df_dir, "base_data_probs.csv")

os.makedirs(full_df_dir, exist_ok=True)
full_save_path = os.path.join(full_df_dir, "full_data_probs.csv")

In [None]:
# Save the processed data
lora_df.to_csv(lora_save_path, index=False)
full_df.to_csv(full_save_path, index=False)
base_df.to_csv(base_save_path, index=False)

In [29]:
# Load the processed data
lora_df = pd.read_csv(lora_save_path)
full_df = pd.read_csv(full_save_path)
base_df = pd.read_csv(base_save_path)

In [30]:
# Function to convert string representation of np array to array
def parse_array_string(array_str):
    try:
        cleaned_str = '[' + ', '.join(array_str.strip('[]').split()) + ']'
        return ast.literal_eval(cleaned_str)
    except:
        return []  # Return empty array if parsing fails

# Convert array columns - adjust column names as needed
array_columns = ['lora_top_k_pred_probs', 'full_top_k_pred_probs', 'base_top_k_pred_probs', 'lora_top_k_pred_tokens', 'full_top_k_pred_tokens', 'base_top_k_pred_tokens']
for df in [lora_df, full_df, base_df]:
    for col in array_columns:
        if col in df.columns:
            df[col] = df[col].apply(parse_array_string)

In [31]:
common_cols = set(lora_df.columns).intersection(set(full_df.columns))
full_df.drop(common_cols, axis=1, inplace=True)
common_cols = common_cols.intersection(set(base_df.columns))
base_df.drop(common_cols, axis=1, inplace=True)
df_combined = pd.concat([lora_df, full_df, base_df], axis=1)
df_combined["full_lora_diff"] = df_combined.full_prob - df_combined.lora_prob
df_combined["lora_base_diff"] = df_combined.lora_prob - df_combined.base_prob
df_combined["full_base_diff"] = df_combined.full_prob - df_combined.base_prob

df_combined["full_lora_tkn_rank_diff"] = df_combined.full_rank - df_combined.lora_rank
df_combined["lora_base_tkn_rank_diff"] = df_combined.lora_rank - df_combined.base_rank
df_combined["full_base_tkn_rank_diff"] = df_combined.full_rank - df_combined.base_rank

In [32]:
sorted_diffs = df_combined.sort_values("full_lora_diff")
df_selected = sorted_diffs[["seq_id", "in_tokens", "context_len", "token_in_context", "uniq_ctxt_tkns_count", "prev_token", "curr_token", "pmi", "curr_token_freq", "prev_token_freq", "pair_token_freq", "lora_prob", "full_prob", "base_prob", "full_lora_diff", "lora_base_diff", "full_base_diff", "lora_rank", "full_rank", "base_rank", "lora_top_k_pred_tokens", "full_top_k_pred_tokens", "base_top_k_pred_tokens", "lora_top_k_pred_probs", "full_top_k_pred_probs", "base_top_k_pred_probs", "full_lora_tkn_rank_diff", "lora_base_tkn_rank_diff", "full_base_tkn_rank_diff"]]
finetune_df = df_selected
finetune_df = finetune_df.dropna().reset_index(drop=True)
df_selected = df_selected[(df_selected["pmi"] > -4.75) & (df_selected["pmi"] < -3.75)]
# find unique pairs of w_{i-1}, w_i
agg_fns = {c: 'first' for c in df_selected.columns if c not in ["curr_token"]}
agg_fns["full_lora_diff"] = "min"
df_uniq = df_selected.groupby(["curr_token"]).agg(agg_fns).reset_index().sort_values("full_lora_diff")

top_100 = df_uniq[:100]
top_100.loc[:, ["seq_id", "pmi", "curr_token_freq", "lora_prob", "full_prob", "base_prob", "full_lora_diff", "lora_base_diff", "full_base_diff"]] = top_100[["seq_id", "pmi", "curr_token_freq", "lora_prob", "full_prob", "base_prob", "full_lora_diff", "lora_base_diff", "full_base_diff"]].round(3)
top_100.to_csv(f"results/examples_ft_pt/rank/r_{lora_model_rank}/finetune_data_100.csv", index=False)
top_100.head()

Unnamed: 0,curr_token,seq_id,in_tokens,context_len,token_in_context,uniq_ctxt_tkns_count,prev_token,pmi,curr_token_freq,prev_token_freq,...,base_rank,lora_top_k_pred_tokens,full_top_k_pred_tokens,base_top_k_pred_tokens,lora_top_k_pred_probs,full_top_k_pred_probs,base_top_k_pred_probs,full_lora_tkn_rank_diff,lora_base_tkn_rank_diff,full_base_tkn_rank_diff
1181,Ġ@,3747,Mogadishu University ( MU ) is a non,12,0,12,Ġnon,-4.023,5438,55,...,32569,"[1214, 1108, 428, 24715, 11528, 11073, 5702, 1...","[14, 1108, 11528, 24715, 1214, 428, 2208, 2003...","[14, 11528, 24715, 2208, 2003, 3250, 7338, 374...","[0.962, 0.019, 0.003, 0.001, 0.001, 0.001, 0.0...","[0.598, 0.032, 0.031, 0.031, 0.022, 0.022, 0.0...","[0.911, 0.026, 0.011, 0.004, 0.004, 0.003, 0.0...",4,-32568,-32564
2,.,2708,Ceratopsia or Ceratopia ( / ˌsɛrəˈtɒpsiə / or...,116,0,83,Ġ@,-3.831,1822,5438,...,5,"[15, 14, 13, 1253, 8634, 283, 21549, 15770, 42...","[13, 15, 14, 1157, 904, 23659, 84, 15770, 1939...","[285, 495, 608, 281, 15, 14, 1253, 428, 374, 933]","[0.946, 0.049, 0.003, 0.001, 0.0, 0.0, 0.0, 0....","[0.823, 0.122, 0.052, 0.001, 0.001, 0.0, 0.0, ...","[0.074, 0.015, 0.014, 0.011, 0.011, 0.01, 0.01...",1,-4,-3
1107,Ġ5,2652,"After the war, the State Government negotiate...",36,0,32,Ġ(,-3.927,445,2660,...,33,"[608, 577, 8073, 8676, 2456, 8255, 8319, 470, ...","[2456, 8319, 8073, 608, 8676, 7584, 8255, 5693...","[2666, 18, 23, 19, 9887, 24, 25, 21, 22, 17]","[0.912, 0.013, 0.009, 0.009, 0.007, 0.007, 0.0...","[0.256, 0.176, 0.129, 0.128, 0.073, 0.061, 0.0...","[0.03, 0.022, 0.02, 0.019, 0.018, 0.016, 0.015...",3,-32,-29
763,Ġ',2397,""" Hollywood "" was released as the album",8,0,7,Ġalbum,-4.215,4257,468,...,69,"[686, 256, 1214, 346, 434, 4060, 285, 11386, 2...","[434, 686, 346, 4060, 285, 11386, 369, 1214, 2...","[434, 4060, 457, 11386, 273, 369, 37835, 3540,...","[0.878, 0.04, 0.014, 0.013, 0.009, 0.005, 0.00...","[0.499, 0.114, 0.062, 0.038, 0.033, 0.03, 0.02...","[0.889, 0.015, 0.008, 0.008, 0.007, 0.005, 0.0...",1,-68,-67
699,us,1551,"Jason Sudeikis, who played Floyd in this epis...",108,0,81,f,-4.593,285,141,...,2,"[316, 1316, 528, 26407, 899, 20191, 3016, 3635...","[1316, 316, 528, 20191, 26407, 375, 265, 2327,...","[1316, 316, 528, 13, 20191, 5123, 47291, 375, ...","[0.952, 0.046, 0.001, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.799, 0.193, 0.005, 0.001, 0.0, 0.0, 0.0, 0....","[0.857, 0.127, 0.004, 0.001, 0.001, 0.001, 0.0...",1,-1,0


In [33]:
finetune_df.to_csv(f"results/data/lora_model/rank/r_{lora_model_rank}/finetuning_data/finetune_data_probs.csv", index=False)

## Pretraining examples

In [15]:
lora_path = "../results/pythia-1.4b/lora/r_16/lr_2e-4/early_stopping/pretraining/tkn_freq_probs_best.csv"
lora_df = pd.read_csv(lora_path)
full_path = "../results/pythia-1.4b/full-ft/lr_2e-6/early_stopping/pretraining/tkn_freq_probs_best.csv"
full_df = pd.read_csv(full_path)
base_path = "../results/pythia-1.4b/base_model/pretraining/tkn_freq_probs_base.csv"
base_df = pd.read_csv(base_path)

In [16]:
lora_df["in_token_ids"]

0                          tensor([1413], dtype=torch.int32)
1                    tensor([1413,   27], dtype=torch.int32)
2              tensor([1413,   27,   49], dtype=torch.int32)
3          tensor([1413,   27,   49,  363], dtype=torch.i...
4          tensor([1413,   27,   49,  363, 2721], dtype=t...
                                 ...                        
2097148    tensor([  273, 32212,   267,  6943, 18334,  11...
2097149    tensor([32212,   267,  6943, 18334,  1119, 313...
2097150    tensor([  267,  6943, 18334,  1119, 31388,    ...
2097151    tensor([ 6943, 18334,  1119, 31388,    71,   5...
2097152    tensor([18334,  1119, 31388,    71,   579,   9...
Name: in_token_ids, Length: 2097153, dtype: object

In [42]:
lora_df = lora_df.dropna().reset_index(drop=True)
cp = ContextProcessor(lora_df)
lora_df["context_len"]= cp.get_context_len()
lora_df["token_in_context"]= cp.is_token_in_context()
lora_df["uniq_ctxt_tkns_count"]= cp.uniq_ctxt_tkns_count()

In [43]:
full_df = full_df.dropna().reset_index(drop=True)
cp = ContextProcessor(full_df)
full_df["context_len"]= cp.get_context_len()
full_df["token_in_context"]= cp.is_token_in_context()
full_df["uniq_ctxt_tkns_count"]= cp.uniq_ctxt_tkns_count()

 53%|█████▎    | 1111028/2096781 [03:07<02:40, 6136.78it/s]IOStream.flush timed out
100%|██████████| 2096781/2096781 [06:02<00:00, 5792.12it/s] 


In [44]:
lora_df = lora_df.rename(columns={"curr_token_prob": "lora_prob"})
full_df = full_df.rename(columns={"curr_token_prob": "full_prob"})
base_df = base_df.rename(columns={"curr_token_prob": "base_prob"})
common_cols = set(lora_df.columns).intersection(set(full_df.columns))
full_df.drop(common_cols, axis=1, inplace=True)
common_cols = common_cols.intersection(set(base_df.columns))
base_df.drop(common_cols, axis=1, inplace=True)
df_combined = pd.concat([lora_df, full_df, base_df], axis=1)
df_combined["full_lora_diff"] = df_combined.full_prob - df_combined.lora_prob
df_combined["lora_base_diff"] = df_combined.lora_prob - df_combined.base_prob
df_combined["full_base_diff"] = df_combined.full_prob - df_combined.base_prob

In [45]:
sorted_diffs = df_combined.sort_values("full_lora_diff", ascending=False)
df_selected = sorted_diffs[["in_tokens", "context_len", "token_in_context", "uniq_ctxt_tkns_count", "prev_token", "curr_token", "pmi", "curr_token_freq", "prev_token_freq", "pair_token_freq", "lora_prob", "full_prob", "base_prob", "full_lora_diff", "lora_base_diff", "full_base_diff"]]
pretrain_df = df_selected
pretrain_df = pretrain_df.dropna().reset_index(drop=True)

In [46]:
df_selected = pretrain_df[(pretrain_df["pmi"] > -5) & (pretrain_df["pmi"] < -4)]

# find unique pairs of w_{i-1}, w_i
agg_fns = {c: 'first' for c in pretrain_df.columns if c not in ["curr_token"]}
agg_fns["full_lora_diff"] = "max"
df_uniq = pretrain_df.groupby(["curr_token"]).agg(agg_fns).reset_index().sort_values("full_lora_diff", ascending=False)

top_100 = df_uniq[:100]

top_100.loc[:, ["pmi", "curr_token_freq", "lora_prob", "full_prob", "base_prob", "full_lora_diff", "lora_base_diff", "full_base_diff"]] = top_100[["pmi", "curr_token_freq", "lora_prob", "full_prob", "base_prob", "full_lora_diff", "lora_base_diff", "full_base_diff"]].round(3)
top_100.to_csv("results/examples_ft_pt/pretrain_data_100.csv", index=False)
top_100.head()

Unnamed: 0,curr_token,in_tokens,context_len,token_in_context,uniq_ctxt_tkns_count,prev_token,pmi,curr_token_freq,prev_token_freq,pair_token_freq,lora_prob,full_prob,base_prob,full_lora_diff,lora_base_diff,full_base_diff
4983,PEC,and no diagonal that accepted 2 inch eyepiece...,129.0,0.0,102.0,Ċ,-4.907,27.0,79130.0,999.0,0.0,1.0,0.949,1.0,-0.949,0.051
566,-,iance occurred when light passed from air to w...,129.0,1.0,74.0,ref,-4.013,22036.0,1195.0,30106.0,0.0,1.0,0.015,1.0,-0.015,0.985
966,107,2) 0.0005 (15) 0.0052 (16) 0.0093 (1...,129.0,0.0,38.0,.,-4.589,36.0,72634.0,1680.0,0.0,1.0,1.0,1.0,-1.0,-0.0
10475,gre,outrun the hare.\n\nIn the night\n\nhis eyes ...,67.0,0.0,44.0,Ċ,-4.6,22.0,79130.0,1106.0,0.0,1.0,0.198,1.0,-0.198,0.802
9333,edge,_Tapiola_\n\nHe is no more dead than Finland ...,129.0,0.0,86.0,Ċ,-4.274,22.0,79130.0,1533.0,0.0,1.0,0.677,1.0,-0.677,0.323


### Merge pretrain/finetune data with finetune/pretrain token frequencies

In [47]:
pt_data_stats = pretrain_df[["prev_token", "curr_token", "curr_token_freq", "prev_token_freq", "pair_token_freq", "pmi"]]
ft_data_stats = finetune_df[["prev_token", "curr_token", "curr_token_freq", "prev_token_freq", "pair_token_freq", "pmi"]]

pt_data_stats = pt_data_stats.add_prefix('pt_').drop_duplicates().reset_index(drop=True)
pt_data_stats = pt_data_stats.rename(columns={"pt_prev_token": "prev_token", "pt_curr_token": "curr_token"})
ft_data_stats = ft_data_stats.add_prefix('ft_').drop_duplicates().reset_index(drop=True)
ft_data_stats = ft_data_stats.rename(columns={"ft_prev_token": "prev_token", "ft_curr_token": "curr_token"})

In [48]:
pt_pairs = pt_data_stats[["curr_token", "prev_token", "pt_pmi"]]
ft_pairs = ft_data_stats[["curr_token", "prev_token", "ft_pmi"]]

common_pairs = pd.merge(pt_pairs, ft_pairs, on=["curr_token", "prev_token"])

In [49]:
pt_token_freq = pt_data_stats[["curr_token", "pt_curr_token_freq"]]
pt_token_freq = pt_token_freq.drop_duplicates().reset_index(drop=True)
finetune_df_merged = pd.merge(finetune_df, pt_token_freq, on=["curr_token"], how="left")
finetune_df_merged["pt_curr_token_freq"] = finetune_df_merged["pt_curr_token_freq"].fillna(0)
finetune_df_merged = finetune_df_merged.dropna().reset_index(drop=True)


ft_token_freq = ft_data_stats[["curr_token", "ft_curr_token_freq"]]
ft_token_freq = ft_token_freq.drop_duplicates().reset_index(drop=True)
pretrain_df_merged = pd.merge(pretrain_df, ft_token_freq, on=["curr_token"], how="left")
pretrain_df_merged["pt_curr_token_freq"] = pretrain_df_merged["ft_curr_token_freq"].fillna(0)
pretrain_df_merged = pretrain_df_merged.dropna().reset_index(drop=True)

In [50]:
finetune_df_merged.to_csv("results/data/finetune_data_probs.csv", index=False)
pretrain_df_merged.to_csv("results/data/pretrain_data_probs.csv", index=False)