In [None]:
import huggingface_hub
from datasets import load_dataset

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

import os
import wandb
# Set CUDA_VISIBLE_DEVICES to '2' for this process; by CUDA convention this
# restricts CUDA-based libraries to the GPU with that index.
# NOTE(review): none of the code visible in this notebook uses a GPU (the probe
# is a scikit-learn LogisticRegression) — confirm whether this is still needed.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [None]:
# Start a Weights & Biases run in the 'logistic_probes_activations' project.
# run_logistic_regression writes each model's accuracy into wandb.run.summary,
# so this cell has to run before the probes are trained.
wandb.init(project='logistic_probes_activations')

In [None]:
# Models whose activation datasets can be probed.
model_names = [
    'pythia_70m',
    'pythia_160m',
    'gpt_neo_125m',
    # 'gemma_2b_it',  # currently disabled
]

# Which model variant and task the activation datasets were produced from.
model_type = 'rlhf'
tasks = ['unaligned', 'hh_rlhf']
task_name = tasks[1]  # i.e. 'hh_rlhf'

# Path template of an activations dataset; fill it in with
# template_name.format(model_name=..., model_type=..., task_name=...).
template_name = 'data/merged_contrastive_{model_name}_from_model_{model_type}_on_task_{task_name}_activations_dataset.hf'

In [None]:
from datasets import DatasetDict
from huggingface_hub import HfApi, hf_hub_download
from reward_analyzer.configs.project_configs import HuggingfaceConfig

import os
import pickle
import shutil

In [None]:
# Convert the dictionary to DataFrame
def dict_to_df(data_dict, label):
    """Build a DataFrame from ``data_dict`` and tag every row with ``label``.

    Args:
        data_dict: Anything ``pd.DataFrame`` accepts, e.g. a mapping of column
            name to a list of values.
        label: Value stored in the ``label`` column of every row.

    Returns:
        A new DataFrame holding the data plus a constant ``label`` column.
    """
    print(f'Converting {label} to pandas')
    return pd.DataFrame(data_dict).assign(label=label)

In [None]:
def download_folder_from_hub(folder_path: str, config=None):
    """Copy every hub file whose path starts with ``folder_path`` under the cwd.

    Each matching file of the repo ``config.repo_id`` is fetched with
    ``hf_hub_download`` (always re-downloaded, ``force_download=True``) and then
    copied to ``<cwd>/<file's directory inside the repo>/``, recreating the
    repo's directory layout locally.

    Args:
        folder_path: Path prefix inside the repo. Matching is a plain string
            prefix test, so sibling paths sharing the prefix are included too.
        config: Object exposing a ``repo_id`` attribute. When ``None`` (the
            default) a fresh ``HuggingfaceConfig()`` is created for this call.

    Returns:
        None. If no file matches, a message is printed and nothing is copied.
    """
    if config is None:
        # Built per call instead of as a default argument, so the config is not
        # instantiated once at definition time and shared by every call.
        config = HuggingfaceConfig()

    api = HfApi()
    repo_id = config.repo_id
    folder_contents = [
        file for file in api.list_repo_files(repo_id) if file.startswith(folder_path)
    ]
    print(folder_contents)

    if not folder_contents:
        # Make the empty case visible; otherwise the later load-from-disk step
        # is the first place the problem shows up.
        print(f'No files found in {repo_id} under {folder_path}')
        return

    for filename in folder_contents:
        print(filename)
        filepath = hf_hub_download(repo_id=repo_id, filename=filename, force_download=True)

        # Repo paths always use "/" as separator; drop the file name to get
        # the directory to mirror locally.
        download_dir = "/".join(filename.split("/")[:-1])
        print(f'{download_dir} for {filename}')

        download_dir = os.path.join(os.getcwd(), download_dir)

        # Ensure the directory exists before copying out of the download cache
        os.makedirs(download_dir, exist_ok=True)
        shutil.copy(filepath, download_dir)

In [None]:
def run_logistic_regression(model_name, random_state=42):
    """Train and evaluate a chosen-vs-rejected logistic probe for one model.

    Downloads the model's activations dataset from the hub, loads it from disk,
    builds one feature vector per text by concatenating all
    ``averaged_activations_*`` columns, fits a ``LogisticRegression`` on an
    80/20 train/test split, pickles the fitted model to
    ``logistic_regression_{model_name}.pickle`` in the working directory and,
    if a wandb run is active, stores the test accuracy in its summary.

    Args:
        model_name: Model identifier substituted into ``template_name``.
        random_state: Seed for the train/test split, so results are
            reproducible between runs. Pass ``None`` for a random split.

    Returns:
        ``{'accuracy': float, 'report': str}`` on success, or
        ``{'error': str}`` if anything after loading the dataset fails.
        Errors while downloading or loading the dataset are not caught.
    """
    one_template = template_name.format(model_name=model_name, model_type='rlhf', task_name=tasks[1])
    print(one_template)
    download_folder_from_hub(one_template)
    dataset_dict = DatasetDict.load_from_disk(one_template)

    try:
        chosen_df = dict_to_df(dataset_dict['chosen'], 'chosen')
        rejected_df = dict_to_df(dataset_dict['new_rejected'], 'new_rejected')

        # Concatenate the chosen and rejected DataFrames. ignore_index gives
        # every row a unique label instead of two overlapping 0..n ranges.
        df = pd.concat([chosen_df, rejected_df], ignore_index=True)
        # Keeps the first occurrence of each text (chosen rows come first).
        df = df.drop_duplicates(subset=['texts'])

        print(df.columns)

        activation_columns = [col for col in df.columns if col.startswith('averaged_activations_')]
        if not activation_columns:
            raise ValueError(f'No averaged_activations_* columns found for {model_name}')

        print(f'We have {len(df)} rows left after dropping duplicate texts')

        # One flat feature vector per row: all activation columns end to end.
        df['concatenated_activations'] = df[activation_columns].apply(
            lambda row: np.concatenate(row.values), axis=1
        )

        # Split the data into features and labels (1 = chosen, 0 = rejected)
        X = np.vstack(df['concatenated_activations'].values)
        y = (df['label'] == 'chosen').astype(int).to_numpy()

        # Split into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_state
        )
        print(f'X_train is of shape {X_train.shape}')

        # Train logistic regression model
        model = LogisticRegression()
        model.fit(X_train, y_train)

        # pickle.dump needs the open file as its second argument; without it
        # the call raises TypeError and the probe is never saved or evaluated.
        with open(f'logistic_regression_{model_name}.pickle', 'wb') as f_out:
            pickle.dump(model, f_out)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Evaluate the model
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        print(f'Accuracy was {accuracy} for {model_name}')

        # Only log when a run exists, so a missing wandb.init() does not
        # throw away the metrics that were just computed.
        if wandb.run is not None:
            wandb.run.summary[f'{model_name}_accuracy'] = accuracy

        return {'accuracy': accuracy, 'report': report}
    except Exception as e:
        # Best effort: report the failure to the caller instead of aborting
        # the loop over the remaining models.
        return {'error': str(e)}


In [None]:
# Run the logistic probe for every model and print what each run returns.
# NOTE(review): model_names is re-declared here with the same three entries as
# in the configuration cell above; keep the two lists in sync or drop one.
model_names = ['pythia_70m', 'pythia_160m', 'gpt_neo_125m']
for model_name in model_names:
    # `results` is overwritten on each iteration; only the printed line keeps
    # the outcome for earlier models.
    results = run_logistic_regression(model_name)
    print(f'Results were {results}')