In [3]:
import pandas as pd
# Read the test set and the submission file, then preview the submission.
test, submission = (
    pd.read_csv(csv_path)
    for csv_path in ('data/test.csv', 'data/submission.csv')
)
submission.head()

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,2979577167,0.254899,0.371648,0.373453
1,2979584770,0.315672,0.306331,0.377997
2,2979732971,0.264466,0.567645,0.167889
3,2979751183,0.500031,0.217677,0.282292
4,2979854448,0.335786,0.30198,0.362234


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the NLTK resources (punkt tokenizer data and the VADER lexicon)
for _resource in ('punkt', 'vader_lexicon'):
    nltk.download(_resource)

class LLMPreferenceAnalyzer:
    """Exploratory feature builder for pairwise LLM-preference data.

    Loads the train/test CSVs, derives length and VADER sentiment features
    for the two candidate responses, encodes the three one-hot winner columns
    as a single class label, and offers a stratified train/validation split
    plus a quick correlation report.
    """

    # Columns that are identifiers, raw text, or labels and therefore must
    # never be handed to a model as features.
    _NON_FEATURE_COLUMNS = (
        'id', 'prompt', 'response_a', 'response_b', 'model_a', 'model_b',
        'winner_model_a', 'winner_model_b', 'winner_tie', 'target',
    )

    def __init__(self, train_path='data/train.csv', test_path='data/test.csv'):
        """Read both CSV files and create the sentiment analyzer.

        Args:
            train_path: Path of the training CSV.
            test_path: Path of the test CSV.
        """
        self.train_df = pd.read_csv(train_path)
        self.test_df = pd.read_csv(test_path)
        self.sia = SentimentIntensityAnalyzer()

    def _response_features(self, response, model):
        """Return the feature dict for one response of model ``'a'`` or ``'b'``."""
        text = str(response)  # str() so missing values (NaN) do not crash len()/split()
        # Score the text once and reuse the result for all four sentiment
        # features instead of re-running the analyzer per key.
        scores = self.sia.polarity_scores(text)
        return {
            f'{model}_response_length': len(text),
            f'{model}_word_count': len(text.split()),
            f'{model}_sentiment_compound': scores['compound'],
            f'{model}_sentiment_pos': scores['pos'],
            f'{model}_sentiment_neg': scores['neg'],
            f'{model}_sentiment_neu': scores['neu'],
        }

    def _feature_frame(self, model):
        """Build the per-row feature frame for one model, aligned to ``train_df``."""
        column_names = [
            f'{model}_response_length',
            f'{model}_word_count',
            f'{model}_sentiment_compound',
            f'{model}_sentiment_pos',
            f'{model}_sentiment_neg',
            f'{model}_sentiment_neu',
        ]
        records = [
            self._response_features(response, model)
            for response in self.train_df[f'response_{model}']
        ]
        frame = pd.DataFrame(records, index=self.train_df.index, columns=column_names)
        # Keep every feature as float64, which is what the previous row-wise
        # apply/pd.Series construction produced.
        return frame.astype('float64')

    def preprocess_data(self):
        """Add text features and the ``target`` label to ``self.train_df``.

        The call is idempotent: feature/target columns left over from an
        earlier call are replaced rather than duplicated.

        Returns:
            The updated training DataFrame. ``target`` is 0 when model A won,
            1 when model B won, 2 for a tie, and NaN when no winner column
            equals 1.
        """
        feature_frames = [self._feature_frame(model) for model in ('a', 'b')]

        # Drop stale copies of the derived columns so a repeated call does not
        # produce duplicate column names.
        derived = [col for frame in feature_frames for col in frame.columns] + ['target']
        base = self.train_df.drop(columns=derived, errors='ignore')
        self.train_df = pd.concat([base, *feature_frames], axis=1)

        # Encode target variable; later labels overwrite earlier ones if a row
        # has more than one winner flag set.
        self.train_df['target'] = np.nan
        for label, winner_col in enumerate(('winner_model_a', 'winner_model_b', 'winner_tie')):
            self.train_df.loc[self.train_df[winner_col] == 1, 'target'] = label

        return self.train_df

    def split_data(self, test_size=0.2, random_state=42):
        """Split the processed training data into stratified train/validation sets.

        Args:
            test_size: Fraction of rows held out for validation.
            random_state: Seed forwarded to ``train_test_split``.

        Returns:
            ``(X_train, X_val, y_train, y_val)``.
        """
        features = [col for col in self.train_df.columns if col not in self._NON_FEATURE_COLUMNS]
        X = self.train_df[features]
        y = self.train_df['target']

        return train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)

    def explore_data(self):
        """Print the class balance and each engineered feature's correlation with ``target``."""
        print("Training Data Overview:")
        print(self.train_df['target'].value_counts(normalize=True))

        print("\nFeature Correlations with Target:")
        # Engineered features are the only columns prefixed with 'a_' / 'b_'.
        correlation_features = [col for col in self.train_df.columns if col.startswith(('a_', 'b_'))]
        print(self.train_df[correlation_features + ['target']].corr()['target'].sort_values(ascending=False))

# Build the analyzer, derive features/labels, split, then print the summary.
analyzer = LLMPreferenceAnalyzer(train_path='data/train.csv', test_path='data/test.csv')
processed_df = analyzer.preprocess_data()
X_train, X_val, y_train, y_val = analyzer.split_data(test_size=0.2, random_state=42)
analyzer.explore_data()



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Training Data Overview:
target
0.0    0.349225
1.0    0.343775
2.0    0.307000
Name: proportion, dtype: float64

Feature Correlations with Target:
target                  1.000000
b_sentiment_neu         0.048594
a_sentiment_neu         0.037009
a_sentiment_neg        -0.005198
b_word_count           -0.008952
b_response_length      -0.009091
b_sentiment_neg        -0.022440
b_sentiment_compound   -0.027534
a_sentiment_pos        -0.041481
b_sentiment_pos        -0.043757
a_sentiment_compound   -0.046784
a_word_count           -0.082780
a_response_length      -0.083468
Name: target, dtype: float64


In [2]:
import pandas as pd
df = pd.read_csv("data/train.csv")
# Only a cell's last expression is rendered automatically, so show the preview
# explicitly (display is provided by the IPython/Jupyter kernel).
display(df.head())
# `columns` is an attribute (a pandas Index), not a method; calling it raises TypeError.
df.columns

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
import torch
from transformers import AutoTokenizer, AutoModel

class LLMPreferenceModel:
    """BERT + TF-IDF + LightGBM pipeline for three-way preference prediction.

    Features are the [CLS] embeddings of the prompt and both responses, a
    TF-IDF representation of the prompt, and three character-length columns.
    The classifier predicts one of: model A wins (0), model B wins (1), tie (2).
    """

    def __init__(self):
        """Load the BERT tokenizer and encoder, placing the encoder on GPU if available."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = AutoModel.from_pretrained('bert-base-uncased').to(self.device)
        # TF-IDF vectorizer fitted by feature_engineering and reused for later
        # frames so every feature matrix shares one vocabulary.
        self._tfidf = None

    def extract_bert_embeddings(self, texts, batch_size=32):
        """Return the first-token ([CLS]) embedding for every text.

        Texts are encoded in mini-batches; sending an entire dataset through
        the encoder in one padded batch exhausts memory on realistic inputs.

        Args:
            texts: A pandas Series (or any iterable) of strings.
            batch_size: Number of texts encoded per forward pass.

        Returns:
            A 2-D NumPy array with one row per text.

        Raises:
            ValueError: If ``batch_size`` is not positive or ``texts`` is empty.
        """
        if batch_size < 1:
            raise ValueError('batch_size must be a positive integer')
        text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        if not text_list:
            raise ValueError('texts must contain at least one item')

        chunks = []
        for start in range(0, len(text_list), batch_size):
            batch = text_list[start:start + batch_size]
            inputs = self.tokenizer(
                batch, padding=True, truncation=True, return_tensors='pt'
            ).to(self.device)
            with torch.no_grad():  # inference only; no gradients needed
                outputs = self.bert_model(**inputs)
            # Position 0 of the sequence axis is the first token of each text.
            chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
        return np.concatenate(chunks, axis=0)

    def feature_engineering(self, df, fit=None):
        """Build the dense feature matrix for ``df``.

        The input frame is not modified.

        Args:
            df: Frame with ``prompt``, ``response_a`` and ``response_b`` columns.
            fit: Controls the TF-IDF vocabulary. ``None`` (default) fits it on
                the first call and reuses it afterwards, so a train-then-test
                call sequence yields matrices with identical columns.
                ``True`` always refits on ``df``; ``False`` requires a
                previously fitted vectorizer.

        Returns:
            2-D array laid out as
            ``[prompt emb | response_a emb | response_b emb | prompt TF-IDF | 3 lengths]``.

        Raises:
            ValueError: If ``fit=False`` and no vectorizer has been fitted yet.
        """
        vectorizer = getattr(self, '_tfidf', None)
        if fit is None:
            fit = vectorizer is None
        if not fit and vectorizer is None:
            # Fail before the expensive embedding pass.
            raise ValueError('TF-IDF vectorizer is not fitted; call with fit=True first')

        # Normalise to plain strings so missing values cannot reach the tokenizer.
        prompts = df['prompt'].fillna('').astype(str)
        responses_a = df['response_a'].fillna('').astype(str)
        responses_b = df['response_b'].fillna('').astype(str)

        # Text length features
        lengths = np.column_stack([
            prompts.str.len(),
            responses_a.str.len(),
            responses_b.str.len(),
        ])

        # BERT embeddings for prompt and responses
        embedding_features = np.hstack([
            self.extract_bert_embeddings(prompts),
            self.extract_bert_embeddings(responses_a),
            self.extract_bert_embeddings(responses_b),
        ])

        # TF-IDF features: fit once, then only transform, so the vocabulary
        # (and therefore the column meaning/count) is stable across frames.
        if fit:
            vectorizer = TfidfVectorizer(max_features=100)
            prompt_tfidf = vectorizer.fit_transform(prompts).toarray()
            self._tfidf = vectorizer
        else:
            prompt_tfidf = vectorizer.transform(prompts).toarray()

        # Combine all features
        return np.hstack([embedding_features, prompt_tfidf, lengths])

    def prepare_target(self, df):
        """Collapse the one-hot winner columns into class ids 0 (A), 1 (B), 2 (tie).

        Rows in which no winner column equals 1 keep the initial value 0.
        """
        target = np.zeros(len(df))
        target[df['winner_model_a'] == 1] = 0
        target[df['winner_model_b'] == 1] = 1
        target[df['winner_tie'] == 1] = 2
        return target

    def train_model(self, X_train, y_train, X_val, y_val):
        """Train a 3-class LightGBM model with early stopping on the validation set.

        Returns:
            The trained booster returned by ``lgb.train``.
        """
        # LightGBM parameters
        params = {
            'objective': 'multiclass',
            'num_class': 3,
            'metric': 'multi_logloss',
            'learning_rate': 0.1,
            'num_leaves': 31,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.8,
            'bagging_freq': 5
        }

        # Create datasets; the validation set references the training set so
        # both are binned consistently.
        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

        # Early stopping is passed as a callback: the `early_stopping_rounds`
        # keyword of lgb.train was removed in LightGBM 4.x.
        model = lgb.train(
            params,
            train_data,
            valid_sets=[train_data, val_data],
            num_boost_round=100,
            callbacks=[lgb.early_stopping(stopping_rounds=10)],
        )

        return model

    def predict_probabilities(self, model, X_test):
        """Return ``model.predict(X_test)`` (per-class probabilities for a multiclass booster)."""
        return model.predict(X_test)

    def prepare_submission(self, test_df, probs):
        """Assemble the submission frame from test ids and an ``(n_rows, 3)`` probability array."""
        submission = pd.DataFrame({
            'id': test_df['id'],
            'winner_model_a': probs[:, 0],
            'winner_model_b': probs[:, 1],
            'winner_tie': probs[:, 2]
        })
        return submission

# Main execution
def main():
    """Run the pipeline end to end: load CSVs, featurize, train, predict, write submission."""
    # Read the train and test tables
    train_frame = pd.read_csv('data/train.csv')
    test_frame = pd.read_csv('data/test.csv')

    # Build the feature/model helper
    pipeline = LLMPreferenceModel()

    # Feature matrix and class labels for training
    features = pipeline.feature_engineering(train_frame)
    labels = pipeline.prepare_target(train_frame)

    # Stratified 80/20 hold-out split
    split = train_test_split(features, labels, test_size=0.2, random_state=42, stratify=labels)
    train_features, val_features, train_labels, val_labels = split

    # Fit the classifier
    booster = pipeline.train_model(train_features, train_labels, val_features, val_labels)

    # Featurize the test table and score it
    test_features = pipeline.feature_engineering(test_frame)
    test_probabilities = pipeline.predict_probabilities(booster, test_features)

    # Assemble the submission table and write it to disk
    result = pipeline.prepare_submission(test_frame, test_probabilities)
    result.to_csv('submission.csv', index=False)


if __name__ == '__main__':
    main()

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

ImportError: 
 requires the protobuf library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
# Diagnostic: confirm that protobuf is importable in this kernel and print
# the installed version.
import google.protobuf
print(google.protobuf.__version__)
