In [None]:
import os
from pathlib import Path

import pandas as pd

# Folder holding the competition CSV files. Set the LLM_CLS_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the original
# local folder is used, so existing runs behave as before.
DATA_DIR = Path(
    os.environ.get(
        "LLM_CLS_DATA_DIR",
        r"C:\Users\cemzuza\Desktop\llm-classification-finetuning",
    )
)
TRAIN_PATH = DATA_DIR / "train.csv"
TEST_PATH = DATA_DIR / "test.csv"
SAMPLE_SUB_PATH = DATA_DIR / "sample_submission.csv"

# Report the status of every input first, then fail once with the full list of
# missing files instead of stopping at the first failing read_csv call.
missing_paths = []
for p in [TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH]:
    print(p, "==>", p.exists())
    if not p.exists():
        missing_paths.append(p)
if missing_paths:
    raise FileNotFoundError(
        "Brakuje plików danych: " + ", ".join(str(p) for p in missing_paths)
    )

train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sample = pd.read_csv(SAMPLE_SUB_PATH)

print("Train shape:", train.shape)
print("Test shape :", test.shape)
print("\nKolumny train:", list(train.columns))
print("Kolumny test :", list(test.columns))

# Last expression of the cell: rich preview of the first training rows
train.head(3)


C:\Users\cemzuza\Desktop\llm-classification-finetuning\train.csv ==> True
C:\Users\cemzuza\Desktop\llm-classification-finetuning\test.csv ==> True
C:\Users\cemzuza\Desktop\llm-classification-finetuning\sample_submission.csv ==> True
Train shape: (57477, 9)
Test shape : (3, 4)

Kolumny train: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie']
Kolumny test : ['id', 'prompt', 'response_a', 'response_b']


Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1


In [None]:
import numpy as np

# One-hot winner columns, listed in the same order as the labels they map to
target_cols = ["winner_model_a", "winner_model_b", "winner_tie"]
label_map = dict(zip(target_cols, ["A", "B", "TIE"]))

# For every row take the name of the column holding the maximum value and
# translate that column name into its short class label.
winner_col = train[target_cols].idxmax(axis=1)
y = winner_col.map(label_map)

class_ratio = y.value_counts(normalize=True).rename("ratio")
print("Rozkład klas (proporcje):")
print(class_ratio)
print()
print("Przykładowe etykiety:", y.iloc[:10].tolist())

def simple_numeric_feats(df):
    r"""Build simple count-based numeric features for both responses.

    For each of ``response_a`` and ``response_b`` (missing values are treated
    as empty strings) the function computes: character length, number of
    whitespace-separated tokens, number of lines, and the number of code
    fences (```), URL schemes, question marks, exclamation marks and double
    quotes. For every feature a ``diff_<feature>`` column holding
    ``a - b`` is appended.

    Line counting: the response columns store JSON-encoded text, in which a
    line break appears as the two-character escape ``\n`` rather than as a
    real newline character. Counting only real newlines therefore yields
    ``n_lines == 1`` for every row, so both forms are counted here.
    NOTE(review): an escaped backslash followed by ``n`` in the JSON text is
    also counted; treat ``n_lines`` as an approximation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``response_a`` and ``response_b``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with 24 columns: 8 features for side ``a``,
        8 for side ``b``, then the 8 ``diff_*`` columns.
    """
    # Regex pattern behind each plain occurrence-count feature (order defines
    # the column order of the result).
    count_patterns = {
        "n_code": "```",
        "n_urls": r"http[s]?://",
        "n_q": r"\?",
        "n_excl": r"!",
        "n_quotes": r'"',
    }
    # Escaped "\n" (backslash + n) or a real newline character
    line_break = r"\\n|\n"

    out = {}
    for side in ["a", "b"]:
        s = df[f"response_{side}"].fillna("")
        out[f"len_char_{side}"] = s.str.len()
        out[f"len_word_{side}"] = s.str.split().str.len()
        out[f"n_lines_{side}"] = s.str.count(line_break) + 1
        for feat_name, pattern in count_patterns.items():
            out[f"{feat_name}_{side}"] = s.str.count(pattern)

    num = pd.DataFrame(out).fillna(0)
    for base in ["len_char", "len_word", "n_lines", *count_patterns]:
        num[f"diff_{base}"] = num[f"{base}_a"] - num[f"{base}_b"]
    return num
# Run the same extractor on both splits (train first, then test) so the
# feature columns line up between them.
num_train, num_test = (simple_numeric_feats(frame) for frame in (train, test))
print("Przykładowe cechy liczbowe:")
num_train.head(3)


Rozkład klas (proporcje):
A      0.349079
B      0.341911
TIE    0.309011
Name: ratio, dtype: float64

Przykładowe etykiety: ['A', 'B', 'TIE', 'A', 'B', 'B', 'A', 'B', 'B', 'B']
Przykładowe cechy liczbowe:


Unnamed: 0,len_char_a,len_word_a,n_lines_a,n_code_a,n_urls_a,n_q_a,n_excl_a,n_quotes_a,len_char_b,len_word_b,...,n_excl_b,n_quotes_b,diff_len_char,diff_len_word,diff_n_lines,diff_n_code,diff_n_urls,diff_n_q,diff_n_excl,diff_n_quotes
0,4538,656,1,0,0,1,5,16,1206,204,...,2,4,3332,452,0,0,0,1,3,12
1,3114,531,1,0,0,0,0,6,3649,571,...,0,6,-535,-40,0,0,0,0,0,0
2,921,138,1,2,0,0,2,20,1835,280,...,0,14,-914,-142,0,-2,0,0,2,6


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

# Settings common to both vectorizers: vocabulary limited to 10,000 entries,
# terms must occur in at least 2 documents.
_shared_tfidf_kwargs = dict(max_features=10_000, min_df=2)

# Word-level model: accent-stripped, lowercased unigrams and bigrams
tfidf_word = TfidfVectorizer(
    strip_accents="unicode",
    lowercase=True,
    ngram_range=(1, 2),
    **_shared_tfidf_kwargs,
)

# Character-level model: n-grams of 3 to 5 characters
tfidf_char = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    **_shared_tfidf_kwargs,
)

def tfidf_fit_transform(col_train, col_test):
    """
    Vectorize one text column with the word-level and char-level TF-IDF models.

    Each module-level vectorizer (tfidf_word first, then tfidf_char) is fitted
    on col_train and then applied to col_test. The word block and the char
    block are stacked side by side, word block first.

    Note: the shared vectorizers are refitted on every call, so after a call
    they hold the state fitted on the most recently passed col_train.

    Returns a tuple (X_tr, X_te) of CSR sparse matrices.
    """
    train_parts, test_parts = [], []
    for vectorizer in (tfidf_word, tfidf_char):
        train_parts.append(vectorizer.fit_transform(col_train))
        test_parts.append(vectorizer.transform(col_test))
    return (
        sparse.hstack(train_parts, format="csr"),
        sparse.hstack(test_parts, format="csr"),
    )

text_cols = ["prompt", "response_a", "response_b"]
X_blocks_tr, X_blocks_te = [], []

# One independently fitted TF-IDF block per text column; missing text is
# replaced with an empty string before vectorizing.
for col_name in text_cols:
    block_tr, block_te = tfidf_fit_transform(
        train[col_name].fillna(""),
        test[col_name].fillna(""),
    )
    X_blocks_tr.append(block_tr)
    X_blocks_te.append(block_te)

# Column-wise concatenation of the per-column blocks, in text_cols order
X_text_tr = sparse.hstack(X_blocks_tr, format="csr")
X_text_te = sparse.hstack(X_blocks_te, format="csr")

print("TF-IDF train shape:", X_text_tr.shape)
print("TF-IDF test  shape:", X_text_te.shape)


TF-IDF train shape: (57477, 60000)
TF-IDF test  shape: (3, 60000)


In [None]:
from scipy import sparse
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Fail early with a readable message when an earlier step has not been run
assert 'X_text_tr' in globals() and 'X_text_te' in globals(), "Najpierw uruchom Krok 3 (TF-IDF)."
assert 'num_train' in globals() and 'num_test' in globals(), "Brakuje cech liczbowych (Krok 2)."
assert 'y' in globals(), "Brakuje y (etykiet) z Kroku 2."

# Make sure both text blocks are sparse before stacking
X_text_tr = X_text_tr if sparse.issparse(X_text_tr) else sparse.csr_matrix(X_text_tr)
X_text_te = X_text_te if sparse.issparse(X_text_te) else sparse.csr_matrix(X_text_te)

# Scale the numeric features without centering (with_mean=False); the scaler is
# fitted on the training rows only and reused for the test rows.
num_train_values = num_train.to_numpy(dtype=float)
num_test_values = num_test.to_numpy(dtype=float)
scaler = StandardScaler(with_mean=False)
scaler.fit(num_train_values)
X_num_tr = scaler.transform(num_train_values)
X_num_te = scaler.transform(num_test_values)

# Final design matrices: TF-IDF columns first, scaled numeric columns last
X_tr_all = sparse.hstack([X_text_tr, X_num_tr], format="csr")
X_te_all = sparse.hstack([X_text_te, X_num_te], format="csr")

print("FULL train shape:", X_tr_all.shape, "| FULL test shape:", X_te_all.shape)
print("TF-IDF feats:", X_text_tr.shape[1], "| numeric feats:", X_num_tr.shape[1], "| total:", X_tr_all.shape[1])

# Integer-encode the labels in a fixed order: "A" -> 0, "B" -> 1, "TIE" -> 2
classes = np.array(["A","B","TIE"])
y_idx = pd.Categorical(y, categories=classes).codes
print("Klasy (kody):", np.unique(y_idx), "↔", classes.tolist())


FULL train shape: (57477, 60024) | FULL test shape: (3, 60024)
TF-IDF feats: 60000 | numeric feats: 24 | total: 60024
Klasy (kody): [0 1 2] ↔ ['A', 'B', 'TIE']


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# Fail early with a readable message when an earlier step has not been run
assert 'X_tr_all' in globals(), "Brakuje X_tr_all (uruchom Krok 4: łączenie TF-IDF + cechy liczbowe)."
assert 'y' in globals(), "Brakuje y (uruchom Krok 2: etykiety A/B/TIE)."

# Integer targets in a fixed order: "A" -> 0, "B" -> 1, "TIE" -> 2
classes = np.array(["A","B","TIE"])
y_idx = pd.Categorical(y, categories=classes).codes

# ---------------- CV configuration -----------------
N_FOLDS  = 3
SOLVER   = "lbfgs"
C_VALUE  = 2.0
MAX_ITER = 800
USE_BALANCED = False
# ---------------------------------------------------
# NOTE(review): the out-of-fold log loss saved with this cell (about 1.30) is
# higher than ln(3) ~= 1.0986, the log loss of always predicting 1/3 for each of
# the three classes. The predictions look overconfident; consider stronger
# regularization (a smaller C_VALUE) before trusting this model.

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
label_ids = [0, 1, 2]
class_weight = "balanced" if USE_BALANCED else None

# Out-of-fold probabilities: every row is filled exactly once, by the model of
# the fold in which that row was held out.
oof_proba  = np.zeros((X_tr_all.shape[0], 3), dtype=float)
fold_scores = []

print(f"Start {N_FOLDS}-fold walidacji (solver={SOLVER}, C={C_VALUE}, max_iter={MAX_ITER})...")
for fold, (tr_idx, va_idx) in enumerate(skf.split(X_tr_all, y_idx), start=1):
    clf = LogisticRegression(
        solver=SOLVER,
        max_iter=MAX_ITER,
        C=C_VALUE,
        random_state=fold,
        class_weight=class_weight,
    )
    clf.fit(X_tr_all[tr_idx], y_idx[tr_idx])

    fold_proba = clf.predict_proba(X_tr_all[va_idx])
    fold_ll = log_loss(y_idx[va_idx], fold_proba, labels=label_ids)

    oof_proba[va_idx] = fold_proba
    fold_scores.append(fold_ll)
    print(f"Fold {fold}: log loss = {fold_ll:.6f}")

oof_ll = log_loss(y_idx, oof_proba, labels=label_ids)
print("\nFold scores:", [f"{s:.6f}" for s in fold_scores])
print(f"OOF log loss: {oof_ll:.6f}")


Start 3-fold walidacji (solver=lbfgs, C=2.0, max_iter=800)...
Fold 1: log loss = 1.291410
Fold 2: log loss = 1.291325
Fold 3: log loss = 1.315543

Fold scores: ['1.291410', '1.291325', '1.315543']
OOF log loss: 1.299426


In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Fail early with a readable message when an earlier step has not been run
assert 'X_tr_all' in globals(), "Brakuje X_tr_all (uruchom Krok 4)."
assert 'y' in globals(), "Brakuje y (uruchom Krok 2: etykiety A/B/TIE)."

# Integer targets in a fixed order: "A" -> 0, "B" -> 1, "TIE" -> 2
classes = np.array(["A", "B", "TIE"])
y_idx = pd.Categorical(y, categories=classes).codes

# These literals repeat the solver / C / max_iter values used in the
# cross-validation cell; keep both places in sync when tuning.
# NOTE(review): the output saved with this cell shows lbfgs stopping at the
# iteration limit, so the fitted model may not have converged.
final_params = {
    "solver": "lbfgs",
    "max_iter": 800,
    "C": 2.0,
    "random_state": 123,
}
final_clf = LogisticRegression(**final_params)

print("Start treningu na CAŁYM train...")
final_clf.fit(X_tr_all, y_idx)
print("Trening zakończony.")


Start treningu na CAŁYM train...
Trening zakończony.


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
import numpy as np
import pandas as pd

# Fail early with a readable message when an earlier step has not been run
assert 'X_te_all' in globals(), "Brakuje X_te_all (uruchom Krok 4)."
assert 'test' in globals(), "Brakuje DataFrame 'test' (Krok 1)."
assert 'final_clf' in globals(), "Brakuje final_clf (uruchom Krok 6A)."

classes = np.array(["A", "B", "TIE"])
test_proba = final_clf.predict_proba(X_te_all)

# predict_proba orders its columns like final_clf.classes_. Resolve the column
# of every label through classes_ instead of assuming column i is class code i;
# that assumption breaks silently if the fitted class set is not exactly 0, 1, 2.
known_labels = classes.tolist()
proba_col = {}
for col, cls in enumerate(final_clf.classes_):
    if isinstance(cls, (int, np.integer)):
        # integer code -> label; codes outside the known range are ignored
        label = known_labels[cls] if 0 <= cls < len(known_labels) else None
    else:
        label = str(cls)
    if label in known_labels:
        proba_col[label] = col
missing_classes = [label for label in known_labels if label not in proba_col]
assert not missing_classes, f"Brakuje klas w final_clf.classes_: {missing_classes}"

sub = pd.DataFrame({
    "id": test["id"].values,
    "winner_model_a": test_proba[:, proba_col["A"]],
    "winner_model_b": test_proba[:, proba_col["B"]],
    "winner_tie":    test_proba[:, proba_col["TIE"]],
})

# Sanity checks: each row of probabilities should sum to about 1, with no NaN
row_sums = sub[["winner_model_a","winner_model_b","winner_tie"]].sum(axis=1)
print("Średnia suma wiersza (powinno być ≈ 1.0):", float(row_sums.mean()))
print("NaN w submission?:", sub.isna().any().to_dict())

out_path = "submission.csv"
sub.to_csv(out_path, index=False)
print("Zapisano:", out_path)
sub.head()



Średnia suma wiersza (powinno być ≈ 1.0): 1.0
NaN w submission?: {'id': False, 'winner_model_a': False, 'winner_model_b': False, 'winner_tie': False}
Zapisano: submission.csv


Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.081277,0.327808,0.590916
1,211333,0.817818,0.121692,0.06049
2,1233961,0.509131,0.257815,0.233054
