In [1]:
import pandas as pd
import numpy as np
import re
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score

def extract_code_features(df):
    df['char_len'] = df['code'].str.len()
    df['word_count'] = df['code'].apply(lambda x: len(x.split()))
    df['sentence_count'] = df['code'].apply(lambda x: x.count('.') + x.count('\n'))
    df['avg_token_len'] = df['code'].apply(lambda x: np.mean([len(w) for w in x.split()]) if x.split() else 0)
    df['num_digits'] = df['code'].apply(lambda x: sum(c.isdigit() for c in x))
    df['num_loops'] = df['code'].str.count(r'\bfor\b|\bwhile\b')
    df['num_if'] = df['code'].str.count(r'\bif\b')
    df['num_functions'] = df['code'].str.count(r'\bdef\b|\bfunction\b|\bvoid\b')
    df['num_comments'] = df['code'].str.count(r'#|//|/\*|\*/')
    df['has_O_complexity'] = df['code'].str.contains(r'O\([^)]+\)', flags=re.IGNORECASE).astype(int)
    df['contains_tree'] = df['code'].str.contains(r'\btree\b', flags=re.IGNORECASE).astype(int)
    df['contains_dp'] = df['code'].str.contains(r'dynamic programming|\bdp\b', flags=re.IGNORECASE).astype(int)
    df['contains_hash'] = df['code'].str.contains(r'\bhash', flags=re.IGNORECASE).astype(int)
    df['contains_stack'] = df['code'].str.contains(r'\bstack\b', flags=re.IGNORECASE).astype(int)
    df['contains_recursive'] = df['code'].str.contains(r'\brecursive|\brecursion', flags=re.IGNORECASE).astype(int)
    df['has_algo_steps'] = df['code'].str.contains(r'^\s*\d+\.', flags=re.M).astype(int)
    return df

# === 1. Предобработка ===
sample_submission = pd.read_csv('/kaggle/input/kz-tst-day-3/sample_submission.csv')
train = pd.read_csv('/kaggle/input/kz-tst-day-3/train.csv')
test = pd.read_csv('/kaggle/input/kz-tst-day-3/test.csv')

train['code'] = train['code'].fillna('').astype(str).str.strip()
train = train[train['code'].str.len() > 0]
train = train.dropna(subset=['difficulty'])
train['difficulty'] = train['difficulty'].map({'easy': 0, 'medium': 1, 'hard': 2})

# Сохраним копию train до oversampling для оценки
train_orig = train.copy()

# === 2. Балансировка классов через oversampling ===
max_count = train['difficulty'].value_counts().max()
train_bal = pd.concat([
    train[train['difficulty'] == 0].sample(max_count, replace=True, random_state=42),
    train[train['difficulty'] == 1].sample(max_count, replace=True, random_state=42),
    train[train['difficulty'] == 2].sample(max_count, replace=True, random_state=42)
])

# === 3. Извлечение признаков ===
train_bal = extract_code_features(train_bal)
feature_cols = [
    'char_len', 'word_count', 'sentence_count', 'avg_token_len', 'num_digits',
    'num_loops', 'num_if', 'num_functions', 'num_comments',
    'has_O_complexity', 'contains_tree', 'contains_dp',
    'contains_hash', 'contains_stack', 'contains_recursive', 'has_algo_steps'
]
X_meta = train_bal[feature_cols].values

# === 4. TF-IDF ===
tfidf = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 2),
    token_pattern=r"(?u)\b\w+\b"
)
X_tfidf = tfidf.fit_transform(train_bal['code'])

# === 5. Объединяем ===
X_full = hstack([X_tfidf, csr_matrix(X_meta)])
y = train_bal['difficulty']

# === 6. Train / Validation + обучение ===
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y, test_size=0.1, random_state=42, stratify=y
)
model = LGBMClassifier(random_state=42)
model.fit(X_train, y_train)

# === 7. Оценка ===
y_pred = model.predict(X_val)
print('\n[Validation Classification Report on Oversampled Data]')
print(classification_report(y_val, y_pred, target_names=['easy', 'medium', 'hard']))

# === 8. Оценка на оригинальном train (до oversampling) ===
train_orig = extract_code_features(train_orig)
X_orig_meta = train_orig[feature_cols].values
X_orig_tfidf = tfidf.transform(train_orig['code'])
X_orig_full = hstack([X_orig_tfidf, csr_matrix(X_orig_meta)])
y_orig = train_orig['difficulty']
y_orig_pred = model.predict(X_orig_full)

print('\n[Accuracy on Original Train (No Oversampling)]:', accuracy_score(y_orig, y_orig_pred))
print('[Classification Report on Original Train]')
print(classification_report(y_orig, y_orig_pred, target_names=['easy', 'medium', 'hard']))

# === 9. Предсказание на test ===
test['code'] = test['code'].fillna('').astype(str).str.strip()
test = extract_code_features(test)
X_test_meta = test[feature_cols].values
X_test_tfidf = tfidf.transform(test['code'])
X_test_full = hstack([X_test_tfidf, csr_matrix(X_test_meta)])
test_preds = model.predict(X_test_full)

# === 10. Сабмишен ===
difficulty_map = {0: 'easy', 1: 'medium', 2: 'hard'}
sample_submission['difficulty'] = pd.Series(test_preds).map(difficulty_map)
sample_submission.to_csv('submission.csv', index=False)

# === 11. Просмотр ===
print('\n[Test Prediction Distribution]')
print(sample_submission['difficulty'].value_counts())
print('\n[Sample Submission Preview]')
print(sample_submission.head())


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068732 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 82960
[LightGBM] [Info] Number of data points in the train set: 2767, number of used features: 3095
[LightGBM] [Info] Start training from score -1.097890
[LightGBM] [Info] Start training from score -1.098974
[LightGBM] [Info] Start training from score -1.098974

[Validation Classification Report on Oversampled Data]
              precision    recall  f1-score   support

        easy       0.89      0.93      0.91       102
      medium       0.90      0.80      0.85       103
        hard       0.88      0.94      0.91       103

    accuracy                           0.89       308
   macro avg       0.89      0.89      0.89       308
weighted avg       0.89      0.89      0.89       308


[Accuracy on Original Train (No Oversampling)]: 0.8553259141494436
[Classification Report on Original Train]
