## 1. Импорт и поиск данных

In [1]:
from pathlib import Path
from typing import List
import numpy as np
import pandas as pd
import matplotlib .pyplot as plt

from sklearn .pipeline import Pipeline
from sklearn .preprocessing import StandardScaler ,MinMaxScaler
from sklearn .linear_model import LogisticRegression
from sklearn .neighbors import KNeighborsClassifier
from sklearn .ensemble import RandomForestClassifier
from sklearn .metrics import accuracy_score ,precision_score ,recall_score ,f1_score ,roc_auc_score
from sklearn .preprocessing import label_binarize

# Seed passed as random_state to the random-forest classifiers in this notebook.
RANDOM_STATE =42

def find_file(patterns: List[str], search_dirs: List[str]):
    """Return the first regular file matching any glob pattern, or ``None``.

    Directories are scanned in the order given, and within each directory
    the patterns are tried in order, so earlier entries take priority.

    Args:
        patterns: Glob patterns evaluated relative to each directory.
        search_dirs: Directory paths to search.

    Returns:
        A ``Path`` for the first match that is a regular file, or ``None``
        when no pattern matches a file in any directory.
    """
    # The generator is lazy, so globbing stops as soon as a file is found.
    matches = (
        hit
        for folder in map(Path, search_dirs)
        for pattern in patterns
        for hit in folder.glob(pattern)
        if hit.is_file()
    )
    return next(matches, None)

# Directories probed, in priority order, for the train/test CSV files.
SEARCH_DIRS = [
    'data',
    '.',
    '/mnt/data',
    '/content',
    '/kaggle/working',
    '/kaggle/input',
]

# Exact file names come first; the recursive wildcard is the last resort.
TRAIN_PATS = [
    'fashion-mnist_train.csv.zip',
    'fashion_mnist_train.csv.zip',
    'fashion-mnist_train.csv',
    'fashion_mnist_train.csv',
    '**/fashion*mnist*train*.csv*',
]
TEST_PATS = [
    'fashion-mnist_test.csv.zip',
    'fashion_mnist_test.csv.zip',
    'fashion-mnist_test.csv',
    'fashion_mnist_test.csv',
    '**/fashion*mnist*test*.csv*',
]

TRAIN_PATH = find_file(TRAIN_PATS, SEARCH_DIRS)
TEST_PATH = find_file(TEST_PATS, SEARCH_DIRS)

# Both files are required; stop before any loading is attempted.
if TRAIN_PATH is None or TEST_PATH is None:
    raise FileNotFoundError('Положите train/test в папку data/ рядом с ноутбуком.')
print('Использую файлы:\nTRAIN:', TRAIN_PATH, '\nTEST:', TEST_PATH)

train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
print('Исходные формы:', train_df.shape, test_df.shape)

FileNotFoundError: Положите train/test в папку data/ рядом с ноутбуком.

## 2. Фикс‑сплит `:50000 / 50000:` и выравнивание признаков

In [None]:

# Fixed positional split: rows before index 50000 are the training part,
# the remaining rows are the validation part.
train_fixed =train_df .iloc [:50000 ].reset_index (drop =True )
val_fixed =train_df .iloc [50000 :].reset_index (drop =True )


# Every column except 'label' is treated as an input feature.
feature_cols =[c for c in train_df .columns if c !='label']


# Align the test frame with the training features: first drop label/index-like
# columns, then drop any column that is not one of the training features.
drop_candidates =[c for c in ['label','id','index','Unnamed: 0']if c in test_df .columns ]
if drop_candidates :
    test_df =test_df .drop (columns =drop_candidates )
extra_cols =[c for c in test_df .columns if c not in feature_cols ]
if extra_cols :
    test_df =test_df .drop (columns =extra_cols )
# NOTE(review): if the column counts still differ, only the leading
# len(feature_cols) columns are kept by position; names are not checked here.
if len (test_df .columns )!=len (feature_cols ):
    test_df =test_df .iloc [:,:len (feature_cols )]

# Convert features to float32 arrays; labels are kept as stored in the frame.
X_tr =train_fixed [feature_cols ].astype (np .float32 ).values
y_tr =train_fixed ['label'].values
X_val =val_fixed [feature_cols ].astype (np .float32 ).values
y_val =val_fixed ['label'].values
# Use the training column order when the test frame contains every feature
# column; otherwise fall back to converting the test frame as it is.
X_test =(test_df [feature_cols ].astype (np .float32 ,errors ='ignore').values
if set (feature_cols ).issubset (test_df .columns )else test_df .astype (np .float32 ).values )


# Divide by 255 when the training maximum exceeds 1.0 (presumably raw 0-255
# pixel intensities - confirm). The decision is based on the training data only
# and the same divisor is applied to the validation and test arrays.
if X_tr .max ()>1.0 :
    X_tr =X_tr /255.0
    X_val =X_val /255.0
    X_test =X_test /255.0

print ('Shapes after align:',X_tr .shape ,X_val .shape ,X_test .shape )

## 3. Визуализация: по одному примеру на класс

In [None]:
# Show the first training example of every class in a two-row grid.
pixels = feature_cols
# Images are treated as square; 28 is the fallback when there are no pixel columns.
side = int(np.sqrt(len(pixels))) if len(pixels) else 28

plt.figure(figsize=(12, 3))
classes_sorted = sorted(train_df['label'].unique())
# Two rows; the column count is rounded up so an odd number of classes still fits.
n_cols = len(classes_sorted) // 2 + len(classes_sorted) % 2
for i, cls in enumerate(classes_sorted):
    ex = train_df[train_df['label'] == cls].iloc[0]
    img = ex[pixels].values.reshape(side, side)
    ax = plt.subplot(2, n_cols, i + 1)
    ax.imshow(img, cmap='gray')
    ax.axis('off')
    ax.set_title(str(cls))

plt.suptitle('Один пример на класс')
plt.tight_layout()
plt.show()

## 4. Метрики (Accuracy / Precision / Recall / F1‑macro / ROC‑AUC macro/micro)

In [None]:
def multiclass_metrics(y_true, y_pred, y_proba=None, classes=None):
    """Compute macro-averaged classification metrics, plus ROC-AUC when possible.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        y_proba: Optional per-class scores passed to ``roc_auc_score``;
            ROC-AUC is skipped when this is ``None``.
        classes: Optional label list used to binarize ``y_true``; ROC-AUC is
            skipped when this is ``None``.

    Returns:
        A dict with ``accuracy``, ``precision_macro``, ``recall_macro`` and
        ``f1_macro``. ``roc_auc_macro`` / ``roc_auc_micro`` are added for each
        average that could be computed.
    """
    import warnings

    res = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision_macro': precision_score(y_true, y_pred, average='macro', zero_division=0),
        'recall_macro': recall_score(y_true, y_pred, average='macro', zero_division=0),
        'f1_macro': f1_score(y_true, y_pred, average='macro', zero_division=0),
    }
    if y_proba is None or classes is None:
        return res

    Y_true_bin = label_binarize(y_true, classes=classes)
    # Each average is attempted on its own so that a failure of one (for
    # example a class missing from y_true) does not discard the other.
    for key, average in (('roc_auc_macro', 'macro'), ('roc_auc_micro', 'micro')):
        try:
            res[key] = roc_auc_score(Y_true_bin, y_proba, average=average, multi_class='ovr')
        except (ValueError, TypeError) as exc:
            # Stay best-effort, but report why the metric is missing instead
            # of dropping it silently.
            warnings.warn(f'{key} skipped: {exc}', RuntimeWarning, stacklevel=2)
    return res

## 5. Сравнение скейлеров × моделей (LR/KNN/RF)

In [None]:
# Candidate classifiers. `multi_class` is intentionally not passed to
# LogisticRegression: the argument is deprecated in recent scikit-learn, and
# with the lbfgs solver the default already fits a multinomial model for
# multi-class targets.
models = {
    'LR': LogisticRegression(solver='lbfgs', max_iter=500, tol=1e-3),
    'KNN(5)': KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    'RF': RandomForestClassifier(n_estimators=150, max_depth=20, n_jobs=-1, random_state=RANDOM_STATE),
}
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
}

rows = []
classes_sorted = sorted(np.unique(y_tr))
for sc_name, scaler in scalers.items():
    for mdl_name, mdl in models.items():
        # The same estimator objects are reused across combinations; each
        # pipe.fit call refits them on the training data.
        pipe = Pipeline([('scaler', scaler), ('clf', mdl)])
        pipe.fit(X_tr, y_tr)
        y_pred = pipe.predict(X_val)
        # A Pipeline exposes predict_proba only when its final step has it, so
        # one capability check replaces the former swallow-all try/except.
        y_proba = pipe.predict_proba(X_val) if hasattr(pipe, 'predict_proba') else None
        m = multiclass_metrics(y_val, y_pred, y_proba, classes_sorted)
        rows.append({'scaler': sc_name, 'model': mdl_name, **m})

# Best combination first: ranked by macro F1, ties broken by accuracy.
compare_df = pd.DataFrame(rows).sort_values(['f1_macro', 'accuracy'], ascending=False).reset_index(drop=True)
compare_df

## 6. Подбор `k` для KNN (StandardScaler)

In [None]:
# Sweep the neighbour count for KNN on standardized features.
ks = [1, 3, 5, 7, 9, 11]

# Fit the scaler on the training part only and reuse it for validation.
scaler = StandardScaler()
Xtr_s = scaler.fit_transform(X_tr)
Xval_s = scaler.transform(X_val)

rows = []
for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    knn.fit(Xtr_s, y_tr)
    yp = knn.predict(Xval_s)
    # Probabilities are best-effort: on failure ROC-AUC is simply left out.
    try:
        y_proba = knn.predict_proba(Xval_s)
    except Exception:
        y_proba = None
    m = multiclass_metrics(y_val, yp, y_proba, classes_sorted)
    rows.append({'k': k, **m})

# Best k first: ranked by macro F1, ties broken by accuracy.
knn_df = (
    pd.DataFrame(rows)
    .sort_values(['f1_macro', 'accuracy'], ascending=False)
    .reset_index(drop=True)
)
knn_df

## 7. Лучшая связка → обучение → сабмит CSV

In [None]:
from sklearn.base import clone

# compare_df is sorted best-first, so row 0 is the winning scaler/model pair.
best_row = compare_df.iloc[0]
print('Лучшая связка по валидации:', best_row['scaler'], '+', best_row['model'])

# Clone the exact scaler and estimator that were validated instead of
# re-typing their hyperparameters. The hand-written copies had drifted (the
# random forest was rebuilt with 200 trees although 150 were scored), so the
# submitted model was not the one that had been selected.
best_scaler = clone(scalers[best_row['scaler']])
best_model = clone(models[best_row['model']])

final_pipe = Pipeline([('scaler', best_scaler), ('clf', best_model)])
final_pipe.fit(X_tr, y_tr)
test_pred = final_pipe.predict(X_test).astype(int)

# Submission file: sequential ids paired with the predicted labels.
submit = pd.DataFrame({'id': np.arange(len(test_pred)), 'label': test_pred})
submit.to_csv('fashion_mnist_predictions.csv', index=False, encoding='utf-8')
print('Сохранён файл: fashion_mnist_predictions.csv')


# Persist both comparison tables next to the submission.
compare_df.to_csv('compare_models.csv', index=False, encoding='utf-8')
knn_df.to_csv('knn_sweep.csv', index=False, encoding='utf-8')
print('Сохранены: compare_models.csv, knn_sweep.csv')

## 8. Теория — реализации

In [None]:
from collections import Counter

# Small bound used to keep log() arguments and divisors away from zero.
EPS = 1e-12


def sigmoid(z: np.ndarray) -> np.ndarray:
    """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

    The value is computed as ``exp(-log(1 + exp(-z)))`` through
    ``np.logaddexp``, which never evaluates ``exp`` of a large positive
    number. The direct formula overflows (with a RuntimeWarning) for strongly
    negative ``z``; this form returns the same limits, 0 and 1, without it.

    Args:
        z: Array (or scalar) of logits.

    Returns:
        Values in ``[0, 1]`` with the same shape as ``z``.
    """
    return np.exp(-np.logaddexp(0.0, -z))

def logistic_loss_binary(y_true: np.ndarray, y_prob: np.ndarray) -> float:
    """Return the mean binary cross-entropy of ``y_prob`` against ``y_true``.

    Args:
        y_true: Binary targets (0 or 1).
        y_prob: Predicted probabilities of the positive class.

    Returns:
        The average negative log-likelihood as a Python float.
    """
    # Clip so that neither log() below receives exactly 0.
    clipped = np.clip(y_prob, EPS, 1.0 - EPS)
    per_sample = -(y_true * np.log(clipped) + (1 - y_true) * np.log(1 - clipped))
    return float(np.mean(per_sample))

def logistic_grad_binary(X: np.ndarray, y: np.ndarray, w: np.ndarray, b: float = 0.0):
    """Return the gradient of the mean logistic loss w.r.t. weights and bias.

    Args:
        X: Feature matrix with one sample per row.
        y: Binary targets (0 or 1), one per row of ``X``.
        w: Weight vector.
        b: Bias term.

    Returns:
        A ``(grad_w, grad_b)`` tuple: the weight gradient as an array and the
        bias gradient as a Python float.
    """
    # Difference between predicted probability and target for every sample.
    residual = sigmoid(X @ w + b) - y
    n_samples = X.shape[0]
    return (X.T @ residual) / n_samples, float(np.mean(residual))

class LogisticRegressionScratch:
    """One-vs-rest logistic regression trained with full-batch gradient descent.

    One L2-regularised binary model is fitted per class. After ``fit``:
    ``w`` has one weight row per class, ``b`` one bias per class, and
    ``classes_`` holds the sorted unique labels.

    NOTE: ``loss_curve_`` is cleared once per ``fit`` and then appended to by
    every binary problem, so it holds ``n_classes * n_epochs`` values laid
    back to back rather than one value per epoch.
    """

    def __init__(self, lr: float = 0.1, n_epochs: int = 60, reg: float = 1e-4, random_state: int = 42):
        self.lr = lr
        self.n_epochs = n_epochs
        self.reg = reg
        self.random_state = random_state
        self.w = None
        self.b = None
        self.classes_ = None
        self.loss_curve_ = []

    def _fit_binary(self, X, y):
        """Fit one binary problem and return its ``(weights, bias)``."""
        # Every binary problem starts from the same seeded initial weights.
        rng = np.random.default_rng(self.random_state)
        w = rng.normal(0, 0.01, size=X.shape[1])
        b = 0.0
        for _ in range(self.n_epochs):
            gw, gb = logistic_grad_binary(X, y, w, b)
            gw += self.reg * w  # L2 penalty applies to the weights only
            w -= self.lr * gw
            b -= self.lr * gb
            # Record the regularised loss after the update.
            p = sigmoid(X @ w + b)
            loss = logistic_loss_binary(y, p) + 0.5 * self.reg * float(np.sum(w * w))
            self.loss_curve_.append(loss)
        return w, b

    def fit(self, X, y):
        """Fit one binary classifier per distinct label in ``y``; return ``self``."""
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        self.w = np.zeros((n_classes, X.shape[1]))
        self.b = np.zeros(n_classes)
        self.loss_curve_.clear()
        for i, cls in enumerate(self.classes_):
            y_bin = (y == cls).astype(float)
            w_i, b_i = self._fit_binary(X, y_bin)
            self.w[i] = w_i
            self.b[i] = b_i
        return self

    def decision_function(self, X):
        """Return the per-class linear scores as an ndarray.

        The input is coerced with ``np.asarray`` (as ``fit`` already does) so
        that array-likes such as DataFrames yield a plain ndarray instead of
        propagating their own type into ``predict_proba`` and ``predict``.
        """
        return np.asarray(X, dtype=float) @ self.w.T + self.b

    def predict_proba(self, X):
        """Return the per-class sigmoid scores normalised to sum to 1 per row."""
        probs = sigmoid(self.decision_function(X))
        # Clip the row sums so the division never hits zero.
        probs = probs / np.clip(probs.sum(axis=1, keepdims=True), EPS, None)
        return probs

    def predict(self, X):
        """Return the label whose linear score is highest for each row."""
        scores = self.decision_function(X)
        idx = np.argmax(scores, axis=1)
        return self.classes_[idx]

class PerceptronScratch:
    """One-vs-rest perceptron trained with the classic mistake-driven rule.

    A separate binary perceptron is trained for every class; samples are
    visited in their given order on every epoch. ``random_state`` is stored
    but not used by the training loop.
    """

    def __init__(self, lr=0.001, n_epochs=5, random_state=42):
        self.lr = lr
        self.n_epochs = n_epochs
        self.random_state = random_state
        self.w = None
        self.b = None
        self.classes_ = None

    def _train_binary(self, X, targets):
        """Train one perceptron on +1/-1 ``targets``; return ``(weights, bias)``."""
        weights = np.zeros(X.shape[1])
        bias = 0.0
        for _epoch in range(self.n_epochs):
            for sample, target in zip(X, targets):
                margin = target * (np.dot(weights, sample) + bias)
                # Update only on a mistake or a point lying on the boundary.
                if margin <= 0:
                    weights += self.lr * target * sample
                    bias += self.lr * target
        return weights, bias

    def fit(self, X, y):
        """Fit one binary perceptron per distinct label in ``y``; return ``self``."""
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        self.w = np.zeros((n_classes, X.shape[1]))
        self.b = np.zeros(n_classes)
        for row, cls in enumerate(self.classes_):
            # Current class becomes +1, every other class -1.
            targets = np.where(y == cls, 1.0, -1.0)
            weights, bias = self._train_binary(X, targets)
            self.w[row] = weights
            self.b[row] = bias
        return self

    def decision_function(self, X):
        """Return the per-class linear scores for the rows of ``X``."""
        return np.asarray(X, dtype=float) @ self.w.T + self.b

    def predict(self, X):
        """Return the label with the highest score for each row of ``X``."""
        best = np.argmax(self.decision_function(X), axis=1)
        return self.classes_[best]

class KNNClassifierScratch:
    """Brute-force k-nearest-neighbours classifier (Euclidean distance, majority vote).

    ``fit`` only stores the training data; all distance computation happens
    in ``predict``, one query row at a time.
    """

    def __init__(self, n_neighbors=5):
        self.k = n_neighbors
        self.X = None
        self.y = None

    def fit(self, X, y):
        """Store the training samples and labels; return ``self``."""
        self.X = np.asarray(X, dtype=float)
        self.y = np.asarray(y)
        return self

    def predict(self, X):
        """Return the majority label among the ``k`` nearest training samples.

        Raises:
            ValueError: If ``k`` is smaller than 1 or larger than the number
                of stored training samples.
        """
        X = np.asarray(X, dtype=float)
        n_train = self.X.shape[0]
        if not 1 <= self.k <= n_train:
            raise ValueError(
                f'n_neighbors must be between 1 and the number of training samples '
                f'({n_train}), got {self.k}'
            )
        # argpartition needs kth < n_train. Passing k itself raised when
        # k == n_train, which is a valid "use every sample" request. For
        # k < n_train the pivot is unchanged.
        kth = min(self.k, n_train - 1)
        preds = []
        for x in X:
            dists = np.sqrt(((self.X - x) ** 2).sum(axis=1))
            idx = np.argpartition(dists, kth)[:self.k]
            votes = self.y[idx]
            preds.append(Counter(votes).most_common(1)[0][0])
        return np.array(preds)

## 9. Метрики для реализаций

In [None]:

# --- Logistic regression written from scratch ---
lrs = LogisticRegressionScratch(lr=0.1, n_epochs=60, reg=1e-4, random_state=42)
lrs.fit(X_tr, y_tr)

# Plot the recorded training losses.
plt.figure()
plt.plot(lrs.loss_curve_)
plt.title('LogRegScratch — Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.tight_layout()
plt.show()

y_pred = lrs.predict(X_val)
y_proba = lrs.predict_proba(X_val)
print('LRS metrics:', multiclass_metrics(y_val, y_pred, y_proba, classes_sorted))


# --- Perceptron written from scratch (no probabilities, so no ROC-AUC) ---
perc = PerceptronScratch(lr=0.001, n_epochs=5).fit(X_tr, y_tr)
yp = perc.predict(X_val)
print('Perceptron metrics:', multiclass_metrics(y_val, yp, None, classes_sorted))


# --- KNN written from scratch, fitted and scored on subsets of the data ---
knn_s = KNNClassifierScratch(n_neighbors=5).fit(X_tr[:8000], y_tr[:8000])
yk = knn_s.predict(X_val[:3000])
print('KNNScratch metrics (val[:3000]):', multiclass_metrics(y_val[:3000], yk, None, classes_sorted))

**Кривая потерь и текстовые метрики.** ![Кривая потерь и текстовые метрики.](attachment:dad70cac-bcf5-4ee9-8ff7-40f76f5f038e.png)

**Сравнение скейлеров×моделей и подбор k для KNN.** ![Сравнение скейлеров×моделей и подбор k для KNN.](attachment:d05a993f-376d-4d2a-b01c-cf3737031eb9.png)

**Консоль: успешные формы матриц и лучшая связка.** ![Консоль: успешные формы матриц и лучшая связка.](attachment:ca811d97-27a9-4356-b0a8-4ae2af565912.png)