In [1]:
import pandas as pd
import numpy as np
import zipfile as zf

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pickle

from sklearn.model_selection import train_test_split, PredefinedSplit
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN

from sklearn.metrics import f1_score

# import torch
# from ultralytics import YOLO

from tqdm.notebook import tqdm
import seaborn as sns 
import matplotlib.pyplot as plt

In [2]:
df_train = pd.read_csv(r'C:\Users\ffedo\Desktop\data science\data\1_rucode\disease\disease_train.csv')
df_train

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,Y
0,2.0,2.0,29.4,84.0,2.0,75.0,5.78,0
1,1.0,2.0,33.5,101.0,2.0,64.0,8.74,0
2,1.0,2.0,29.2,101.0,2.0,80.0,9.72,0
3,1.0,2.0,28.7,121.0,2.0,146.0,3.83,0
4,1.0,1.0,25.6,87.0,2.0,82.0,13.90,0
...,...,...,...,...,...,...,...,...
1590,1.0,2.0,40.5,99.0,2.0,97.0,18.17,0
1591,2.0,2.0,20.1,92.0,2.0,81.0,7.77,0
1592,1.0,1.0,31.9,90.0,2.0,89.0,37.67,0
1593,1.0,2.0,29.8,100.0,2.0,69.0,11.11,0


In [3]:
# lmean = []
# lsum = []
# lmin = []
# lmax = []
# lmed = []

# for i in tqdm(range(len(df_train))):
#     lmean.append(df_train.loc[i][:7].mean())
#     lsum.append(df_train.loc[i][:7].sum())
#     lmin.append(df_train.loc[i][:7].min())
#     lmax.append(df_train.loc[i][:7].max())
#     lmed.append(df_train.loc[i][:7].median())

# df_train['mean'] = lmean
# df_train['sum'] = lsum
# df_train['min'] = lmin
# df_train['max'] = lmax
# df_train['med'] = lmed

# df_train

In [4]:
X = df_train.drop(['Y'], axis=1) # выбор признаков
X

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7
0,2.0,2.0,29.4,84.0,2.0,75.0,5.78
1,1.0,2.0,33.5,101.0,2.0,64.0,8.74
2,1.0,2.0,29.2,101.0,2.0,80.0,9.72
3,1.0,2.0,28.7,121.0,2.0,146.0,3.83
4,1.0,1.0,25.6,87.0,2.0,82.0,13.90
...,...,...,...,...,...,...,...
1590,1.0,2.0,40.5,99.0,2.0,97.0,18.17
1591,2.0,2.0,20.1,92.0,2.0,81.0,7.77
1592,1.0,1.0,31.9,90.0,2.0,89.0,37.67
1593,1.0,2.0,29.8,100.0,2.0,69.0,11.11


In [5]:
y = df_train['Y'] # выбор целевого признака
y

0       0
1       0
2       0
3       0
4       0
       ..
1590    0
1591    0
1592    0
1593    0
1594    0
Name: Y, Length: 1595, dtype: int64

In [6]:
# smote = SMOTE()
# X_resampled, y_resampled = smote.fit_resample(X, y)
# X_resampled

In [7]:
# ros = RandomOverSampler()
# X_resampled, y_resampled = ros.fit_resample(X, y)
# X_resampled

In [20]:
# adasyn = ADASYN()
# X_resampled, y_resampled = adasyn.fit_resample(X, y)
# X_resampled

In [9]:
# X,y = X_resampled, y_resampled

In [10]:
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights

array([1.01755041, 0.98304473])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)

In [12]:
ps = PredefinedSplit(test_fold=[-1 if i in X_train.index else 0 for i in X.index])
ps

PredefinedSplit(test_fold=array([ 0, -1, ..., -1, -1]))

In [13]:
rfc_rs = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid={
        'n_estimators': [20, 50, 100, 200],
        'max_depth': [7, 8, 10, 20, None],
        'max_features': ['sqrt', 'log2', None],
        'criterion': ['gini', 'entropy', 'log_loss'],
        'min_samples_split': [1, 2, 5, 10],
        'n_jobs': [-1],
        'class_weight': [dict(enumerate(class_weights))]
    },
    scoring='f1',
    verbose=52,
    cv=ps
)
rfc_rs.fit(X, y)

Fitting 1 folds for each of 720 candidates, totalling 720 fits
[CV 1/1; 1/720] START class_weight={0: 1.0175504107542943, 1: 0.983044733044733}, criterion=gini, max_depth=7, max_features=sqrt, min_samples_split=1, n_estimators=20, n_jobs=-1
[CV 1/1; 1/720] END class_weight={0: 1.0175504107542943, 1: 0.983044733044733}, criterion=gini, max_depth=7, max_features=sqrt, min_samples_split=1, n_estimators=20, n_jobs=-1;, score=nan total time=   0.0s
[CV 1/1; 2/720] START class_weight={0: 1.0175504107542943, 1: 0.983044733044733}, criterion=gini, max_depth=7, max_features=sqrt, min_samples_split=1, n_estimators=50, n_jobs=-1
[CV 1/1; 2/720] END class_weight={0: 1.0175504107542943, 1: 0.983044733044733}, criterion=gini, max_depth=7, max_features=sqrt, min_samples_split=1, n_estimators=50, n_jobs=-1;, score=nan total time=   0.0s
[CV 1/1; 3/720] START class_weight={0: 1.0175504107542943, 1: 0.983044733044733}, criterion=gini, max_depth=7, max_features=sqrt, min_samples_split=1, n_estimators=100

180 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\L

In [14]:
f1_score(y_test, rfc_rs.predict(X_test), average='macro')

0.9963362097888959

In [15]:
df_test = pd.read_csv(r'C:\Users\ffedo\Desktop\data science\data\1_rucode\disease\disease_public_test.csv')
df_test

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7
0,1.0,2.0,27.1,102.0,2.0,130.0,12.08
1,2.0,2.0,26.1,84.0,2.0,77.0,3.62
2,1.0,2.0,20.5,100.0,2.0,100.0,13.18
3,1.0,2.0,33.5,97.0,2.0,105.0,13.61
4,1.0,2.0,24.4,106.0,2.0,104.0,8.18
...,...,...,...,...,...,...,...
337,1.0,2.0,27.0,93.0,2.0,85.0,8.23
338,2.0,2.0,23.0,96.0,2.0,129.0,6.80
339,1.0,1.0,28.2,92.0,2.0,74.0,6.29
340,2.0,2.0,30.4,109.0,2.0,124.0,8.83


In [16]:
# lmean = []
# lsum = []
# lmin = []
# lmax = []
# lmed = []

# for i in tqdm(range(len(df_test))):
#     lmean.append(df_test.loc[i][:7].mean())
#     lsum.append(df_test.loc[i][:7].sum())
#     lmin.append(df_test.loc[i][:7].min())
#     lmax.append(df_test.loc[i][:7].max())
#     lmed.append(df_test.loc[i][:7].median())

# df_test['mean'] = lmean
# df_test['sum'] = lsum
# df_test['min'] = lmin
# df_test['max'] = lmax
# df_test['med'] = lmed

# df_test

In [17]:
preds = rfc_rs.predict(df_test)
preds

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,

In [18]:
df_test['Y'] = preds
#df_test =df_test.drop(['mean','min','sum','max','med'], axis = 1)
df_test

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,Y
0,1.0,2.0,27.1,102.0,2.0,130.0,12.08,0
1,2.0,2.0,26.1,84.0,2.0,77.0,3.62,1
2,1.0,2.0,20.5,100.0,2.0,100.0,13.18,0
3,1.0,2.0,33.5,97.0,2.0,105.0,13.61,0
4,1.0,2.0,24.4,106.0,2.0,104.0,8.18,1
...,...,...,...,...,...,...,...,...
337,1.0,2.0,27.0,93.0,2.0,85.0,8.23,0
338,2.0,2.0,23.0,96.0,2.0,129.0,6.80,0
339,1.0,1.0,28.2,92.0,2.0,74.0,6.29,0
340,2.0,2.0,30.4,109.0,2.0,124.0,8.83,1


In [19]:
df_test.to_csv(r'C:\Users\ffedo\Desktop\data science\data\sub3.csv')