In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [54]:
df = pd.read_csv('heart.csv', sep=',')
print(df.head(5), "\n")
print(df.info(), "\n")

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0   

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  

In [55]:
# отбор числовых колонок (Числовые признаки)

df_numeric = df.select_dtypes(include=[np.number])
numeric_cols = df_numeric.columns.values
print(numeric_cols)
print(df_numeric)

['age' 'sex' 'cp' 'trestbps' 'chol' 'fbs' 'restecg' 'thalach' 'exang'
 'oldpeak' 'slope' 'ca' 'thal' 'target']
      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0      52    1   0       125   212    0        1      168      0      1.0   
1      53    1   0       140   203    1        0      155      1      3.1   
2      70    1   0       145   174    0        1      125      1      2.6   
3      61    1   0       148   203    0        1      161      0      0.0   
4      62    0   0       138   294    1        1      106      0      1.9   
...   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
1020   59    1   1       140   221    0        1      164      1      0.0   
1021   60    1   0       125   258    0        0      141      1      2.8   
1022   47    1   0       110   275    0        0      118      1      1.0   
1023   50    0   0       110   254    0        0      159      0      0.0   
1024   54    1   0       120   188    0   

In [56]:
# отбор нечисловых колонок (Категориальные признаки)

df_non_numeric = df.select_dtypes(exclude=[np.number])
non_numeric_cols = df_non_numeric.columns.values
print(non_numeric_cols)
print(df_non_numeric)

[]
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]

[1025 rows x 0 columns]


In [57]:
df["ca"].value_counts().sort_index()

ca
0    578
1    226
2    134
3     69
4     18
Name: count, dtype: int64

In [58]:
# Кодирование качественных признаков

# OrdinalEncoder

from sklearn.compose import make_column_selector as selector

categorical_columns_selector = selector(dtype_include=int)
categorical_columns = categorical_columns_selector(df)
categorical_columns

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'slope',
 'ca',
 'thal',
 'target']

In [59]:
df_cat = df[categorical_columns] #отберем только те столбцы в которых присутствует категориальная перемнная
df_cat.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,0,0,3,0
3,61,1,0,148,203,0,1,161,0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1,3,2,0


In [60]:
from sklearn.preprocessing import OrdinalEncoder

education_column = df_cat['ca']
print(df_cat['ca'].unique()) #посмотрим какие виды образования присутствуют в выборке
encoder = OrdinalEncoder() # вызываем кодировщик
encoder = encoder.fit(df_cat[['ca']])
df_cat['ca'] = encoder.transform(df_cat[['ca']])
df_cat

[2 0 1 3 4]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat['ca'] = encoder.transform(df_cat[['ca']])


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,2,2.0,3,0
1,53,1,0,140,203,1,0,155,1,0,0.0,3,0
2,70,1,0,145,174,0,1,125,1,0,0.0,3,0
3,61,1,0,148,203,0,1,161,0,2,1.0,3,0
4,62,0,0,138,294,1,1,106,0,1,3.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,2,0.0,2,1
1021,60,1,0,125,258,0,0,141,1,1,1.0,3,0
1022,47,1,0,110,275,0,0,118,1,1,1.0,2,0
1023,50,0,0,110,254,0,0,159,0,2,0.0,2,1


In [61]:
print(encoder.categories_)
print(len(encoder.categories_[0]))
print(df_cat['ca'].unique())
df_cat['ca'].unique().size

[array([0, 1, 2, 3, 4], dtype=int64)]
5
[2. 0. 1. 3. 4.]


5

In [62]:
# OneHotEncoder

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
encoder = encoder.fit(df_cat[['ca']])
education_encoded = encoder.transform(df_cat[['ca']])
education_encoded



array([[0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [63]:
feature_names = encoder.get_feature_names_out(input_features=["ca"]) 
# получаем виды образования и на их основе создаем столбцы
education_encoded = pd.DataFrame(education_encoded, columns=feature_names)
education_encoded

Unnamed: 0,ca_0.0,ca_1.0,ca_2.0,ca_3.0,ca_4.0
0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...
1020,1.0,0.0,0.0,0.0,0.0
1021,0.0,1.0,0.0,0.0,0.0
1022,0.0,1.0,0.0,0.0,0.0
1023,1.0,0.0,0.0,0.0,0.0


In [64]:
df_cat_new=df_cat.join(education_encoded)
df_cat_new=df_cat_new.drop(df_cat_new[['ca']],axis=1)
df_cat_new

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,slope,thal,target,ca_0.0,ca_1.0,ca_2.0,ca_3.0,ca_4.0
0,52,1,0,125,212,0,1,168,0,2,3,0,0.0,0.0,1.0,0.0,0.0
1,53,1,0,140,203,1,0,155,1,0,3,0,1.0,0.0,0.0,0.0,0.0
2,70,1,0,145,174,0,1,125,1,0,3,0,1.0,0.0,0.0,0.0,0.0
3,61,1,0,148,203,0,1,161,0,2,3,0,0.0,1.0,0.0,0.0,0.0
4,62,0,0,138,294,1,1,106,0,1,2,0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,2,2,1,1.0,0.0,0.0,0.0,0.0
1021,60,1,0,125,258,0,0,141,1,1,3,0,0.0,1.0,0.0,0.0,0.0
1022,47,1,0,110,275,0,0,118,1,1,2,0,0.0,1.0,0.0,0.0,0.0
1023,50,0,0,110,254,0,0,159,0,2,2,1,1.0,0.0,0.0,0.0,0.0


In [65]:
# Label Encoding

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder() # вызываем кодировщик
encoder = encoder.fit(df_cat[['ca']])
df_cat['ca'] = encoder.transform(df_cat[['ca']])
df_cat

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat['ca'] = encoder.transform(df_cat[['ca']])


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,0,0,3,0
3,61,1,0,148,203,0,1,161,0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,2,0,2,1


In [66]:
print(encoder.classes_)
print(len(encoder.classes_))
print(df_cat['ca'].unique())
df_cat['ca'].unique().size

[0. 1. 2. 3. 4.]
5
[2 0 1 3 4]


5

In [67]:
# Масштабирование признаков

from sklearn.preprocessing import StandardScaler

X_train = np.array([[ 1., -1., 2.],
                     [ 2., 0., 0.],
                     [ 0., 1., -1.]])

scaler = StandardScaler()
scaler.fit(X_train)
print('Математическое ожидание')
print(scaler.mean_)
print('Дисперсия')
print(scaler.var_)
print('Преобразованный набор')
print(scaler.transform(X_train))

Математическое ожидание
[1.         0.         0.33333333]
Дисперсия
[0.66666667 0.66666667 1.55555556]
Преобразованный набор
[[ 0.         -1.22474487  1.33630621]
 [ 1.22474487  0.         -0.26726124]
 [-1.22474487  1.22474487 -1.06904497]]


In [68]:
from sklearn.preprocessing import MinMaxScaler

X_train = np.array([[ 1., -1., 2.],
                     [ 2., 0., 0.],
                    [ 0., 1., -1.]])

min_max_scaler = MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_train_minmax

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [69]:
# MaxAbsScaler

from sklearn.preprocessing import MaxAbsScaler

X_train = np.array([[ 1., -1., 2.],
                     [ 2., 0., 0.],
                     [ 0., 1., -1.]])

max_abs_scaler = MaxAbsScaler()
X_train_maxabs = max_abs_scaler.fit_transform(X_train)
X_train_maxabs

array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])

In [70]:
# Нормализация

from sklearn import preprocessing

X = [[ 1., -1., 2.],
    [ 2., 0., 0.],
    [ 0., 1., -1.]]

X_normalized = preprocessing.normalize(X, norm='l2')
X_normalized

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])