# Создание dataset для обучения

In [6]:
import pandas as pd

# Данные для задания
data = {
    'name': ['Anna', 'Bob', 'Anna', 'Diana', 'Edward', 'Bob'],
    'age': [28, 35, 28, None, 42, 35],
    'city': ['New York', 'Los Angeles', 'New York', 'Chicago', None, 'Los Angeles'],
    'income': [50000, 60000, 50000, 70000, None, 60000]
}

# Создаем DataFrame
df = pd.DataFrame(data)

df

Unnamed: 0,name,age,city,income
0,Anna,28.0,New York,50000.0
1,Bob,35.0,Los Angeles,60000.0
2,Anna,28.0,New York,50000.0
3,Diana,,Chicago,70000.0
4,Edward,42.0,,
5,Bob,35.0,Los Angeles,60000.0


# Подготовка данных для работы 

In [7]:
df.drop_duplicates(inplace=True) # Удаление дубликатов
df['age'].fillna(df['age'].mean(), inplace=True) # Заполнение пустых ячеек
df['income'].fillna(df['income'].median(), inplace=True)
df = pd.get_dummies(df, columns=['city'], drop_first=False) # One-Hot-Encoding превращение категориальных данных в числовые чтобы работыть с ними 
df.iloc[:, 3:] = df.iloc[:, 3:].astype('int') 
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].mean(), inplace=True) # Заполнение пустых ячеек
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['income'].fillna(df['income'].median(), inplace=True)
1    0
3    1
4    0
Name: city_Chicago, dtype: int32' has dtype incompatible with bool, please expl

Unnamed: 0,name,age,income,city_Chicago,city_Los Angeles,city_New York
0,Anna,28.0,50000.0,0,0,1
1,Bob,35.0,60000.0,0,1,0
3,Diana,35.0,70000.0,1,0,0
4,Edward,42.0,60000.0,0,0,0


# Нормализация 
normalized value = x - min(x) / (max(x) - min(x))

In [8]:
df

Unnamed: 0,name,age,income,city_Chicago,city_Los Angeles,city_New York
0,Anna,28.0,50000.0,0,0,1
1,Bob,35.0,60000.0,0,1,0
3,Diana,35.0,70000.0,1,0,0
4,Edward,42.0,60000.0,0,0,0


In [9]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler = MinMaxScaler()
df[['age', 'income']] = scaler.fit_transform(df[['age', 'income']])
df

Unnamed: 0,name,age,income,city_Chicago,city_Los Angeles,city_New York
0,Anna,0.0,0.0,0,0,1
1,Bob,0.5,0.5,0,1,0
3,Diana,0.5,1.0,1,0,0
4,Edward,1.0,0.5,0,0,0


# Стандартизация
standard value = x - x.mean()/std(x)

In [10]:
standard_scaler = StandardScaler()
df[['age', 'income']] = standard_scaler.fit_transform(df[['age', 'income']])
df

Unnamed: 0,name,age,income,city_Chicago,city_Los Angeles,city_New York
0,Anna,-1.414214,-1.414214,0,0,1
1,Bob,0.0,0.0,0,1,0
3,Diana,0.0,1.414214,1,0,0
4,Edward,1.414214,0.0,0,0,0
