In [10]:
import pandas as pd

# Load the Titanic dataset from the Stanford CS109 course archive.
url = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"
data = pd.read_csv(url)

# Show the first rows as a quick sanity check of the loaded frame.
preview = data.head()
print(preview)

   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   
3         1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4         0       3                            Mr. William Henry Allen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                        0   7.9250  
3  female  35.0                        1                        0  53.1000  
4    male  35.0                        0                        0   8.0500  


In [11]:
# Fill missing values: the mode for categorical columns, the median otherwise.
for col in data.columns:
    column = data[col]

    # Columns without gaps need no imputation; skipping them also avoids
    # computing a mode/median that would never be used.
    if not column.isna().any():
        continue

    # Treat object, string and category dtypes as categorical. Checking only
    # `dtype == "object"` would send string/category columns to `median()`,
    # which raises TypeError for them.
    is_categorical = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
        or isinstance(column.dtype, pd.CategoricalDtype)
    )

    if is_categorical:
        modes = column.mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to fill with, so
            # leave it for the null check below to report.
            continue
        fill_value = modes.iloc[0]
    else:
        fill_value = column.median()

    data[col] = column.fillna(fill_value)

print(data.isnull().sum())  # Check for any remaining missing values

Survived                   0
Pclass                     0
Name                       0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64


In [12]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every object-dtype (categorical) column with OneHotEncoder.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back to the old
# one on older releases. Either way the encoder returns a dense array.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only accepts `sparse`
    encoder = OneHotEncoder(sparse=False)

categorical_columns = data.select_dtypes(include=['object']).columns
# NOTE(review): `Name` is an object column as well, so it is expanded into one
# indicator column per distinct name — consider dropping it before encoding;
# confirm the intent.
data_encoded = pd.DataFrame(
    encoder.fit_transform(data[categorical_columns]),
    columns=encoder.get_feature_names_out(categorical_columns),
    index=data.index,  # keep rows aligned with `data` for the concat below
)
data = data.drop(categorical_columns, axis=1)
data = pd.concat([data, data_encoded], axis=1)

print(data.head())

   Survived  Pclass   Age  Siblings/Spouses Aboard  Parents/Children Aboard  \
0         0       3  22.0                        1                        0   
1         1       1  38.0                        1                        0   
2         1       3  26.0                        0                        0   
3         1       1  35.0                        1                        0   
4         0       3  35.0                        0                        0   

      Fare  Name_Capt. Edward Gifford Crosby  Name_Col. John Weir  \
0   7.2500                               0.0                  0.0   
1  71.2833                               0.0                  0.0   
2   7.9250                               0.0                  0.0   
3  53.1000                               0.0                  0.0   
4   8.0500                               0.0                  0.0   

   Name_Col. Oberst Alfons Simonius-Blumer  Name_Don. Manuel E Uruchurtu  ...  \
0                            



In [13]:
from sklearn.preprocessing import MinMaxScaler

# Normalize the numeric features: every int64/float64 column is passed
# through a MinMaxScaler and written back in place.
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data[numeric_columns])
data[numeric_columns] = scaled_values

print(data.head())

   Survived  Pclass       Age  Siblings/Spouses Aboard  \
0       0.0     1.0  0.271174                    0.125   
1       1.0     0.0  0.472229                    0.125   
2       1.0     1.0  0.321438                    0.000   
3       1.0     0.0  0.434531                    0.125   
4       0.0     1.0  0.434531                    0.000   

   Parents/Children Aboard      Fare  Name_Capt. Edward Gifford Crosby  \
0                      0.0  0.014151                               0.0   
1                      0.0  0.139136                               0.0   
2                      0.0  0.015469                               0.0   
3                      0.0  0.103644                               0.0   
4                      0.0  0.015713                               0.0   

   Name_Col. John Weir  Name_Col. Oberst Alfons Simonius-Blumer  \
0                  0.0                                      0.0   
1                  0.0                                      0.0   
2    