In [177]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_regression

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Загрузка данных

## Чтение

In [178]:
# Read the comma-separated passenger data from the working directory.
df = pd.read_csv('train.csv', sep=',')

In [179]:
# Dimensions of the raw frame as (rows, columns).
print(df.shape)

# Per-column dtypes and non-null counts, to see which columns have gaps.
df.info()

(891, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [180]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Подготовка

In [181]:
# Keep only the target and the two features used below, drop rows with a
# missing value in any of them, and renumber the index from zero.
df = (
    df[['Survived', 'Sex', 'Age']]
    .dropna()
    .reset_index(drop=True)
)
df.shape

(714, 3)

In [182]:
# Standardize Age and store the result in a new Age_scaled column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so test rows contribute to the fitted
# statistics. Consider fitting on the training part only.
scaler = StandardScaler()
arr = scaler.fit_transform(df['Age'].values.reshape(-1, 1))  # column vector in, column vector out
df['Age_scaled'] = arr
print(arr.shape)

(714, 1)


In [183]:
# Encode the Sex strings as integer codes in a new Sex_Cat column,
# then show the frame's shape together with its first rows.
labelencoder = LabelEncoder()
labelencoder.fit(df['Sex'])
df['Sex_Cat'] = labelencoder.transform(df['Sex'])
(df.shape, df.head())

((714, 5),
    Survived     Sex   Age  Age_scaled  Sex_Cat
 0         0    male  22.0   -0.530377        1
 1         1  female  38.0    0.571831        0
 2         1  female  26.0   -0.254825        0
 3         1  female  35.0    0.365167        0
 4         0    male  35.0    0.365167        1)

In [184]:
# One-hot encode the integer Sex_Cat codes into a separate frame whose
# columns are positional (0, 1, ...).
enc = OneHotEncoder(handle_unknown='ignore')
sex_one_hot = enc.fit_transform(df[['Sex_Cat']])
enc_df = pd.DataFrame(sex_one_hot.toarray())  # densify before wrapping in a DataFrame

In [185]:
# Attach the one-hot columns to the main frame under readable names.
# Encoder output column 0 corresponds to Sex_Cat == 0 (female) and column 1 to
# Sex_Cat == 1 (male).
sex_dummies = enc_df.rename(columns={0: "female", 1: "male"})
# Drop any 'female'/'male' columns left over from a previous run of this cell,
# so that re-running it does not silently create duplicate columns.
df = df.drop(columns=list(sex_dummies.columns), errors='ignore').join(sex_dummies)
df

Unnamed: 0,Survived,Sex,Age,Age_scaled,Sex_Cat,female,male
0,0,male,22.0,-0.530377,1,0.0,1.0
1,1,female,38.0,0.571831,0,1.0,0.0
2,1,female,26.0,-0.254825,0,1.0,0.0
3,1,female,35.0,0.365167,0,1.0,0.0
4,0,male,35.0,0.365167,1,0.0,1.0
...,...,...,...,...,...,...,...
709,0,female,39.0,0.640719,0,1.0,0.0
710,0,male,27.0,-0.185937,1,0.0,1.0
711,1,female,19.0,-0.737041,0,1.0,0.0
712,1,male,26.0,-0.254825,1,0.0,1.0


In [186]:
# Feature matrix (both one-hot sex columns plus standardized age) and target.
feature_columns = ['male', 'female', 'Age_scaled']
x = df.loc[:, feature_columns]
y = df.loc[:, 'Survived']
(x.shape, y.shape)

((714, 3), (714,))

In [187]:
# Hold out a test part; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=11,
)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((535, 3), (179, 3), (535,), (179,))

# Обучение моделей

## KNN

In [188]:
# Baseline KNN with default hyper-parameters.
# NOTE(review): this is a regressor fitted on the 0/1 Survived target, so its
# predictions are fractions rather than class labels. Confirm this is intended.
knn = KNeighborsRegressor()
knn.fit(X=x_train, y=y_train)

In [189]:
# Report MSE / MAE / R2 of the baseline KNN model on the train and test parts.
# The loop variables are deliberately not named `x`/`y`: those names hold the
# full feature matrix and target built earlier, and reusing them here would
# leave them pointing at the test part once the loop finishes.
for features, target, label in zip([x_train, x_test], [y_train, y_test], ['train', 'test']):
  pred = knn.predict(features)
  print(f'MSE {label}={mean_squared_error(target, pred):.2f}')
  print(f'MAE {label}={mean_absolute_error(target, pred):.2f}')
  print(f'R2 {label}={r2_score(target, pred):.2f}')

MSE train=0.13
MAE train=0.25
R2 train=0.45
MSE test=0.21
MAE test=0.33
R2 test=0.12


## Логистическая регрессия

In [190]:
# Baseline logistic regression with default hyper-parameters and a fixed seed.
clf = LogisticRegression(random_state=0)
clf.fit(X=x_train, y=y_train)

In [191]:
# Report MSE / MAE / R2 of the baseline logistic regression on both parts.
# The loop variables are deliberately not named `x`/`y`: those names hold the
# full feature matrix and target built earlier, and reusing them here would
# leave them pointing at the test part once the loop finishes.
for features, target, label in zip([x_train, x_test], [y_train, y_test], ['train', 'test']):
  pred = clf.predict(features)
  print(f'MSE {label}={mean_squared_error(target, pred):.2f}')
  print(f'MAE {label}={mean_absolute_error(target, pred):.2f}')
  print(f'R2 {label}={r2_score(target, pred):.2f}')

MSE train=0.20
MAE train=0.20
R2 train=0.17
MSE test=0.28
MAE test=0.28
R2 test=-0.16


# Подбор параметров RandomizedSearchCV

## KNN

In [192]:
# Search space for KNN: neighbourhood size, vote weighting and Minkowski power.
params = {
    'n_neighbors': range(1, 40),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

In [193]:
# Randomized hyper-parameter search for KNN with 5-fold cross-validation.
# random_state pins which parameter combinations get sampled, so the selected
# best_params_ is the same on every Restart & Run All.
knn = KNeighborsRegressor()
cv = RandomizedSearchCV(knn, params, n_jobs=-1, cv=5, random_state=0)
cv.fit(x_train, y_train)

In [194]:
# Keep the winning parameter combination from the search and display it.
best_params = cv.best_params_
best_params

{'weights': 'uniform', 'p': 2, 'n_neighbors': 16}

In [195]:
# Rebuild KNN with the parameters chosen by the randomized search and fit it
# on the whole training part.
knn = KNeighborsRegressor(**best_params)
knn.fit(X=x_train, y=y_train)

In [196]:
# Report MSE / MAE / R2 of the KNN model tuned by the randomized search.
# The loop variables are deliberately not named `x`/`y`: those names hold the
# full feature matrix and target built earlier, and reusing them here would
# leave them pointing at the test part once the loop finishes.
for features, target, label in zip([x_train, x_test], [y_train, y_test], ['train', 'test']):
  pred = knn.predict(features)
  print(f'MSE {label}={mean_squared_error(target, pred):.2f}')
  print(f'MAE {label}={mean_absolute_error(target, pred):.2f}')
  print(f'R2 {label}={r2_score(target, pred):.2f}')

MSE train=0.14
MAE train=0.28
R2 train=0.42
MSE test=0.19
MAE test=0.34
R2 test=0.20


### Вывод
По MSE и R2 модель стала точнее на тестовой выборке и менее точной на тренировочной, потому что стала менее переобученной.

## Логистическая регрессия

In [197]:
# Search space for logistic regression: solver tolerance and iteration cap.
# NOTE(review): both settings are stopping criteria of the optimizer; consider
# also searching over the regularization strength `C`.
params = {
    'tol': np.arange(1e-5, 1e-3, 1e-4),
    'max_iter': range(50, 150, 10),
}

In [198]:
# Randomized hyper-parameter search for logistic regression with 5-fold CV.
# random_state on the search pins which parameter combinations get sampled, so
# the selected best_params_ is the same on every Restart & Run All.
clf = LogisticRegression(random_state=0)
cv = RandomizedSearchCV(clf, params, n_jobs=-1, cv=5, random_state=0)
cv.fit(x_train, y_train)

In [199]:
# Keep the winning parameter combination from the search and display it.
best_params = cv.best_params_
best_params

{'tol': 1e-05, 'max_iter': 140}

In [200]:
# Rebuild logistic regression with the parameters chosen by the randomized
# search (same seed as the baseline) and fit it on the whole training part.
clf = LogisticRegression(random_state=0, **best_params)
clf.fit(X=x_train, y=y_train)

In [201]:
# Report MSE / MAE / R2 of the logistic regression tuned by the randomized search.
# The loop variables are deliberately not named `x`/`y`: those names hold the
# full feature matrix and target built earlier, and reusing them here would
# leave them pointing at the test part once the loop finishes.
for features, target, label in zip([x_train, x_test], [y_train, y_test], ['train', 'test']):
  pred = clf.predict(features)
  print(f'MSE {label}={mean_squared_error(target, pred):.2f}')
  print(f'MAE {label}={mean_absolute_error(target, pred):.2f}')
  print(f'R2 {label}={r2_score(target, pred):.2f}')

MSE train=0.20
MAE train=0.20
R2 train=0.17
MSE test=0.28
MAE test=0.28
R2 test=-0.16


### Вывод
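Ничего не поменялось: метрики совпадают с базовой моделью. Вероятно, это потому, что `tol` и `max_iter` влияют только на критерий остановки оптимизатора, а не на саму модель; чтобы качество изменилось, стоило бы подбирать, например, силу регуляризации `C`.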

# Подбор параметров GridSearchCV

## KNN

In [202]:
# Search space for KNN: neighbourhood size, vote weighting and Minkowski power.
params = {
    'n_neighbors': range(1, 40),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

In [203]:
# Exhaustive hyper-parameter search for KNN: every combination in `params`
# is scored with 5-fold cross-validation, using all available cores.
knn = KNeighborsRegressor()
cv = GridSearchCV(estimator=knn, param_grid=params, n_jobs=-1, cv=5)
cv.fit(X=x_train, y=y_train)

In [204]:
# Keep the winning parameter combination from the search and display it.
best_params = cv.best_params_
best_params

{'n_neighbors': 24, 'p': 1, 'weights': 'uniform'}

In [205]:
# Rebuild KNN with the parameters chosen by the grid search and fit it on the
# whole training part.
knn = KNeighborsRegressor(**best_params)
knn.fit(X=x_train, y=y_train)

In [206]:
# Report MSE / MAE / R2 of the KNN model tuned by the grid search.
# The loop variables are deliberately not named `x`/`y`: those names hold the
# full feature matrix and target built earlier, and reusing them here would
# leave them pointing at the test part once the loop finishes.
for features, target, label in zip([x_train, x_test], [y_train, y_test], ['train', 'test']):
  pred = knn.predict(features)
  print(f'MSE {label}={mean_squared_error(target, pred):.2f}')
  print(f'MAE {label}={mean_absolute_error(target, pred):.2f}')
  print(f'R2 {label}={r2_score(target, pred):.2f}')

MSE train=0.15
MAE train=0.30
R2 train=0.39
MSE test=0.20
MAE test=0.35
R2 test=0.19


### Вывод
Модель также стала точнее на тестовой выборке и менее точной на тренировочной по сравнению с базовой; при этом на тестовой выборке результат чуть хуже, чем у RandomizedSearchCV (MSE 0.20 против 0.19, R2 0.19 против 0.20).

## Логистическая регрессия

In [207]:
# Search space for logistic regression: solver tolerance and iteration cap.
# NOTE(review): both settings are stopping criteria of the optimizer; consider
# also searching over the regularization strength `C`.
params = {
    'tol': np.arange(1e-5, 1e-3, 1e-4),
    'max_iter': range(50, 150, 10),
}

In [208]:
# Exhaustive hyper-parameter search for logistic regression: every combination
# in `params` is scored with 5-fold cross-validation, using all available cores.
clf = LogisticRegression(random_state=0)
cv = GridSearchCV(estimator=clf, param_grid=params, n_jobs=-1, cv=5)
cv.fit(X=x_train, y=y_train)

In [209]:
# Keep the winning parameter combination from the search and display it.
best_params = cv.best_params_
best_params

{'max_iter': 50, 'tol': 1e-05}

In [210]:
# Rebuild logistic regression with the parameters chosen by the grid search
# (same seed as the baseline) and fit it on the whole training part.
clf = LogisticRegression(random_state=0, **best_params)
clf.fit(X=x_train, y=y_train)

In [211]:
# Report MSE / MAE / R2 of the logistic regression tuned by the grid search.
# The loop variables are deliberately not named `x`/`y`: those names hold the
# full feature matrix and target built earlier, and reusing them here would
# leave them pointing at the test part once the loop finishes.
for features, target, label in zip([x_train, x_test], [y_train, y_test], ['train', 'test']):
  pred = clf.predict(features)
  print(f'MSE {label}={mean_squared_error(target, pred):.2f}')
  print(f'MAE {label}={mean_absolute_error(target, pred):.2f}')
  print(f'R2 {label}={r2_score(target, pred):.2f}')

MSE train=0.20
MAE train=0.20
R2 train=0.17
MSE test=0.28
MAE test=0.28
R2 test=-0.16


### Вывод
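Ничего не поменялось: метрики совпадают с базовой моделью. Вероятно, это потому, что `tol` и `max_iter` влияют только на критерий остановки оптимизатора, а не на саму модель; чтобы качество изменилось, стоило бы подбирать, например, силу регуляризации `C`.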