In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from scipy.stats import mode

In [2]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the Titanic training CSV on the mounted Drive.
path = '/content/drive/MyDrive/Colab/datasets/Titanic-Dataset.csv'
df = pd.read_csv(path)

# Clean-up in a single pass:
#   Age      -> missing values replaced by the column median
#   Embarked -> missing values replaced by 'S'
#   Sex      -> encoded as male = 0, female = 1
df = df.assign(
    Age=df['Age'].fillna(df['Age'].median()),
    Embarked=df['Embarked'].fillna('S'),
    Sex=df['Sex'].map({'male': 0, 'female': 1}),
)

# Feature matrix (Age, Sex, Pclass) and target vector (Survived) as NumPy arrays.
X = df[['Age', 'Sex', 'Pclass']].values
y = df['Survived'].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [4]:
# Fit five depth-3 decision trees on the training split; the trees differ only
# in their random_state (0..4).
models = []
for seed in range(5):
    tree = DecisionTreeClassifier(max_depth=3, random_state=seed)
    models.append(tree.fit(X_train, y_train))

In [5]:
# One row of hold-out predictions per fitted tree.
per_model_predictions = [clf.predict(X_test) for clf in models]
predictions = np.array(per_model_predictions)

In [6]:
def _majority_label(votes):
    """Return the most frequent non-negative integer label in a 1-D vote vector."""
    return np.bincount(votes).argmax()


# Majority vote across models (axis 0) for every test sample.
final_predict = np.apply_along_axis(_majority_label, 0, predictions)

In [7]:
# Accuracy of the majority-vote predictions on the hold-out split.
accuracy_score(y_test, final_predict)

0.8044692737430168

In [8]:
# Random Forest built by hand: each tree is trained on a bootstrap sample of the
# rows and on a random subset of the feature columns.
models = []
max_depth = 3
n_estimators = 10
n_features = X_train.shape[1]
# sqrt(n_features) columns per tree, but never fewer than one.
# With the 3 features selected above this evaluates to 1, i.e. every tree is
# trained on a single column.
features_size = max(1, int(np.sqrt(n_features)))
features_lst = []
# Seeded generator for the feature subsets so they do not depend on the global
# np.random state and the forest is reproducible across runs.
feature_rng = np.random.default_rng(42)

for i in range(n_estimators):
  # Bootstrap sample of the training rows, seeded per tree.
  x_boot, y_boot = resample(X_train, y_train, random_state = i)
  # Column indices this tree is allowed to see.
  features = feature_rng.choice(n_features, size = features_size, replace = False)
  x_boot_sub = x_boot[:, features]

  model = DecisionTreeClassifier(max_depth=max_depth, random_state = i).fit(x_boot_sub, y_boot)
  models.append(model)
  # Keep the indices: the same columns must be selected again at prediction time.
  features_lst.append(features)


In [9]:
# Each tree predicts from the same feature columns it was trained on.
preds = np.array([
    clf.predict(X_test[:, cols])
    for clf, cols in zip(models, features_lst)
])

In [10]:
# Sanity check: one row per tree, one column per test sample.
preds.shape

(10, 179)

In [11]:
# Majority vote across trees (axis 0), computed once.
# np.ravel flattens the result so it is 1-D whether or not the installed SciPy
# keeps the reduced axis in the returned mode array.
rf_predict = np.ravel(mode(preds, axis = 0).mode)
rf_predict.shape

(179,)

In [12]:
# Accuracy of the hand-built forest's majority vote on the hold-out split.
accuracy_score(y_test, rf_predict)

0.7374301675977654

In [22]:
# Load the separate test.csv from Drive; it is used below to produce the final predictions.
df_test = pd.read_csv('/content/drive/MyDrive/Colab/datasets/test.csv')

In [23]:
# Preview the first rows of the test set.
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [17]:
# Column dtypes and non-null counts before preprocessing (Age, Fare and Cabin have gaps).
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [29]:
# Show the rows whose Sex is the string 'female' (Sex has not been encoded yet at this point).
df_test.loc[df_test['Sex'].eq('female')]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
12,904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23.0,1,0,21228,82.2667,B45,S
...,...,...,...,...,...,...,...,...,...,...,...
409,1301,3,"Peacock, Miss. Treasteall",female,3.0,1,1,SOTON/O.Q. 3101315,13.7750,,S
410,1302,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.7500,,Q
411,1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37.0,1,0,19928,90.0000,C78,Q
412,1304,3,"Henriksson, Miss. Jenny Lovisa",female,28.0,0,0,347086,7.7750,,S


In [30]:
# Apply the training preprocessing to the test set.
# Missing ages are filled with the median of the TRAINING Age column, so that
# inference uses the same imputation value the model was fitted with instead of
# a statistic computed from the test set.
df_test['Age'] = df_test['Age'].fillna(df['Age'].median())
# Encode Sex only while it still holds strings: running .map again on the
# already-encoded 0/1 column would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df_test['Sex']):
    df_test['Sex'] = df_test['Sex'].map({'male': 0, 'female': 1})
# NOTE: this rebinds X_test, replacing the hold-out split created by
# train_test_split; y_test no longer corresponds to X_test after this cell.
X_test = df_test[['Age', 'Sex', 'Pclass']].values

In [31]:
# Confirm the preprocessing: Age has no missing values left and Sex is numeric.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    int64  
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 36.1+ KB


In [15]:
# scikit-learn random forest with 100 trees; random_state is fixed so that
# re-running the notebook yields the same fitted model and the same predictions.
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [16]:
# Fit the forest on the training split.
model.fit(X_train, y_train)

In [48]:
# Predict labels for X_test.
# NOTE(review): at this point X_test is expected to be the feature matrix built
# from df_test, not the hold-out split — confirm cell execution order.
rf_pred = model.predict(X_test)

In [49]:
# Pair every passenger id with its predicted label: two columns, one row per passenger.
passenger_ids = df_test['PassengerId'].values
pred_data = np.stack([passenger_ids, rf_pred], axis=1)

In [54]:
# Wrap the (id, prediction) pairs in a DataFrame with labelled columns.
submission_columns = ['PassengerId', 'Survived']
df_pred = pd.DataFrame(data=pred_data, columns=submission_columns)


In [61]:
# Write the predictions to Drive; index=False keeps the row index out of the file.
df_pred.to_csv('/content/drive/MyDrive/Colab/titanic_prediction.csv', index = False)

In [59]:
# Read the predictions back from the same Drive path they were written to.
# A bare 'titanic_prediction.csv' resolves against the working directory, where
# the file is not written, so it fails (or loads a stale copy) on a fresh run.
file = pd.read_csv('/content/drive/MyDrive/Colab/titanic_prediction.csv')

In [60]:
# Display the reloaded predictions to verify the saved file's contents.
file

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,1
...,...,...
413,1305,1
414,1306,1
415,1307,0
416,1308,1
