In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier


In [81]:
data = pd.read_csv("titanic.csv")


In [82]:
print(data.info())
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home_dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB
None
            pclass     survived          age        sibsp        parch  \
count  1309.000000  1309.000000  1046.000000  1309.000000  1309.000000   
mean      2.294882     0.381971    2

In [83]:
data.drop(columns=['boat', 'body', 'cabin', 'ticket'], errors='ignore', inplace=True)
# imię zostawiam

In [84]:
data['sex'] = LabelEncoder().fit_transform(data['sex'])
data['embarked'] = LabelEncoder().fit_transform(data['embarked'])
data['pclass'] = data['pclass'].astype('category')
data['embarked'] = data['embarked'].astype('category')
data['sex'] = data['sex'].astype('category')
# Praca nad wartosciami

In [85]:
imp = IterativeImputer()

data['embarked'] = imp.fit_transform(data[['embarked']])
data['pclass'] = data['pclass'].astype(float)
# Wypelnianie

In [86]:
data['title'] = data['name'].str.extract(r' ([A-Za-z]+)\.', expand=False) #Wyciąganie tutuły z imienia
data['age'] = data['age'].fillna(data.groupby(['title', 'sex'], observed=False)['age'].transform('mean'))
# Wypełnianie wieku

In [87]:
data['family_size'] = data['sibsp'] + data['parch'] + 1
data['age_range'] = pd.cut(data['age'], bins=[0, 6, 12, 18, 67], labels=['Bobas', 'Dzieciak', 'Nastolatek', 'Dorosly'])
data['age_range'] = data['age_range'].cat.codes

data['mpc'] = data['age'] * data['pclass']

# Dodatkowe wartosci

In [88]:
data.dropna(inplace=True)
# Na wszelki wypadek

In [89]:

scaler = StandardScaler()
data[['age', 'fare', 'family_size', 'mpc']] = scaler.fit_transform(data[['age', 'fare', 'family_size', 'mpc']])
# Normalizacja

In [99]:
X = data[['sex', 'age', 'age_range', 'pclass', 'fare', 'family_size', 'mpc', 'parch']]
y = data['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [91]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("RandomForestClassifier")
print("Accuracy:", accuracy_score(y_test, y_pred))

RandomForestClassifier
Accuracy: 0.7651006711409396


In [92]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test, y_pred))

Logistic Regression
Accuracy: 0.7516778523489933


In [93]:
model = GaussianNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Naive Bayes")
print("Accuracy:", accuracy_score(y_test, y_pred))

Naive Bayes
Accuracy: 0.7248322147651006


In [94]:
model = Perceptron(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Perceptron")
print("Accuracy:", accuracy_score(y_test, y_pred))

Perceptron
Accuracy: 0.4966442953020134


In [95]:
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("KNN")
print("Accuracy:", accuracy_score(y_test, y_pred))

KNN
Accuracy: 0.6845637583892618


In [100]:
model = GradientBoostingClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Gradient Boosting")
print("Accuracy:", accuracy_score(y_test, y_pred))
# Ten osiąga 80%

Gradient Boosting
Accuracy: 0.8120805369127517
