In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
demo = pd.read_csv("data/gender_submission.csv")

In [3]:
train.shape

(891, 12)

In [4]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [6]:
train.drop('Cabin', inplace=True, axis=1)

In [7]:
test.drop('Cabin', inplace=True, axis=1)

In [8]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer()
imputer.fit(train[['Age']])
train['Age'] = imputer.transform(train[['Age']])
train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [9]:
test['Age'] = imputer.transform(test[['Age']])
test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           1
Embarked       0
dtype: int64

In [10]:
train.Embarked.fillna(train.Embarked.mode()[0], inplace = True)
train.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train.Embarked.fillna(train.Embarked.mode()[0], inplace = True)


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [11]:
imputer2 = SimpleImputer()
imputer2.fit(train[['Fare']])
test['Fare'] = imputer2.transform(test[['Fare']])
test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [12]:
train.drop(["Name", "Ticket", "PassengerId"], axis = 1, inplace = True)
test.drop(["Name", "Ticket", "PassengerId"], axis = 1, inplace = True)

In [13]:
X = train.drop('Survived', axis=1)
y = train['Survived']

In [14]:
X = pd.concat([X,pd.get_dummies(X["Embarked"], drop_first=True).astype(int), 
                   pd.get_dummies(X["Sex"], drop_first=True).astype(int)], axis = 1)
X = X.drop(["Embarked", "Sex"], axis = 1)
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Q,S,male
0,3,22.000000,1,0,7.2500,0,1,1
1,1,38.000000,1,0,71.2833,0,0,0
2,3,26.000000,0,0,7.9250,0,1,0
3,1,35.000000,1,0,53.1000,0,1,0
4,3,35.000000,0,0,8.0500,0,1,1
...,...,...,...,...,...,...,...,...
886,2,27.000000,0,0,13.0000,0,1,1
887,1,19.000000,0,0,30.0000,0,1,0
888,3,29.699118,1,2,23.4500,0,1,0
889,1,26.000000,0,0,30.0000,0,0,1


In [15]:
test = pd.concat([test,pd.get_dummies(test["Embarked"], drop_first=True).astype(int), 
                   pd.get_dummies(test["Sex"], drop_first=True).astype(int)], axis = 1)
test = test.drop(["Embarked", "Sex"], axis = 1)
test

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Q,S,male
0,3,34.500000,0,0,7.8292,1,0,1
1,3,47.000000,1,0,7.0000,0,1,0
2,2,62.000000,0,0,9.6875,1,0,1
3,3,27.000000,0,0,8.6625,0,1,1
4,3,22.000000,1,1,12.2875,0,1,0
...,...,...,...,...,...,...,...,...
413,3,29.699118,0,0,8.0500,0,1,1
414,1,39.000000,0,0,108.9000,0,0,0
415,3,38.500000,0,0,7.2500,0,1,1
416,3,29.699118,0,0,8.0500,0,1,1


In [16]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X), columns=X.columns)
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Q,S,male
0,1.0,0.271174,0.125,0.000000,0.014151,0.0,1.0,1.0
1,0.0,0.472229,0.125,0.000000,0.139136,0.0,0.0,0.0
2,1.0,0.321438,0.000,0.000000,0.015469,0.0,1.0,0.0
3,0.0,0.434531,0.125,0.000000,0.103644,0.0,1.0,0.0
4,1.0,0.434531,0.000,0.000000,0.015713,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...
886,0.5,0.334004,0.000,0.000000,0.025374,0.0,1.0,1.0
887,0.0,0.233476,0.000,0.000000,0.058556,0.0,1.0,0.0
888,1.0,0.367921,0.125,0.333333,0.045771,0.0,1.0,0.0
889,0.0,0.321438,0.000,0.000000,0.058556,0.0,0.0,1.0


In [17]:
test = pd.DataFrame(scaler.transform(test), columns=test.columns)
test

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Q,S,male
0,1.0,0.428248,0.000,0.000000,0.015282,1.0,0.0,1.0
1,1.0,0.585323,0.125,0.000000,0.013663,0.0,1.0,0.0
2,0.5,0.773813,0.000,0.000000,0.018909,1.0,0.0,1.0
3,1.0,0.334004,0.000,0.000000,0.016908,0.0,1.0,1.0
4,1.0,0.271174,0.125,0.166667,0.023984,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
413,1.0,0.367921,0.000,0.000000,0.015713,0.0,1.0,1.0
414,0.0,0.484795,0.000,0.000000,0.212559,0.0,0.0,0.0
415,1.0,0.478512,0.000,0.000000,0.014151,0.0,1.0,1.0
416,1.0,0.367921,0.000,0.000000,0.015713,0.0,1.0,1.0


In [18]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

forest = RandomForestClassifier(n_estimators = 80)
forest.fit(X_train, y_train)
y_forest = forest.predict(X_val)
print(classification_report(y_val, y_forest))

              precision    recall  f1-score   support

           0       0.77      0.89      0.82       106
           1       0.79      0.62      0.69        73

    accuracy                           0.78       179
   macro avg       0.78      0.75      0.76       179
weighted avg       0.78      0.78      0.77       179



In [24]:
y_pred = forest.predict(test)

In [25]:
test2 = pd.read_csv("data/test.csv")

output = pd.DataFrame({'PassengerId': test2.PassengerId, 'Survived': y_pred})
output.to_csv('submission.csv', index=False)