In [1]:
#import library
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#import dataset
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
#check dataset
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
#check dataset
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
df_train_survived = df_train['Survived']
df_train = df_train.drop(['PassengerId', 'Survived', 'Ticket', 'Cabin'],axis=1)
df_test_id = df_test['PassengerId']
df_test = df_test.drop(['PassengerId', 'Ticket', 'Cabin'],axis=1)

In [6]:
df_train.isna().sum()

Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

# Extract title from name

In [7]:
dataframe = [df_train, df_test]

for df in dataframe:
    df['Title'] = df['Name'].str.split(', ').str[1]
    df['Title'] = df['Title'].str.split('.').str[0]
    
df_train = df_train.drop(['Name'],axis=1)
df_test = df_test.drop(['Name'],axis=1)

# Change values in Sex to numeric

In [8]:
dataframe = [df_train, df_test]

for df in dataframe:
    df['Sex'] = df['Sex'].replace({'female':0, 'male':1})

# Title Grouping

In [9]:
def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col', 'Sir']:
        return 'Mr'
    elif title in ['the Countess', 'Mme', 'Lady', 'Dona']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

dataframe = [df_train, df_test]

for df in dataframe:
    df['Title'] = df.apply(replace_titles, axis=1)
    df['Title'] = df['Title'].replace({'Mr':0, 'Mrs':1, 'Miss':2, 'Master':3})

# Change values in Embarked to numeric

In [10]:
df_train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [11]:
dataframe = [df_train, df_test]

for df in dataframe:
    df['Embarked'] = df['Embarked'].replace({'S':0, 'C':1, 'Q':2})

In [12]:
df_train.corr()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
Pclass,1.0,0.1319,-0.369226,0.083081,0.018443,-0.5495,0.043835,-0.025429
Sex,0.1319,1.0,0.093254,-0.114631,-0.245489,-0.182333,-0.118593,-0.692582
Age,-0.369226,0.093254,1.0,-0.308247,-0.189119,0.096067,0.012186,-0.432974
SibSp,0.083081,-0.114631,-0.308247,1.0,0.414838,0.159651,-0.060606,0.324305
Parch,0.018443,-0.245489,-0.189119,0.414838,1.0,0.216225,-0.07932,0.339626
Fare,-0.5495,-0.182333,0.096067,0.159651,0.216225,1.0,0.063462,0.153547
Embarked,0.043835,-0.118593,0.012186,-0.060606,-0.07932,0.063462,1.0,0.128561
Title,-0.025429,-0.692582,-0.432974,0.324305,0.339626,0.153547,0.128561,1.0


# Handling missing values using MICE

In [13]:
# Handling missing values in df_train
# need to enable iterative imputer explicitly since its still experimental
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Define imputer
imputer = IterativeImputer(random_state=100, max_iter=10)

# Use Numeric Features
df_train_impute = df_train[['Sex', 'Age', 'Fare', 'Embarked']]
df_test_impute = df_test[['Sex', 'Age', 'Fare', 'Embarked']]

# fit on the dataset
imputer.fit(df_train_impute)
imputer.fit(df_test_impute)

df_imputed_train = imputer.transform(df_train_impute)
df_imputed_train[:10]
df_imputed_test = imputer.transform(df_test_impute)
df_imputed_test[:10]


# Replace with imputed values
df_train_impute.loc[:,:] = df_imputed_train
df_test_impute.loc[:,:] = df_imputed_test


# Fill nan values in df_train
df_train = df_train.fillna(df_train_impute)
df_test = df_test.fillna(df_test_impute)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


In [14]:
# Int Age and Embarked
dataframe = [df_train, df_test]

for df in dataframe:
    df['Age'] = df['Age'].astype(int)
    df['Embarked'] = df['Embarked'].astype(int)

# Create Pclass dummies

In [15]:
# Create dummies for train
pclass_columns = pd.get_dummies(df_train['Pclass'])
Pclass_1 = pclass_columns.loc[:,1:1].max(axis=1)
Pclass_2 = pclass_columns.loc[:,2:2].max(axis=1)
Pclass_3 = pclass_columns.loc[:,3:3].max(axis=1)
df_train = pd.concat([df_train, Pclass_1, Pclass_2,Pclass_3], axis=1)

# Create dummies for test
pclass_columns = pd.get_dummies(df_test['Pclass'])
Pclass_1 = pclass_columns.loc[:,1:1].max(axis=1)
Pclass_2 = pclass_columns.loc[:,2:2].max(axis=1)
Pclass_3 = pclass_columns.loc[:,3:3].max(axis=1)
df_test = pd.concat([df_test, Pclass_1, Pclass_2,Pclass_3], axis=1)

# Rename columns
dataframe = [df_train, df_test]

for df in dataframe:
    df.rename(columns = {0:'Pclass_1', 1:'Pclass_2', 2:'Pclass_3'}, inplace = True)
    
df_train = df_train.drop(['Pclass'],axis=1)
df_test = df_test.drop(['Pclass'],axis=1)

# Create Title dummies 

In [16]:
# Create dummies for train
title_columns = pd.get_dummies(df_train['Title'])
Mr = title_columns.loc[:,0:0].max(axis=1)
Mrs = title_columns.loc[:,1:1].max(axis=1)
Miss = title_columns.loc[:,2:2].max(axis=1)
Master = title_columns.loc[:,3:3].max(axis=1)
df_train = pd.concat([df_train, Mr, Mrs, Miss, Master], axis=1)

# Create dummies for test
title_columns = pd.get_dummies(df_test['Title'])
Mr = title_columns.loc[:,0:0].max(axis=1)
Mrs = title_columns.loc[:,1:1].max(axis=1)
Miss = title_columns.loc[:,2:2].max(axis=1)
Master = title_columns.loc[:,3:3].max(axis=1)
df_test = pd.concat([df_test, Mr, Mrs, Miss, Master], axis=1)

# Rename columns
dataframe = [df_train, df_test]

for df in dataframe:
    df.rename(columns = {0:'Mr', 1:'Mrs', 2:'Miss', 3:'Master'}, inplace = True)
    
df_train = df_train.drop(['Title'],axis=1)
df_test = df_test.drop(['Title'],axis=1)

# Create Embarked dummies 

In [17]:
df_train['Embarked'].unique()

array([0, 1, 2])

In [18]:
# Create dummies for train
embarked_columns = pd.get_dummies(df_train['Embarked'])
S = embarked_columns.loc[:,0:0].max(axis=1)
C = embarked_columns.loc[:,1:1].max(axis=1)
Q = embarked_columns.loc[:,2:2].max(axis=1)
df_train = pd.concat([df_train, S, C, Q], axis=1)

# Create dummies for test
embarked_columns = pd.get_dummies(df_test['Embarked'])
S = embarked_columns.loc[:,0:0].max(axis=1)
C = embarked_columns.loc[:,1:1].max(axis=1)
Q = embarked_columns.loc[:,2:2].max(axis=1)
df_test = pd.concat([df_test, S, C, Q], axis=1)

# Rename columns
dataframe = [df_train, df_test]

for df in dataframe:
    df.rename(columns = {0:'S', 1:'C', 2:'Q'}, inplace = True)
    
df_train = df_train.drop(['Embarked'],axis=1)
df_test = df_test.drop(['Embarked'],axis=1)

In [19]:
df_train1 = df_train.copy()
df_test1 = df_test.copy()

In [20]:
df_train = df_train1.copy()
df_test = df_test1.copy()

In [21]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Sex       891 non-null    int64  
 1   Age       891 non-null    int32  
 2   SibSp     891 non-null    int64  
 3   Parch     891 non-null    int64  
 4   Fare      891 non-null    float64
 5   Pclass_1  891 non-null    uint8  
 6   Pclass_2  891 non-null    uint8  
 7   Pclass_3  891 non-null    uint8  
 8   Mr        891 non-null    uint8  
 9   Mrs       891 non-null    uint8  
 10  Miss      891 non-null    uint8  
 11  Master    891 non-null    uint8  
 12  S         891 non-null    uint8  
 13  C         891 non-null    uint8  
 14  Q         891 non-null    uint8  
dtypes: float64(1), int32(1), int64(3), uint8(10)
memory usage: 40.2 KB


In [22]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Sex       418 non-null    int64  
 1   Age       418 non-null    int32  
 2   SibSp     418 non-null    int64  
 3   Parch     418 non-null    int64  
 4   Fare      418 non-null    float64
 5   Pclass_1  418 non-null    uint8  
 6   Pclass_2  418 non-null    uint8  
 7   Pclass_3  418 non-null    uint8  
 8   Mr        418 non-null    uint8  
 9   Mrs       418 non-null    uint8  
 10  Miss      418 non-null    uint8  
 11  Master    418 non-null    uint8  
 12  S         418 non-null    uint8  
 13  C         418 non-null    uint8  
 14  Q         418 non-null    uint8  
dtypes: float64(1), int32(1), int64(3), uint8(10)
memory usage: 18.9 KB


# Build Machine Learning Model

In [23]:
X_train = df_train
Y_train = df_train_survived
X_test  = df_test

# Random Forest

# Logistic Regression

# KNN

# Gaussian Naive Bayes

In [24]:
from sklearn.naive_bayes import GaussianNB

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)  
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)

# Decision Tree

# Models Score

In [26]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
predictions = cross_val_predict(gaussian, X_train, Y_train, cv=3)
confusion_matrix(Y_train, predictions)

array([[448, 101],
       [ 78, 264]], dtype=int64)

In [27]:
from sklearn.metrics import precision_score, recall_score

print("Precision:", precision_score(Y_train, predictions))
print("Recall:",recall_score(Y_train, predictions))

Precision: 0.7232876712328767
Recall: 0.7719298245614035


In [28]:
from sklearn.metrics import f1_score
f1_score(Y_train, predictions)

0.7468175388967468

In [29]:
result = pd.DataFrame({
    'PassengerId': df_test_id,
    'Survived': Y_pred})

In [None]:
# result.to_csv('Titanic-althair.guntur.csv', index=False)