In [1]:
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Remote location of the Titanic CSV used throughout this notebook.
TITANIC_CSV_URL = "https://raw.githubusercontent.com/adityamulik/INFO-6105---Data-Science-Engineering-And-Tools/main/Data/Titanic_full.csv"

my_df = pd.read_csv(TITANIC_CSV_URL)
print(my_df.shape)
my_df.head(n=3)

(1309, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
def mydf_splitter(my_df, num_rows):
    """Split a DataFrame into its leading rows and the remaining rows.

    Parameters
    ----------
    my_df : pandas.DataFrame
        Frame to split; it is not modified.
    num_rows : int
        Number of leading rows that go into the first part.

    Returns
    -------
    tuple
        ``(first_part, second_part)`` where ``first_part`` is
        ``my_df[:num_rows]`` and ``second_part`` is ``my_df[num_rows:]``.
        Both parts are independent copies that keep their original index
        labels.
    """
    # Copy BOTH halves. Returning a bare slice for the second half would leave
    # it tied to ``my_df``: later writes could leak into the source frame or
    # trigger SettingWithCopyWarning.
    return my_df[:num_rows].copy(), my_df[num_rows:].copy()


# The first 891 rows become the training partition; the rest is the test partition.
train, test = mydf_splitter(my_df, num_rows=891)

In [4]:
# Row count of the training partition.
train.shape[0]

891

In [5]:
# Row count of the test partition.
test.shape[0]

418

In [6]:
# Remove the label column from the test partition.
test = test.drop(columns=['Survived'])

In [7]:
# Display settings: 4 decimal places for floats, at most 12 columns shown.
# Use the fully qualified option names: the short forms 'precision' and
# 'max_columns' also match styler.* options on pandas >= 1.4 and raise
# OptionError ("Pattern matched multiple keys").
pd.set_option('display.precision', 4)
pd.set_option('display.max_columns', 12)

In [8]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
# Data Wrangling

In [10]:
# Pull the label column out first, then keep only the remaining columns.
target = train.loc[:, 'Survived']
train = train.drop(columns=['Survived'])

In [11]:
# Stack train and test rows so both receive identical preprocessing.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and keeps the original index labels.
data = pd.concat([train, test])
data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
# Summarize dtypes and non-null counts of the combined train+test frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Name         1309 non-null   object 
 3   Sex          1309 non-null   object 
 4   Age          1046 non-null   float64
 5   SibSp        1309 non-null   int64  
 6   Parch        1309 non-null   int64  
 7   Ticket       1309 non-null   object 
 8   Fare         1308 non-null   float64
 9   Cabin        295 non-null    object 
 10  Embarked     1307 non-null   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 112.6+ KB


In [13]:
# Wrap the extracted labels in a DataFrame and preview the first rows.
target = pd.DataFrame(data=target)
target.head(n=5)

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [14]:
# Missing-value count per column.
data.isna().sum()

PassengerId       0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [15]:
# Discard the two columns with the most missing values (Age and Cabin).
data.drop(columns=['Age', 'Cabin'], inplace=True)

In [16]:
# Re-check missing values after dropping Age and Cabin.
data.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
SibSp          0
Parch          0
Ticket         0
Fare           1
Embarked       2
dtype: int64

In [17]:
# Replace missing fares with the mean fare of the combined frame.
fare_mean = data['Fare'].mean()
data['Fare'] = data['Fare'].fillna(fare_mean)

In [18]:
# Forward-fill missing embarkation ports from the previous row.
# Series.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated since pandas 2.1; the filled values are the same.
data['Embarked'] = data['Embarked'].ffill()

In [19]:
# Confirm that no missing values remain.
data.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [20]:
# Summarize dtypes and non-null counts after the drops and fills above.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Name         1309 non-null   object 
 3   Sex          1309 non-null   object 
 4   SibSp        1309 non-null   int64  
 5   Parch        1309 non-null   int64  
 6   Ticket       1309 non-null   object 
 7   Fare         1309 non-null   float64
 8   Embarked     1309 non-null   object 
dtypes: float64(1), int64(4), object(4)
memory usage: 92.2+ KB


In [21]:
# Preprocessing

In [22]:
# Remove the PassengerId, Name and Ticket columns before encoding.
data.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

In [23]:
# Distinct values present in the Embarked column.
data.Embarked.unique()

array(['S', 'C', 'Q'], dtype=object)

In [24]:
# Distinct values present in the Sex column.
data.Sex.unique()

array(['male', 'female'], dtype=object)

In [25]:
# One-hot encode the two categorical columns.
embarked_dummies = pd.get_dummies(data['Embarked'])
sex_dummies = pd.get_dummies(data['Sex'])

# Drop the raw categorical columns and append the indicator columns,
# Embarked indicators first, then Sex indicators.
data = pd.concat(
    [data.drop(columns=['Embarked', 'Sex']), embarked_dummies, sex_dummies],
    axis=1,
)

In [26]:
# Preview the encoded feature frame.
data.head(n=5)

Unnamed: 0,Pclass,SibSp,Parch,Fare,C,Q,S,female,male
0,3,1,0,7.25,0,0,1,0,1
1,1,1,0,71.2833,1,0,0,1,0
2,3,0,0,7.925,0,0,1,1,0
3,1,1,0,53.1,0,0,1,1,0
4,3,0,0,8.05,0,0,1,0,1


In [27]:
# (rows, columns) of the encoded feature frame.
data.shape

(1309, 9)

In [28]:
# Rows at positions 0..890 are the training rows of the combined frame.
new_train_data = data.iloc[:891, :]
new_train_data

Unnamed: 0,Pclass,SibSp,Parch,Fare,C,Q,S,female,male
0,3,1,0,7.2500,0,0,1,0,1
1,1,1,0,71.2833,1,0,0,1,0
2,3,0,0,7.9250,0,0,1,1,0
3,1,1,0,53.1000,0,0,1,1,0
4,3,0,0,8.0500,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...
886,2,0,0,13.0000,0,0,1,0,1
887,1,0,0,30.0000,0,0,1,1,0
888,3,1,2,23.4500,0,0,1,1,0
889,1,0,0,30.0000,1,0,0,0,1


In [29]:
# Rows from position 891 onward are the test rows of the combined frame.
new_test_data = data.iloc[891:, :]
new_test_data

Unnamed: 0,Pclass,SibSp,Parch,Fare,C,Q,S,female,male
891,3,0,0,7.8292,0,1,0,0,1
892,3,1,0,7.0000,0,0,1,1,0
893,2,0,0,9.6875,0,1,0,0,1
894,3,0,0,8.6625,0,0,1,0,1
895,3,1,1,12.2875,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...
1304,3,0,0,8.0500,0,0,1,0,1
1305,1,0,0,108.9000,1,0,0,1,0
1306,3,0,0,7.2500,0,0,1,0,1
1307,3,0,0,8.0500,0,0,1,0,1


In [30]:
# Apply XGBoost Classifier

In [31]:
# Hold out 30% of the labelled rows for evaluation; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    new_train_data,
    target,
    test_size=0.3,
    random_state=11,
)

In [34]:
# Fit the classifier on the training split.
my_model = XGBClassifier(random_state=11)
my_model.fit(X_train, y_train)

# Predict on the held-out split and report accuracy.
preds = my_model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=preds)
print(f'Accuracy Score: {acc}')

Accuracy Score: 0.835820895522388
