In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv("titanic.csv")
df.drop(["PassengerId", "Name", "Ticket"], axis = 1, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,34.5,0,0,7.8292,,Q
1,1,3,female,47.0,1,0,7.0,,S
2,0,2,male,62.0,0,0,9.6875,,Q
3,0,3,male,27.0,0,0,8.6625,,S
4,1,3,female,22.0,1,1,12.2875,,S


In [3]:
df.to_csv("Droped_Titanic.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Fare      417 non-null    float64
 7   Cabin     91 non-null     object 
 8   Embarked  418 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 29.5+ KB


## Count Null Values

In [5]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age          86
SibSp         0
Parch         0
Fare          1
Cabin       327
Embarked      0
dtype: int64

In [6]:
# Drop Cabin Cloumn
df.drop(["Cabin"], axis = 1, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,34.5,0,0,7.8292,Q
1,1,3,female,47.0,1,0,7.0,S
2,0,2,male,62.0,0,0,9.6875,Q
3,0,3,male,27.0,0,0,8.6625,S
4,1,3,female,22.0,1,1,12.2875,S


In [7]:
# Unique values
print(df["Embarked"].nunique())
print(df["Parch"].nunique())
print(df["SibSp"].nunique())

3
8
7


# Catagorical Encoder

In [8]:
# Dummy Variable
cloumns = ['Sex', 'Parch', 'Embarked', 'SibSp']

encoder = OneHotEncoder(drop='first', sparse=False)

# transform data
onehot = encoder.fit_transform(df[cloumns])
onehot.shape

(418, 16)

In [9]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,34.5,0,0,7.8292,Q
1,1,3,female,47.0,1,0,7.0,S
2,0,2,male,62.0,0,0,9.6875,Q
3,0,3,male,27.0,0,0,8.6625,S
4,1,3,female,22.0,1,1,12.2875,S


In [10]:
df = df.drop(['Sex', 'Parch', 'Embarked', 'SibSp'], axis=1)
df

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,34.5,7.8292
1,1,3,47.0,7.0000
2,0,2,62.0,9.6875
3,0,3,27.0,8.6625
4,1,3,22.0,12.2875
...,...,...,...,...
413,0,3,,8.0500
414,1,1,39.0,108.9000
415,0,3,38.5,7.2500
416,0,3,,8.0500


In [11]:
onehot = pd.DataFrame(onehot)
df = pd.concat([df, onehot], axis=1)
df

Unnamed: 0,Survived,Pclass,Age,Fare,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0,3,34.5,7.8292,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,47.0,7.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0,2,62.0,9.6875,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,3,27.0,8.6625,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,3,22.0,12.2875,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0,3,,8.0500,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
414,1,1,39.0,108.9000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
415,0,3,38.5,7.2500,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
416,0,3,,8.0500,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## Missing Values

In [12]:
# statistical imputation transform for the horse colic dataset
from numpy import isnan
from sklearn.impute import SimpleImputer

data = df.values
ix = [i for i in range(data.shape[1]) if i != 0]
X, y = data[:, ix], data[:, 0]

# summarize total missing
print('Missing: %d' % sum(isnan(X).flatten()))
# define imputer
imputer = SimpleImputer(strategy= 'mean' )
# fit on the dataset
imputer.fit(X) # X still has 1605 Missing Data, just compute
# transform the dataset
Xtrans = imputer.transform(X) # Xtrans has no Missing Data

# summarize total missing
print('Missing: %d' % sum(isnan(Xtrans).flatten())) # 0

Missing: 87
Missing: 0


In [13]:
X = pd.DataFrame(Xtrans)
X.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,3.0,34.5,7.8292,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,47.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2.0,62.0,9.6875,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,27.0,8.6625,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,22.0,12.2875,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
5,3.0,14.0,9.225,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,3.0,30.0,7.6292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2.0,26.0,29.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
8,3.0,18.0,7.2292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,3.0,21.0,24.15,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [15]:
y.shape

(418,)

In [16]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       418 non-null    float64
 1   1       418 non-null    float64
 2   2       418 non-null    float64
 3   3       418 non-null    float64
 4   4       418 non-null    float64
 5   5       418 non-null    float64
 6   6       418 non-null    float64
 7   7       418 non-null    float64
 8   8       418 non-null    float64
 9   9       418 non-null    float64
 10  10      418 non-null    float64
 11  11      418 non-null    float64
 12  12      418 non-null    float64
 13  13      418 non-null    float64
 14  14      418 non-null    float64
 15  15      418 non-null    float64
 16  16      418 non-null    float64
 17  17      418 non-null    float64
 18  18      418 non-null    float64
dtypes: float64(19)
memory usage: 62.2 KB


In [17]:
from sklearn.model_selection import train_test_split
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [18]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

In [19]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 1.0
