In [53]:
import pandas as pd

## Load Data

In [54]:
# Read the train and test CSV files. The test PassengerId column is kept in
# its own variable because it is needed again when the submission is built.
train_path, test_path = 'data/train.csv', 'data/test.csv'
train, test = pd.read_csv(train_path), pd.read_csv(test_path)
test_ids = test["PassengerId"]

## Which columns have Unknowns?

In [55]:
# Summarise the training frame (dtype and non-null count per column) to spot columns with missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [56]:
# Same summary for the test frame, so missing values can be compared against the training frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


## Clean both train and test data
* Remove irrelevant columns
* Fill Null Values

In [57]:
def clean(data):
    """Return a cleaned copy of ``data``: unused columns dropped, nulls filled.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns 'PassengerId', 'Ticket', 'Cabin',
        'Name', 'Age', 'Fare' and 'Embarked'.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'PassengerId', 'Ticket', 'Cabin' and 'Name'.
        Missing 'Age' and 'Fare' values are replaced by the median of that
        column in ``data``; missing 'Embarked' values become 'Unknown'.
        The frame passed in is not modified.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    """
    # Remove irrelevant columns. drop() returns a new frame, so the caller's
    # frame is left untouched.
    data = data.drop(['PassengerId', 'Ticket', 'Cabin', 'Name'], axis=1)

    # Fill null values by assigning the filled column back to the frame.
    # Calling fillna(inplace=True) on data[col] operates on an intermediate
    # object: pandas 2.x warns about it, and under copy-on-write the frame
    # itself is never updated.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())
    data['Embarked'] = data['Embarked'].fillna('Unknown')
    return data

In [58]:
# Run both splits through the same cleaning step and rebind the names to the cleaned frames.
train, test = (clean(frame) for frame in (train, test))

In [59]:
# Show the first rows of the cleaned training frame as a quick check of the remaining columns.
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


## Encode the categorical data

In [60]:
from sklearn import preprocessing
encoder = preprocessing.LabelEncoder()
columns = ["Sex", "Embarked"]

# Learn each column's label -> integer mapping from the training data only and
# reuse that mapping for the test data. Re-fitting on the test column would
# number its labels independently, so the same category could receive a
# different code in train and test whenever the two label sets differ.
for col in columns:
    encoder.fit(train[col])
    print(encoder.classes_)
    train[col] = encoder.transform(train[col])
    # transform() raises ValueError if the test column contains a label that
    # was not present in the training column.
    test[col] = encoder.transform(test[col])

train.head(5)

['female' 'male']
['C' 'Q' 'S' 'Unknown']


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


## Separate Features from Labels

In [61]:
# The target is the 'Survived' column; every other column is used as a feature.
y = train['Survived']
X = train.drop(columns='Survived')

## Train 3 separate classifiers

In [68]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC

# Fixed seed so the randomised tree ensembles produce the same models on every
# Restart & Run All; without it each run gives slightly different results.
SEED = 42

RF = RandomForestClassifier(random_state=SEED)
RF.fit(X, y)

GB = GradientBoostingClassifier(random_state=SEED)
GB.fit(X, y)

# SVC() is used with its defaults, which select the RBF kernel.
RBF_SVM = SVC()
RBF_SVM.fit(X, y);

## Train an ensemble model of the 3 classifiers

In [63]:
from sklearn.ensemble import StackingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Pair each base learner with the label the stacking ensemble uses for it.
base_names = ('Random Forest', 'Gradient Boosting', 'RBF SVM')
base_models = (RF, GB, RBF_SVM)
estimators = list(zip(base_names, base_models))

# Stack the base learners under a Linear Discriminant Analysis final
# estimator and fit the whole ensemble on the training data.
meta_model = LinearDiscriminantAnalysis()
clf = StackingClassifier(estimators=estimators, final_estimator=meta_model)
clf.fit(X, y)

StackingClassifier(estimators=[('Random Forest', RandomForestClassifier()),
                               ('Gradient Boosting',
                                GradientBoostingClassifier()),
                               ('RBF SVM', SVC())],
                   final_estimator=LinearDiscriminantAnalysis())

## Predict the labels of the test data

In [64]:
# Predict a Survived label for every row of the cleaned, encoded test frame using the stacked ensemble.
preds = clf.predict(test)

## Create the DataFrame and output it as a CSV file

In [65]:
# Pair each test PassengerId with its predicted label, then write the
# two-column table to disk without the row index.
submission_columns = {"PassengerId": test_ids.values, "Survived": preds}
sub_df = pd.DataFrame(submission_columns)
sub_df.to_csv("submission.csv", index=False)