In [43]:
# Following section in book: Using the majority voting principle to make predictions

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
import xgboost as xgb
import numpy as np
from sklearn.pipeline import Pipeline

def feature_eng(data):
    """Fill missing values and add derived columns to *data* in place.

    Steps, in order:
      * missing ``Embarked`` values become the most frequent ``Embarked`` value;
      * ``FamilySize`` is added as ``SibSp + Parch + 1``;
      * missing ``Fare`` values become the mean ``Fare``;
      * ``eng_title`` and ``eng_age`` are then applied to the same frame.

    All statistics are computed from *data* itself.

    Args:
        data: DataFrame with at least the ``Embarked``, ``SibSp``, ``Parch``
            and ``Fare`` columns, plus whatever ``eng_title`` / ``eng_age``
            read.

    Returns:
        None. *data* is modified in place.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: the in-place form operates on an intermediate object
    # and does not update the parent frame under pandas Copy-on-Write.
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
    eng_title(data)
    eng_age(data)

def eng_title(data):
    """Add a ``Title`` column to *data* in place, derived from ``Name``.

    The title is the first run of ASCII letters in ``Name`` that is directly
    followed by a period (e.g. ``Mr`` in ``"Braund, Mr. Owen Harris"``).
    Less common titles are then folded into ``Miss``, ``Mr``, ``Mrs`` or
    ``Other``; titles not listed in the mapping are kept unchanged, and names
    without a match get a missing value.

    Args:
        data: DataFrame with a string ``Name`` column.

    Returns:
        None. *data* is modified in place.
    """
    # Explicit mapping keeps each rare title next to the group it is folded into.
    title_groups = {
        'Mlle': 'Miss',
        'Mme': 'Miss',
        'Ms': 'Miss',
        'Dr': 'Mr',
        'Major': 'Mr',
        'Lady': 'Mrs',
        'Countess': 'Mrs',
        'Dona': 'Mrs',
        'Jonkheer': 'Other',
        'Col': 'Other',
        'Rev': 'Other',
        'Capt': 'Mr',
        'Sir': 'Mr',
        'Don': 'Mr',
    }
    # Raw string so that "\." is a regex escape, not an invalid string escape;
    # expand=False makes the single capture group come back as a Series.
    titles = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
    # Assign the result back rather than replace(inplace=True) on a column
    # selection, which does not update the parent frame under Copy-on-Write.
    data['Title'] = titles.replace(title_groups)

def eng_age(data):
    """Fill missing ``Age`` values in place with the mean age for the row's ``Title``.

    Only the titles ``Mr``, ``Mrs``, ``Master``, ``Miss`` and ``Other`` are
    handled. Rows with any other (or missing) title keep their missing age,
    and a title group with no known ages is filled with NaN (i.e. stays
    missing).

    Args:
        data: DataFrame with ``Age`` and ``Title`` columns.

    Returns:
        None. *data* is modified in place.
    """
    for title in ('Mr', 'Mrs', 'Master', 'Miss', 'Other'):
        has_title = data['Title'] == title
        needs_age = data['Age'].isnull() & has_title
        # Mean over the known ages of this title group (NaNs are skipped).
        data.loc[needs_age, 'Age'] = data.loc[has_title, 'Age'].mean()


# Load both CSV files and apply the same in-place feature engineering to each.
train_data = pd.read_csv('train.csv')
feature_eng(train_data)

test_data = pd.read_csv('test.csv')
feature_eng(test_data)

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'FamilySize', 'Fare', 'Embarked']

X_train_orig = train_data[features]
y_train_orig = train_data['Survived']

X_test = test_data[features]

# One-hot encode the non-numeric feature columns.
X_train_orig = pd.get_dummies(X_train_orig)
X_test = pd.get_dummies(X_test)
# get_dummies only creates columns for categories present in the frame it is
# given, so the two frames can end up with different columns. Align the test
# frame to the training columns (same set, same order); a category absent
# from the test data becomes an all-zero column.
X_test = X_test.reindex(columns=X_train_orig.columns, fill_value=0)

# Stratified hold-out split of the labelled data (default 75% / 25%).
X_train, X_valid, y_train, y_valid = train_test_split(X_train_orig, y_train_orig, stratify=y_train_orig, random_state=1)

# Base estimators for the ensemble. XGBoost and the random forest are used
# directly (the original note says XGBoost needs no scaling); every other
# model is wrapped in a pipeline that standardizes the features first.
clf1 = xgb.XGBClassifier(
    max_depth=4,
    learning_rate=.01,
    n_estimators=300,
    random_state=1,
)

clf2 = LogisticRegression(penalty='l2', C=0.001, solver='lbfgs', random_state=1)
pipe2 = Pipeline(steps=[('sc', StandardScaler()), ('clf', clf2)])

clf3 = KNeighborsClassifier(
    n_neighbors=6,
    weights='uniform',
    algorithm='auto',
    leaf_size=26,
    p=2,
    metric='minkowski',
    metric_params=None,
)
pipe3 = Pipeline(steps=[('sc', StandardScaler()), ('clf', clf3)])

clf4 = LinearSVC(C=0.1, dual='auto', max_iter=100000, random_state=1)
pipe4 = Pipeline(steps=[('sc', StandardScaler()), ('clf', clf4)])

clf5 = RandomForestClassifier(n_estimators=350, max_features=3, random_state=1)

clf6 = GaussianNB()
pipe6 = Pipeline(steps=[('sc', StandardScaler()), ('clf', clf6)])

# Display names and estimators, kept in the same order so they can be zipped.
clf_labels = [
    'XGBoost',
    'Logistic regression',
    'KNN',
    'LinearSVC',
    'RandomForest',
    'GaussianNB',
]

all_clf = [clf1, pipe2, pipe3, pipe4, clf5, pipe6]


# Hard (majority) voting over all six base estimators.
mv_hard_clf = VotingClassifier(estimators=list(zip(clf_labels, all_clf)), voting='hard')

# Soft voting averages predicted class probabilities, so pipe4 (LinearSVC,
# which does not expose predict_proba) is left out. Filter the (label,
# estimator) pairs together so every remaining estimator keeps its own label;
# zipping the full label list against the shorter estimator list would
# register RandomForest as 'LinearSVC' and GaussianNB as 'RandomForest'.
soft_estimators = [(label, clf) for label, clf in zip(clf_labels, all_clf) if clf is not pipe4]
soft_clf = [clf for _, clf in soft_estimators]
mv_soft_clf = VotingClassifier(estimators=soft_estimators, voting='soft')

# Use mv_hard_clf.get_params() / mv_soft_clf.get_params() to see which
# parameters can be optimized.
# Append the ensembles only after they are built, so they are evaluated
# alongside the base models without becoming members of themselves.
clf_labels += ['Voting-hard', 'Voting-soft']
all_clf += [mv_hard_clf,mv_soft_clf]

# Report 10-fold cross-validated accuracy on the training split for every
# base model and both voting ensembles.
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(
        estimator=clf,
        X=X_train,
        y=y_train,
        cv=10,
        scoring='accuracy',
    )
    mean_acc = scores.mean()
    std_acc = scores.std()
    print(f'Accuracy: {mean_acc:.2f} (+/- {std_acc:.2f}) [{label}]')

# Refit the hard-voting ensemble on all labelled rows, predict the test set
# and write the submission file.
mv_hard_clf.fit(X_train_orig, y_train_orig)
predictions = mv_hard_clf.predict(X_test)
output = pd.DataFrame(
    {
        'PassengerId': test_data['PassengerId'],
        'Survived': predictions,
    }
)
output.to_csv('submission_ensemble_hard.csv', index=False)

# #TODO: write to CSV

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      418 non-null    int64  
 1   Age         418 non-null    float64
 2   SibSp       418 non-null    int64  
 3   Parch       418 non-null    int64  
 4   FamilySize  418 non-null    int64  
 5   Fare        418 non-null    float64
 6   Sex_female  418 non-null    bool   
 7   Sex_male    418 non-null    bool   
 8   Embarked_C  418 non-null    bool   
 9   Embarked_Q  418 non-null    bool   
 10  Embarked_S  418 non-null    bool   
dtypes: bool(5), float64(2), int64(4)
memory usage: 21.8 KB
Accuracy: 0.82 (+/- 0.03) [XGBoost]
Accuracy: 0.68 (+/- 0.03) [Logistic regression]
Accuracy: 0.80 (+/- 0.03) [KNN]
Accuracy: 0.80 (+/- 0.04) [LinearSVC]
Accuracy: 0.80 (+/- 0.05) [RandomForest]
Accuracy: 0.80 (+/- 0.03) [GaussianNB]
Accuracy: 0.80 (+/- 0.04) [Voting-hard]
Accuracy: 0.81 (+/- 0.03) [V