In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
train_data_raw = pd.read_csv('../input/titanic/train.csv')
test_data_raw = pd.read_csv('../input/titanic/test.csv')

In [None]:
train_data_raw.sample(5)

In [None]:
columns_all = train_data_raw.columns
columns_all

# Exploratory Data Analysis

In [None]:
train_data_raw.info()

- There are 891 records and there are missing values in some of the columns.

In [None]:
train_data_raw.hist(figsize=(9,9))
plt.tight_layout()

In [None]:
train_data_raw['Survived'].value_counts().apply(lambda x:f'{x} ({x*100/len(train_data_raw):0.2f}%)')

Initial Inferences :

- The dataset is mildly imbalanced.
- The columns 'PassengerId' & 'Name' are unique identifiers.
- 'Survived' is the target column that we have to predict.
- The columns 'Pclass', 'Sex' and 'Embarked' are categorical columns and the rest are numerical.
- The column 'SibSp' should ideally be integer value.


We shall split our training data to train-test set before proceeding further to avoid any data leakage into test set.

In [None]:
from sklearn.model_selection import train_test_split

We shall create a copy of train_set so as not to lose the original training set during feature engineering.

In [None]:
train_set,test_set = train_test_split(train_data_raw,test_size=0.2,stratify=train_data_raw['Survived'],random_state=21)

In [None]:
train_original = train_set.copy()
train_set.reset_index(drop=True,inplace=True)

In [None]:
train_set.sample(5)

In [None]:
train_set.describe()

In [None]:
# No. of unique elements in each column
train_set.apply(lambda x: x.nunique())

In [None]:
num_cols = train_set.select_dtypes('number').columns.drop(['PassengerId','Survived','Pclass']).to_numpy()
cat_cols = list(train_set.select_dtypes('object').columns.drop(['Name']))
cat_cols.extend(['Pclass'])
print("Numerical Columns : ",num_cols)
print("Categorical Columns : ",cat_cols)

In [None]:
n_def_num_cols = len(num_cols)
# Ceiling division: round() rounds halves to the nearest even number (round(2.5) == 2),
# which would allocate too few rows of axes for some odd column counts.
n_rows = -(-n_def_num_cols // 2)
# squeeze=False keeps `ax` two-dimensional even when the grid has a single row.
fig,ax = plt.subplots(n_rows,2,figsize=(10,n_def_num_cols*2),squeeze=False)
for axis,col in zip(ax.ravel(),num_cols):
  sns.violinplot(x='Survived',y=col,data=train_set,ax=axis,orient='v',cut=0)
# Hide the unused panel left over when the number of columns is odd.
for axis in ax.ravel()[n_def_num_cols:]:
  axis.set_visible(False)
fig.tight_layout()

In [None]:
# Ceiling division so that an odd number of columns still gets enough rows of axes
# (round() rounds halves to the nearest even number, e.g. round(2.5) == 2).
n_rows = -(-n_def_num_cols // 2)
# squeeze=False keeps `ax` two-dimensional even when the grid has a single row.
fig,ax = plt.subplots(n_rows,2,figsize=(16,n_def_num_cols*2),squeeze=False)
for axis,col in zip(ax.ravel(),num_cols):
  sns.histplot(x=col,data=train_set,hue='Survived',multiple='dodge',ax=axis,bins=20,lw=1)
# Hide the unused panel left over when the number of columns is odd.
for axis in ax.ravel()[n_def_num_cols:]:
  axis.set_visible(False)
fig.tight_layout()

- We can see that the survival rate is higher for kids and the elderly. The survival rate for individuals between 20 and 30 is very low.
- We could also observe that the chances of survival is increasing with the price paid for the ticket.

In [None]:
sns.pairplot(hue='Survived',data=train_set,corner=True)
plt.tight_layout()

- The survival classes aren't linearly separable using any single feature.
- There isn't any distinct correlation between the various features.

In [None]:
# Correlate numeric columns only: recent pandas versions raise on string columns
# instead of silently skipping them.
train_corr = train_set.select_dtypes('number').corr()

In [None]:
plt.subplots(figsize=(8,7))
sns.heatmap(train_corr,vmax=1,vmin=-1,annot=True,cmap=sns.color_palette("icefire", as_cmap=True))
plt.tight_layout()

In [None]:
print("Correlation of Features with 'Survived' \n")
train_corr.loc[:,'Survived'].sort_values(ascending=False).drop('Survived')

In [None]:
print("Correlation within Features  \n")
# Feature columns to compare against each row label (the target is left out).
feature_cols = train_corr.columns.drop('Survived')
for i,y in enumerate(train_corr.index):
    # Start at position i so that earlier column positions are skipped.
    for x in feature_cols[i:]:
        pair_corr = train_corr.loc[x,y]
        # Report only strong correlations (|r| > 0.4) between two different columns.
        if x!=y and abs(pair_corr) > 0.4:
            print(f'{x} - {y}  : {pair_corr}')

- The Ticket class has the highest correlation with the target column 'Survived'
- The ticket fare and the ticket class are correlated which makes much sense.
- Also the number of siblings/spouses aboard is correlated with the number of parents/children aboard.

In [None]:
for i,col in enumerate(['Pclass','Sex','Embarked','Survived']):
  j=0
  fig,ax = plt.subplots(1,3,figsize=(16,4),)
  for col1 in ['Pclass','Sex','Embarked','Survived']:
    if col1!=col:    
      sns.countplot(x=col,data=train_set,hue=col1,ax=ax[j])
      j=j+1
  
  fig.suptitle(col,size=16)
  fig.tight_layout()

As we can see, some of the insights that can be drawn are
- The survival chances for females were much higher than males.
- The chances of survival were higher for individuals with a 1st class ticket ('Pclass' = 1). Passengers with a 1st class ticket survived more often than those of any other class.
- Passengers who embarked from the port of Cherbourg have a higher survival ratio.
- Most passengers with 1st class tickets survived, and their survival rate was much higher than that of any other ticket class. It could also be noted that there were no 1st class passengers from Queenstown.
- Passengers who embarked from Cherbourg have a higher survival ratio.



In [None]:
from scipy.stats import chi2_contingency

In [None]:
# Chi-square test of independence between each categorical column and 'Survived'.
alpha = 0.05
for col in cat_cols:
  cross_table = pd.crosstab(train_set[col],train_set['Survived'])
  # chi2_contingency returns (statistic, p-value, dof, expected); only the p-value is used.
  p_value = chi2_contingency(cross_table)[1]
  verdict = "Dependent (reject H0)" if p_value <= alpha else "Independent (fail to reject H0)"
  print(f"{col}-Survived \np-value : ",p_value)
  print(verdict,'\n')

Only 'Cabin' had no relation with 'Survived' column. This could also be due to the unavailability of over 75% of the data for 'Cabin'

# Feature Engineering

In [None]:
train_set.dtypes

In [None]:
# Modifying DataType
#
train_set.loc[:,cat_cols] = train_set[cat_cols].astype('category',errors='ignore')
train_set.loc[:,'PassengerId'] = train_set[['PassengerId']].astype('object',errors='ignore')

In [None]:
def missing_count(data,cols=None):
  """Print the number and percentage of missing values per column.

  Parameters
  ----------
  data : DataFrame or anything accepted by ``pd.DataFrame``
      The data to inspect.
  cols : iterable of column labels, optional
      Columns to check. Defaults to every column of ``data``. Lists, pandas
      Index objects and numpy arrays are all accepted.

  Only columns with at least one missing value are listed. Nothing is returned.
  """
  print("Number of Instances : ",len(data))
  print("Number of Missing Values in :")
  df = pd.DataFrame(data)
  # Identity check: `cols == None` is evaluated elementwise for an Index/ndarray and
  # cannot be used as a single truth value.
  if cols is None:
    cols=df.columns
  n_rows = len(df)
  for x in cols:
    count  = df[x].isna().sum()
    if count >=1:
      print(f' - {x} : {count}({count*100/n_rows:0.2f}%)')



In [None]:
missing_count(train_set)

- Embarked has 2 values missing; we could remove these entries/instances since there are only 2.
- Age has 140 values missing, which constitutes about 20% of the whole data. We could impute these missing values.
- Cabin has more than 75% of missing values. Ideally we should drop this feature or find some way to extract any available information if possible.

### Feature - 'Embarked '

In [None]:
train_set.dropna(subset=['Embarked'],inplace=True)
train_set.reset_index(drop=True,inplace=True)

We shall impute the missing values in 'Age'. We had noticed that 'Pclass' had the highest correlation with 'Age', so instead of taking the median of the whole training set, we shall impute with class-wise(ticket) median age.

### Feature - 'Age'

In [None]:
pclass_avg_age = train_set.groupby(['Pclass'])['Age'].median()
pclass_avg_age

In [None]:
pd.Series(train_set.columns)

In [None]:
# Fill only the missing ages, using the median age of the passenger's ticket class.
# Columns are addressed by label: positional row access depends on the column order
# (in the Kaggle train.csv layout position 0 is 'PassengerId', not 'Age').
train_set['Age'] = train_set['Age'].fillna(train_set['Pclass'].map(pclass_avg_age.to_dict()).astype(float))

In [None]:
missing_count(train_set)

'Cabin' has over 77% of its values missing, so dropping the column is the ideal choice. But before dropping it, we shall try to extract any information if possible.

### Feature - 'Cabin'

In [None]:
print("No. of Entries available : ",train_set.Cabin.notna().sum(),'\n')
train_set.Cabin.unique()

In [None]:
# Count the distinct, non-missing Cabin values that start with a letter.
# NaN never compares equal to anything (itself included), so missing entries must be
# removed with dropna() rather than tested with `x != np.nan`.
train_set.Cabin.dropna().drop_duplicates().str[0].str.isalpha().sum()

In [None]:
# Checking if multiple people have the same cabin/s
train_set.Cabin.value_counts()

As we can see, all of the 119 unique elements start with a letter. We could group the Cabin codes using this initial letter.

In [None]:
shared_cabins = train_set.Cabin.value_counts()[train_set.Cabin.value_counts()>1].index
shared_cabins

In [None]:
cabins=[]
for x in train_set.Cabin.value_counts().index:
  if ' ' in x:
    cabins.extend(x.split(' '))
  else:
    cabins.append(x)
print(cabins)

In [None]:
cabin_cat = []
cabin_cat.extend([x[0] for x in cabins])
pd.Series(cabin_cat).value_counts()

In [None]:
for cabin_x in set(cabin_cat):
  train_set[f'Cabin_{cabin_x}']=[int(cabin_x in str(x)) for x in train_set.Cabin]

We could also try to group Cabins by the number of passengers in it and also by Cabins with more than 1 passenger as passengers in groups may have higher chance of survival.

In [None]:
# Categories of Cabins with more than 1 passenger.

for cabin_ in shared_cabins:
  train_set[f'Cabin_shared_{cabin_}']=[int(x==cabin_) for x in train_set.Cabin]

In [None]:
passengers_in_cabin = train_set.Cabin.value_counts()[train_set.Cabin.value_counts()>1]
passengers_in_cabin

In [None]:
for n in passengers_in_cabin.unique():
  train_set[f'{n}_Passenger_Cabin'] =  0
for index,x in enumerate(train_set.Cabin):
  if x in passengers_in_cabin.index:
    n = passengers_in_cabin[x]
    train_set.loc[index,f'{n}_Passenger_Cabin'] =  1

In [None]:
train_set.columns

We shall create a method to do the above done cleaning tasks

In [None]:
def clean_data(X):
    """Apply the notebook's basic cleaning steps and split off the target.

    Steps, in order: cast column dtypes, drop rows with a missing 'Embarked',
    fill missing 'Age' values with the median age of the passenger's 'Pclass',
    and reset the index.

    Uses the notebook-level names ``cat_cols``, ``num_cols`` and
    ``pclass_avg_age``, which must be defined before this is called.

    Note: the dtype casts, the row drop and the age fill are applied to the
    frame passed in (it is modified in place); only the index reset and the
    target split produce new objects.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw passenger data, with or without a 'Survived' column.

    Returns
    -------
    (X, y) when 'Survived' is present in ``X``, otherwise just ``X``.
    """

    # Modifying DataType
    X.loc[:,cat_cols] = X[cat_cols].astype('category',errors='ignore')
    X.loc[:,'PassengerId'] = X[['PassengerId']].astype('object',errors='ignore')
    X.loc[:,num_cols] = X[num_cols].apply(lambda x: pd.to_numeric(x,errors='coerce'),axis=1)

    # Dropping Missing values in Embarked
    X.dropna(subset=['Embarked'],inplace=True)

    # Imputing Missing values in Age with the class-wise median.
    # Columns are addressed by label: positional row access depends on the column
    # order and does not point at 'Age'/'Pclass' in the raw layout.
    # Pclass values without a learned median stay missing.
    class_median_age = X['Pclass'].map(pclass_avg_age.to_dict()).astype(float)
    X['Age'] = X['Age'].fillna(class_median_age)

    X = X.reset_index(drop=True)

    if 'Survived' in X:
      y = X.Survived
      X = X.drop(['Survived'],axis=1)

      return X,y
    else:
      return X


We could follow a similar approach to 'Ticket' as in 'Cabin'. We shall try to extract any useful information possible from Ticket column.

### Feature - 'Ticket'

In [None]:
train_set.Ticket.nunique()

In [None]:
train_set.Ticket.head(25)

In [None]:
# Checking if the initial text in String are random/unique or if it has any significance
pd.Series([str(x).split(' ')[0] if ' ' in str(x) else x for x in train_set.Ticket]).value_counts()

The tickets seem mostly random at first glance, apart from the fact that they are mostly numerical, or numerical values preceded by some text. Individuals travelling together will have the same ticket code.

In [None]:
ticket_codes=[]
for x in train_set.Ticket.value_counts().index:
  if ' ' in x:
    ticket_codes.append(x.split(' ')[0])
print(ticket_codes)

In [None]:
ticket_codes = [x.replace('.','') for x in ticket_codes]
ticket_pattern_uniq = pd.Series(ticket_codes).unique()
pd.Series(ticket_codes).value_counts()

In [None]:
for x in ticket_pattern_uniq:
  train_set['Ticket_'+x] = [int(x == str(y).split(' ')[0].replace('.','')) for y in train_set.Ticket]

In [None]:
shared_tickets = train_set.Ticket.value_counts()[train_set.Ticket.value_counts()>1]
shared_tickets

In [None]:
for n in shared_tickets.unique():
  train_set[f'{n}_Passenger_Ticket'] =  0
for index,x in enumerate(train_set.Ticket):
  if x in shared_tickets.index:
    n = shared_tickets[x]
    train_set.loc[index,f'{n}_Passenger_Ticket'] =  1

In [None]:
train_set.sample(5)

### Feature - 'PassengerId'

In [None]:
train_set['PassengerId']

- PassengerId column contains unique integer values only, no useful information can be extracted from them. Dropping is ideal.

In [None]:
train_set['Name']

- For passenger 'Name', all seems to have a 'Title'.

In [None]:
train_set['Title'] = train_set['Name'].apply(lambda x: x.split(', ')[1].split('.')[0])
print(train_set['Title'].unique())
train_set['Title'].nunique()

- All passengers have a title in their name and to be specific there are 14 titles. 
- Mlle is French for Ms, so we shall replace this.

In [None]:
train_set['Title'] = train_set['Title'].replace(['Mlle'],['Ms'])
print(train_set['Title'].unique())
train_set['Title'].nunique()

In [None]:
pd.crosstab(train_set['Survived'],train_set['Title'])

In [None]:
fig,ax = plt.subplots(figsize=(12,5))
sns.countplot(hue='Survived',data=train_set,x='Title',)
plt.tight_layout()

We could also add a feature of Family Size

### Feature - 'SibSp' & 'Parch'

In [None]:
train_set['FamilySize'] = train_set.SibSp +	train_set.Parch
train_set['FamilySize'].unique()

We have now extracted information from the features 'Cabin','Ticket' and 'Name' and now we shall drop these columns along with 'PassengerId'.


In [None]:
train_set.drop(['Cabin','Ticket','PassengerId','Name'],axis=1,inplace=True)

In [None]:
cat_cols_updated = list(cat_cols)
cat_cols_updated.append('Title')
num_cols_updated = list(num_cols)
num_cols_updated.append('FamilySize')

In [None]:
for i,col in enumerate(['Title','FamilySize']):
  j=0
  fig,ax = plt.subplots(1,4,figsize=(24,4),)
  for col1 in ['Pclass','Sex','Embarked','Survived']:
      sns.countplot(x=col,data=train_set,hue=col1,ax=ax[j])
      j=j+1
  
  fig.suptitle(col,size=16)
  fig.tight_layout()

We will create a Custom Transformer to extract/create new features

In [None]:
from sklearn.base import TransformerMixin,BaseEstimator

In [None]:
class FeatureEngineering(TransformerMixin,BaseEstimator):
  """Derive 'Title', 'FamilySize' and Cabin/Ticket indicator features.

  ``fit`` learns, from the data it is given, the cabin letters, the ticket
  prefixes and the cabins/tickets that occur more than once. ``transform``
  then creates one indicator column per learned value, so the produced columns
  depend only on the fitted data and not on variables left in the notebook
  namespace. ``transform`` must therefore be called after ``fit`` (a pipeline's
  ``fit``/``fit_transform`` does this).

  The raw 'PassengerId', 'Name', 'Ticket' and 'Cabin' columns are dropped from
  the output.
  """

  def __init__(self):
    # No hyper-parameters.
    pass

  def fit(self,X,y=None):
    """Learn the Cabin/Ticket vocabularies from ``X`` and return ``self``."""
    # astype(object) so that value_counts only reports values that actually occur,
    # even if the column was cast to a categorical dtype beforehand.
    cabin_counts = X['Cabin'].astype(object).value_counts()
    # A Cabin entry may list several cabins separated by spaces; keep the first
    # character of each. Sorted so that the column order is reproducible.
    self.cabin_decks_ = sorted({part[0]
                                for cabin in cabin_counts.index
                                for part in str(cabin).split(' ') if part})
    # Cabin value -> number of passengers, for cabins occurring more than once.
    self.shared_cabins_ = cabin_counts[cabin_counts > 1]

    ticket_counts = X['Ticket'].astype(object).value_counts()
    # Text before the first space (dots removed) of tickets that contain a space.
    prefixes = [str(ticket).split(' ')[0].replace('.','')
                for ticket in ticket_counts.index if ' ' in str(ticket)]
    # De-duplicate while keeping first-seen order.
    self.ticket_prefixes_ = list(dict.fromkeys(prefixes))
    # Ticket value -> number of passengers, for tickets occurring more than once.
    self.shared_tickets_ = ticket_counts[ticket_counts > 1]
    return self

  def transform(self,X,y=None):
    """Return a new frame with the engineered columns added and raw text columns removed."""

    # reset_index returns a new frame, so the caller's data is not modified.
    X = X.reset_index(drop=True)

    # Creating Feature 'Title' (text between ', ' and the following '.')
    X['Title'] = X['Name'].apply(lambda x: x.split(', ')[1].split('.')[0])
    X['Title'] = X['Title'].replace(['Mlle'],['Ms'])

    # Creating Feature 'FamilySize'
    X['FamilySize'] = X.SibSp + X.Parch

    # One indicator per learned cabin letter (substring match on the Cabin text).
    for cabin_x in self.cabin_decks_:
      X[f'Cabin_{cabin_x}']=[int(cabin_x in str(x)) for x in X.Cabin]

    # One indicator per cabin value that was shared in the fitted data.
    for cabin_ in self.shared_cabins_.index:
      X[f'Cabin_shared_{cabin_}']=[int(x==cabin_) for x in X.Cabin]

    # One indicator per distinct occupancy count of the shared cabins.
    cabin_sizes = X['Cabin'].astype(object).map(self.shared_cabins_)
    for n in self.shared_cabins_.unique():
      X[f'{n}_Passenger_Cabin'] = (cabin_sizes == n).astype(int)

    # One indicator per learned ticket prefix.
    row_prefixes = [str(y).split(' ')[0].replace('.','') for y in X.Ticket]
    for x in self.ticket_prefixes_:
      X['Ticket_'+x] = [int(x == prefix) for prefix in row_prefixes]

    # One indicator per ticket value that was shared in the fitted data.
    for ticket_ in self.shared_tickets_.index:
      X[f'Ticket_shared_{ticket_}']=[int(x==ticket_) for x in X.Ticket]

    # One indicator per distinct group size of the shared tickets.
    ticket_sizes = X['Ticket'].astype(object).map(self.shared_tickets_)
    for n in self.shared_tickets_.unique():
      X[f'{n}_Passenger_Ticket'] = (ticket_sizes == n).astype(int)

    X = X.drop(['PassengerId','Name','Ticket', 'Cabin'],axis=1)

    return X


In [None]:
# A custom transformer to view the data inbetween the various stages of the pipeline
class TransformationSubStage(TransformerMixin,BaseEstimator):
  """Pass-through step that remembers the data most recently given to ``transform``.

  After a pipeline containing this step has run, ``transformed_X`` and
  ``transformed_y`` hold the objects last handed to :meth:`transform`.
  """

  def __init__(self):
    # Nothing has been captured until transform() is called.
    self.transformed_X, self.transformed_y = None, None

  def fit(self,X,y=None):
    """Nothing to learn; returns the instance itself."""
    return self

  def transform(self,X,y=None):
    """Store ``X`` and ``y`` on the instance and return ``X`` unchanged."""
    self.transformed_y = y
    self.transformed_X = X
    return X

### Building a Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

In [None]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [None]:
sub_pipe1 = Pipeline([
                      ('imputer',SimpleImputer(strategy='most_frequent')),
                      (('ohe',OneHotEncoder(handle_unknown='ignore')))
])

In [None]:
coltransformer = ColumnTransformer([
                                    ('num_impute',SimpleImputer(strategy='median'),['Age', 'SibSp', 'Parch', 'FamilySize']),
                                    ('num_impute2',SimpleImputer(strategy='mean'),['Fare']),
                                    ('cat_impute',sub_pipe1,['Sex', 'Embarked', 'Pclass', 'Title'])
],remainder='passthrough')

In [None]:
pipe = Pipeline([
                 ('feat_engg',FeatureEngineering()),
                 ('substage_feat_engg',TransformationSubStage()),
                 ('coltransformer',coltransformer),
                 ('substage_coltransformer',TransformationSubStage()),
                 ('num',StandardScaler()),
])

In [None]:
X_train,y_train = clean_data(train_original)
X_train = pipe.fit_transform(X_train)

In [None]:
X_train.shape

In [None]:
X_test,y_test = clean_data(test_set)
X_test = pipe.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.svm import SVC,LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.model_selection import cross_val_score,GridSearchCV,RandomizedSearchCV

In [None]:
models = {}

### Logistic Regression

In [None]:
# Each candidate C is listed once; the value 100 used to appear twice, which only added a
# redundant fit. NOTE(review): the second 100 may have been meant as 1000 - confirm.
logreg_gridSearch = LogisticRegressionCV(solver='saga',penalty='elasticnet',Cs=[0.1,0.2,0.5,1,10,15,20,25,50,100],l1_ratios=[0,0.35,0.5,0.65,1],n_jobs=-1,cv=3,random_state=0)
logreg_gridSearch.fit(X_train,y_train)

In [None]:
logreg = LogisticRegression(solver='saga',penalty='elasticnet',C=logreg_gridSearch.C_[0],l1_ratio=logreg_gridSearch.l1_ratio_[0],n_jobs=-1,random_state=0)
logreg.fit(X_train,y_train)

In [None]:
accuracy = accuracy_score(y_test,logreg.predict(X_test))
accuracy

In [None]:
models['Logistic Regression'] = accuracy

### Linear SVC

In [None]:
params ={'C':[0.01,0.1,1,2,5,10,20,50,100,1000],
         'penalty':['l1','l2']}
# dual=False: with the default squared-hinge loss the 'l1' penalty is only supported by the
# primal formulation. With the dual one every 'l1' candidate fails to fit and is scored
# as NaN, so it is never really part of the search.
lin_svc = GridSearchCV(LinearSVC(dual=False,random_state=0),params)

In [None]:
lin_svc.fit(X_train,y_train)
lin_svc.best_params_

In [None]:
lin_svc = lin_svc.best_estimator_

In [None]:
accuracy = accuracy_score(y_test,lin_svc.predict(X_test))
accuracy

In [None]:
models['Linear SVC'] = accuracy

### SVC

In [None]:
params ={'C':[0.01,0.1,1,2,5,10,20,50,100,1000],
         'kernel':['rbf','sigmoid']}
svc = GridSearchCV(SVC(random_state =0,probability=True),params)

In [None]:
svc.fit(X_train,y_train)

In [None]:
svc.best_params_

In [None]:
svc = svc.best_estimator_

In [None]:
accuracy = accuracy_score(y_test,svc.predict(X_test))
accuracy

In [None]:
models['SVC'] = accuracy

### Decision Tree Classifier

In [None]:
dt_clf = DecisionTreeClassifier(random_state =0)
dt_clf.fit(X_train,y_train)

In [None]:
accuracy = accuracy_score(y_test,dt_clf.predict(X_test))
accuracy

In [None]:
models['Decision Tree'] = accuracy

### Random Forest Classifier (Ensemble)

In [None]:
!pip install -q optuna

In [None]:
import optuna

In [None]:
def objective(trial):
  """Optuna objective: mean 3-fold cross-validated accuracy of a random forest.

  Samples the forest hyper-parameters from ``trial`` and evaluates them on the
  notebook-level ``X_train`` / ``y_train``.
  """
  rf_params = {
      'max_features': trial.suggest_float('max_features',0.3,1,step=0.05),
      'max_samples': trial.suggest_float('max_samples',0.3,0.95,step=0.05),
      'min_samples_split': trial.suggest_float('min_samples_split',0.01,0.11,step=0.01),
      'class_weight': trial.suggest_categorical('class_weight',['balanced', 'balanced_subsample',None]),
  }

  forest = RandomForestClassifier(random_state =0, **rf_params)

  fold_scores = cross_val_score(forest,X_train,y_train,cv=3,n_jobs=-1,scoring='accuracy')
  return fold_scores.mean()

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective,n_trials=50)

In [None]:
best_trial = study.best_trial
print("Accuracy : ",best_trial.value)
best_trial.params

In [None]:
rf_clf = RandomForestClassifier(**best_trial.params,random_state =0)
rf_clf.fit(X_train,y_train)

In [None]:
accuracy = accuracy_score(y_test,rf_clf.predict(X_test))
accuracy

In [None]:
models['Random Forest'] = accuracy

### K-Nearest Neighbor Classifier

In [None]:
params = {'n_neighbors' : [2,3,4,5,6,7,8,9,10]}
knn_clf = GridSearchCV(KNeighborsClassifier(), params)

In [None]:
knn_clf.fit(X_train,y_train)

In [None]:
knn_clf.best_params_

In [None]:
accuracy = accuracy_score(y_test,knn_clf.predict(X_test))
accuracy

In [None]:
models['K-Nearest Neighbor'] = accuracy

### Gaussian Naive Bayes Classifier

In [None]:
params = {'var_smoothing': np.logspace(0,-9, num=100)}
nb_clf = GridSearchCV(GaussianNB(), params)

In [None]:
nb_clf.fit(X_train,y_train)

In [None]:
nb_clf.best_params_

In [None]:
accuracy = accuracy_score(y_test,nb_clf.predict(X_test))
accuracy

In [None]:
models['Gaussian Naive Bayes'] = accuracy

In [None]:
import xgboost as xgb

In [None]:
cv = cross_val_score(xgb.XGBClassifier(),X_train,y_train,cv=5)
print(cv)
print(cv.mean())

In [None]:
tst = xgb.XGBClassifier().fit(X_train,y_train)
accuracy_score(y_test,tst.predict(X_test))

In [None]:
# Number of boosting rounds returned by xgb.cv for each call of objective_xgb,
# in call order (later looked up by trial number).
n_iter = []
def objective_xgb(trial):
  """Optuna objective: cross-validated AUC of an XGBoost model.

  Samples the booster hyper-parameters from ``trial``, runs ``xgb.cv`` with
  early stopping on the notebook-level ``X_train`` / ``y_train`` and appends
  the number of returned boosting rounds to the module-level ``n_iter`` list.

  Returns the best mean test AUC reached over the boosting rounds.
  """

  params = {
      'learning_rate' : trial.suggest_float('learning_rate',1e-8,0.5,log=True),
      'max_depth' : trial.suggest_int('max_depth',8,33),
      'subsample' : trial.suggest_float('subsample',0.5,1),
      'colsample_bynode' : trial.suggest_float('colsample_bynode',0.5,1),
      'lambda' : trial.suggest_float("lambda", 1e-8, 1.0, log=True),
      'alpha': trial.suggest_float("alpha", 1e-8, 1.0, log=True),
      'gamma' : trial.suggest_float("gamma", 1e-8, 1.0, log=True),

      'objective':'binary:logistic','random_state':0
  }

  dtrain = xgb.DMatrix(X_train,y_train)

  cv = xgb.cv(params, dtrain, num_boost_round=1000, metrics='auc', early_stopping_rounds=50)
  n_iter.append(len(cv))

  # Score the trial by its best boosting round. Averaging the AUC over every round
  # would mix in the early, under-fitted rounds and penalise slow learning rates.
  return cv['test-auc-mean'].max()

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_xgb,n_trials=750)

In [None]:
best_trial = study.best_trial
# The XGBoost study is scored with cross-validated AUC, not accuracy.
print("Best CV AUC : ",best_trial.value)
best_trial.params

In [None]:
n_iter[best_trial.number]

In [None]:
xgb_clf = xgb.XGBClassifier(**best_trial.params,n_estimators=n_iter[best_trial.number],random_state =0)
xgb_clf

In [None]:
xgb_clf.fit(X_train,y_train)

In [None]:
accuracy = accuracy_score(y_test,xgb_clf.predict(X_test))
accuracy

In [None]:
models['XGBoost'] = accuracy

In [None]:
models

### AdaBoost Classifier (Ensemble)

In [None]:
adaboost_base = AdaBoostClassifier(random_state=0)
adaboost_base.fit(X_train,y_train)
accuracy = accuracy_score(y_test,adaboost_base.predict(X_test))
accuracy

In [None]:
def objective_adaboost(trial):
  """Optuna objective: mean 3-fold cross-validated accuracy of an AdaBoost classifier.

  Samples ``n_estimators`` and a log-uniformly distributed ``learning_rate``
  from ``trial`` and evaluates them on the notebook-level ``X_train`` / ``y_train``.
  """

  params = {
      'n_estimators':trial.suggest_int('n_estimators',2,200),
      # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
      'learning_rate' : trial.suggest_float('learning_rate',1e-6,0.5,log=True)
  }

  clf = AdaBoostClassifier(**params,random_state=0)

  cv_score = cross_val_score(clf,X_train, y_train , scoring='accuracy', cv=3, n_jobs=-1,)

  return cv_score.mean()

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_adaboost,100)

In [None]:
best_trial = study.best_trial
best_trial.params

In [None]:
adaboost_clf = AdaBoostClassifier(**best_trial.params,random_state =0).fit(X_train,y_train)
accuracy = accuracy_score(y_test,adaboost_clf.predict(X_test))
accuracy
# 0.8379888268156425

In [None]:
models['AdaBoost Classifier'] = accuracy
models

### Voting Classifier (Ensemble)

In [None]:
votting_clf = VotingClassifier([('Linear SVC',lin_svc),('Logistic Regression',logreg),('SVC',svc),('Random Forest',rf_clf),('K-Nearest Neighbor',knn_clf)],n_jobs=-1)
votting_clf.fit(X_train,y_train)

In [None]:
accuracy = accuracy_score(y_test,votting_clf.predict(X_test))
accuracy

In [None]:
votting_clf2 = VotingClassifier([('Logistic Regression',logreg),('SVC',svc),('Random Forest',rf_clf),('K-Nearest Neighbor',knn_clf)],voting='soft',n_jobs=-1)
votting_clf2.fit(X_train,y_train)

In [None]:
accuracy = accuracy_score(y_test,votting_clf2.predict(X_test))
accuracy

In [None]:
votting_clf3 = VotingClassifier([('SVC',svc),('AdaBoost',adaboost_clf),('XGBoost',xgb_clf),],voting='soft',n_jobs=-1)
votting_clf3.fit(X_train,y_train)

In [None]:
accuracy = accuracy_score(y_test,votting_clf3.predict(X_test))
accuracy

In [None]:
test_data_raw.shape

In [None]:
test_data_raw.describe()

In [None]:
X_test_data = clean_data(test_data_raw)
X_test_data = pipe.transform(X_test_data)

In [None]:
vot_clf1_result = votting_clf.predict(X_test_data).astype(int)
vot_clf2_result = votting_clf2.predict(X_test_data).astype(int)
vot_clf3_result = votting_clf3.predict(X_test_data).astype(int)

In [None]:
svc_result = svc.predict(X_test_data).astype(int)
sub_data = {'PassengerId': test_data_raw.PassengerId, 'Survived': svc_result}
submission = pd.DataFrame(data=sub_data)
submission.to_csv('submission_svc.csv', index =False)

In [None]:
sub_data = {'PassengerId': test_data_raw.PassengerId, 'Survived': vot_clf1_result}
submission = pd.DataFrame(data=sub_data)
submission.to_csv('submission_vot_clf1.csv', index =False)

In [None]:
sub_data = {'PassengerId': test_data_raw.PassengerId, 'Survived': vot_clf2_result}
submission = pd.DataFrame(data=sub_data)
submission.to_csv('submission_vot_clf2.csv', index =False)

In [None]:
sub_data = {'PassengerId': test_data_raw.PassengerId, 'Survived': vot_clf3_result}
submission = pd.DataFrame(data=sub_data)
submission.to_csv('submission_vot_clf3.csv', index =False)

In [None]:
logreg_result = logreg.predict(X_test_data).astype(int)
sub_data = {'PassengerId': test_data_raw.PassengerId, 'Survived': logreg_result}
submission = pd.DataFrame(data=sub_data)
submission.to_csv('submission_logreg.csv', index =False)

In [None]:
lin_SVC_result = lin_svc.predict(X_test_data).astype(int)
sub_data = {'PassengerId': test_data_raw.PassengerId, 'Survived': lin_SVC_result}
submission = pd.DataFrame(data=sub_data)
submission.to_csv('submission_linSVC.csv', index =False)

In [None]:
xgboost_result = xgb_clf.predict(X_test_data).astype(int)
sub_data = {'PassengerId': test_data_raw.PassengerId, 'Survived': xgboost_result}
submission = pd.DataFrame(data=sub_data)
submission.to_csv('submission_xgboost.csv', index =False)

In [None]:
adaboost_result = adaboost_clf.predict(X_test_data).astype(int)
sub_data = {'PassengerId': test_data_raw.PassengerId, 'Survived': adaboost_result}
submission = pd.DataFrame(data=sub_data)
submission.to_csv('submission_adaboost_result.csv', index =False)

In [None]:
#!kaggle competitions submit -c titanic -f submission_adaboost_result.csv -m "AdaBoost"