In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler , StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier , GradientBoostingClassifier , VotingClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn import set_config
set_config(display='diagram')

In [2]:
file_path=r"https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/train.csv"
df=pd.read_csv(file_path)
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Excluding the unnessary columns/Feature
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [34]:
svm1 = SVC(probability=True, kernel='poly', degree=1)
svm2 = SVC(probability=True, kernel='poly', degree=2)
svm3 = SVC(probability=True, kernel='poly', degree=3)
rf1=RandomForestClassifier(n_estimators=100)

rf2=RandomForestClassifier(n_estimators=100,bootstrap=True)


estimators1 = [('svm1',svm1),('svm2',svm2),('svm3',svm3),('randomforest1',rf1),('randomForestwithreplacement',rf2)]

In [43]:
# defining  pipline for different Features

# For numeric
numeric_pipeline=Pipeline(steps=[
    ('impute',SimpleImputer(strategy='mean')),
    ('scalar',StandardScaler())
])

# Categorical 
categorical_pipeline=Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('encoding',OneHotEncoder(drop='first',sparse_output=False))

])

# Preporcessing

preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipeline, ['Age', 'Fare']),
        ('categoric', categorical_pipeline, ['Sex', 'Embarked'])
    ],
    remainder='passthrough'  # keep other columns (e.g., Pclass, SibSp, Parch)
)
# ColumnTransformer: "I handle which columns go where"
# Pipeline: "I handle what transformations to apply to a specific column subset"

# model pipline 

adaboost=Pipeline(steps=[
    ("preprocessing",preprocessing_pipeline),
    ("classifier",AdaBoostClassifier(n_estimators=100,algorithm='SAMME.R'))
])
gradientboost=Pipeline(steps=[
    ("preprocessing",preprocessing_pipeline),
    ("classifier",GradientBoostingClassifier(n_estimators=1000,learning_rate=0.01))
])


voting=Pipeline(steps=[
    ("preprocessing",preprocessing_pipeline),
    ("classifier",VotingClassifier(estimators=estimators1,voting='soft'))

])


In [44]:
X_train , X_test , y_train , y_test = train_test_split(df.drop(columns='Survived'),df["Survived"],test_size=0.19,random_state=42)

In [45]:
from sklearn.metrics import confusion_matrix , accuracy_score
adaboost.fit(X_train,y_train)
gradientboost.fit(X_train,y_train)
voting.fit(X_train,y_train)
print(f" Adaboost accuracy: {accuracy_score(y_test,adaboost.predict(X_test))}")
print(f" GradientBoost accuracy: {accuracy_score(y_test,voting.predict(X_test))}")
print(f" Voting accuracy: {accuracy_score(y_test,voting.predict(X_test))}")



 Adaboost accuracy: 0.8294117647058824
 GradientBoost accuracy: 0.8176470588235294
 Voting accuracy: 0.8176470588235294
