In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


In [6]:
# Source of the Titanic training data: a CSV hosted on GitHub, read straight
# over HTTPS (so this cell needs network access).
file_path = "https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/train.csv"
data = pd.read_csv(file_path)

# Preview the first five rows to sanity-check the columns that were loaded.
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Exclude the columns that are not used as model features.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
data = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

# Summarise the remaining columns, their dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [17]:
# Build the preprocessing + classifier pipeline.
# Columns are routed by name, so each feature type gets its own treatment.
numeric_columns = ['Age', 'Fare']
categorical_columns = ['Sex', 'Embarked']

# Numeric features: fill missing values with the column mean, then rescale
# with MinMaxScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode (dropping the first level of each feature) into a dense
# array.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False)),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Combine both branches. Every column not listed above (Pclass, SibSp, Parch)
# is passed through untouched.
# NOTE(review): the passthrough columns reach the RBF SVC unscaled while
# Age/Fare are min-max scaled -- confirm this is intentional.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_columns),
        ('cat', categorical_pipeline, categorical_columns),
    ],
    remainder='passthrough',
)

# Final estimator: preprocessing followed by an RBF-kernel support vector
# classifier.
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('clf', SVC(kernel='rbf')),
])


In [9]:
# Separate the predictors from the 'Survived' target, then hold out 20% of
# the rows for evaluation.
features = data.drop(columns=['Survived'])
target = data['Survived']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

In [18]:
from sklearn.metrics import accuracy_score

# Fit the whole pipeline on the training split, then predict the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7988826815642458

In [19]:
import pickle

# Serialise the `model` pipeline to 'svc_model.pkl' in the current working
# directory. The `with` block guarantees the file is flushed and closed even
# if pickling raises; the previous bare open() left the handle dangling.
with open('svc_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [21]:
from pickle import load

# Load the pipeline back from the file written by the save cell. A relative
# path is used (matching the name the model was saved under) so the notebook
# runs on any machine, not only one with a specific user's Desktop folder.
# NOTE: pickle.load executes arbitrary code from the file -- only load
# pickles you created yourself or otherwise trust.
model_path = 'svc_model.pkl'
with open(model_path, 'rb') as model_file:
    pipe = load(model_file)


# Construct a single-row test input with the same column names and value
# types as the training features.
test_input = pd.DataFrame({
    'Pclass': [3],
    'Sex': ['male'],
    'Age': [22.0],
    'SibSp': [1],
    'Parch': [0],
    'Fare': [7.25],
    'Embarked': ['S']
})

# predict() returns one label per input row; take the single row's label
# explicitly rather than relying on the truthiness of the whole array (which
# raises ValueError as soon as more than one row is passed).
prediction = pipe.predict(test_input)[0]
print("Survived" if prediction == 1 else "Unsurvived")

Unsurvived
