In [2]:
import pandas as pd
import re
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer , make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
import joblib
import warnings
warnings. filterwarnings('ignore')

In [3]:
# Read the training table, then split it into the target column (y) and
# everything else (X).
df = pd.read_csv('train.csv')
y = df['Survived']
X = df.drop(columns='Survived')

In [4]:
# Columns routed through the numeric (impute + scale) branch.
numeric_features = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

# Columns routed through the one-hot encoding branch.
text_features = [
    'Sex',
    'Embarked',
]

# Columns explicitly excluded from the model input.
columns_to_drop = [
    "PassengerId",
    "Ticket",
    "Cabin",
    "Name",
]

In [5]:
# Numeric branch: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Text branch: one-hot encode; categories not seen during fit are ignored
# at transform time instead of raising.
text_transformer = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch; the 'drop' entry discards the
# columns listed in columns_to_drop.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('text', text_transformer, text_features),
    ('drop', 'drop', columns_to_drop),
])

In [6]:
import sklearn

# Display the installed scikit-learn version next to the results.
# NOTE(review): presumably recorded so the pipeline saved with joblib further
# down can be reloaded under a matching scikit-learn version — confirm.
sklearn.__version__

'1.3.0'

In [7]:
# Boosted-tree classifier with a 0.1 step size (eta) and trees capped at depth 4.
classifier = XGBClassifier(max_depth=4, eta=0.1)

# Chain preprocessing and the classifier so both are fitted, applied and
# persisted as a single estimator.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

In [8]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

In [9]:
# Fit the preprocessing steps and the classifier on the training split only.
model.fit(X=X_train, y=y_train)

In [10]:
# Predict on the held-out split, then print accuracy followed by precision.
y_hat = model.predict(X_test)

for metric in (accuracy_score, precision_score):
    print(metric(y_test, y_hat))

0.8111111111111111
0.7714285714285715


In [12]:
# Persist the fitted pipeline (preprocessing + classifier) to a single file.
joblib.dump(value=model, filename='pipe.joblib')

['pipe.joblib']

In [13]:
# Read the saved pipeline back from disk and display it to confirm that it
# deserialises.
loaded_model = joblib.load(filename='pipe.joblib')

loaded_model