In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from feature_engine.selection import DropFeatures
from feature_engine.imputation import MeanMedianImputer, AddMissingIndicator
from feature_engine.encoding import OneHotEncoder

In [2]:
# Load the semicolon-separated Titanic CSV from the working directory.
df = pd.read_csv("./titanic(2).csv", sep=";")

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,2113375.0,B5,S,1
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,1
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,0
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,0
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,0


In [4]:
 df.isna().sum()

pclass         0
name           0
sex            0
age          263
sibsp          0
parch          0
ticket         0
fare           0
cabin       1013
embarked       2
survived       0
dtype: int64

In [5]:
# Show the dtype pandas inferred for each column of `df`.
df.dtypes

pclass        int64
name         object
sex          object
age         float64
sibsp         int64
parch         int64
ticket       object
fare        float64
cabin        object
embarked     object
survived      int64
dtype: object

In [6]:
# Feature matrix: the nine predictor columns (the passenger name is left out).
X = df.loc[
    :,
    ["pclass", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked"],
]
# Target vector: the `survived` column.
Y = df.loc[:, "survived"]

In [7]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [8]:
# Re-wrap each split as a DataFrame with explicit column labels.
feature_columns = [
    "pclass", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked",
]
target_columns = ["survived"]

X_train = pd.DataFrame(X_train, columns=feature_columns)
X_test = pd.DataFrame(X_test, columns=feature_columns)
Y_train = pd.DataFrame(Y_train, columns=target_columns)
Y_test = pd.DataFrame(Y_test, columns=target_columns)

In [9]:
# Dtypes of the original frame `df` (not of the train/test splits).
df.dtypes

pclass        int64
name         object
sex          object
age         float64
sibsp         int64
parch         int64
ticket       object
fare        float64
cabin        object
embarked     object
survived      int64
dtype: object

In [10]:
# Numerical columns that contain missing values and therefore get imputed.
NUMERICAL_VARS_WITH_NA = [
    "age",
]
# Categorical columns that get one-hot encoded.
CATEGORICAL_VARS = [
    "sex",
]
# Columns removed before any other preprocessing step.
DROP_FEATURES = [
    "sibsp",
    "parch",
    "ticket",
    "fare",
    "cabin",
    "embarked",
]
# Columns that remain once DROP_FEATURES has been removed.
FEATURES = [
    "pclass",
    "sex",
    "age",
]

In [11]:
# Preprocessing + classifier pipeline; the steps run in the order listed.
genero_pipe = Pipeline(
    steps=[
        # Discard the columns this model does not use.
        ("drop_features", DropFeatures(features_to_drop=DROP_FEATURES)),
        # Add a missing-value indicator for the numerical variables before imputing them.
        ("missing_indicator", AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),
        # Fill missing numerical values with the mean.
        (
            "mean_imputation",
            MeanMedianImputer(imputation_method="mean", variables=NUMERICAL_VARS_WITH_NA),
        ),
        # One-hot encode the categorical variables, keeping a column for every category.
        ("OHE", OneHotEncoder(variables=CATEGORICAL_VARS, drop_last=False)),
        # Final estimator: Gaussian Naive Bayes.
        ("GaussianNB", GaussianNB()),
    ]
)

In [12]:
# Fit every step on the training split; the single-column target frame is
# flattened to a 1-D array before being handed to the pipeline.
genero_pipe.fit(X_train, Y_train.to_numpy().ravel())

Pipeline(steps=[('drop_features',
                 DropFeatures(features_to_drop=['sibsp', 'parch', 'ticket',
                                                'fare', 'cabin', 'embarked'])),
                ('missing_indicator', AddMissingIndicator(variables=['age'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['age'])),
                ('OHE', OneHotEncoder(variables=['sex'])),
                ('GaussianNB', GaussianNB())])

In [13]:
# Single hand-built passenger used to smoke-test the fitted pipeline.
# `sex` must be one of the string categories present in the training data
# ("female" / "male"): the one-hot step compares values against those strings,
# so the previous integer 1 matched neither category and left both encoded
# columns at 0. "female" is used here; switch to "male" to score the other case.
# The sibsp/parch/ticket/fare/cabin/embarked values are placeholders for
# columns listed in DROP_FEATURES.
ejemplo = pd.DataFrame(
    [[1, "female", 14, 2, 1, 1, 1, 1, "S"]],
    columns=["pclass", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked"],
)

In [14]:
# Score the example row and print the label predicted for it.
pred = genero_pipe.predict(ejemplo)
predicted_label = pred[0]
print(predicted_label)


1


In [15]:
# Persist the fitted pipeline. The context manager guarantees the file handle
# is flushed and closed; the previous bare open() call never closed it.
with open('./cfk.pkl', 'wb') as model_file:
    joblib.dump(genero_pipe, model_file)