In [102]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import pickle

In [79]:
# Load the passenger table from "titanic.csv" (path is relative to the working directory).
data = pd.read_csv("titanic.csv")
# Bare trailing expression: the frame is rendered as the cell output.
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [80]:
data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 285.6 KB


In [81]:
# Features are every column except the "Survived" label; the label is kept separately.
x = data.drop(columns="Survived")
y = data["Survived"]

In [82]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [83]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((668, 11), (223, 11), (668,), (223,))

In [84]:
def preprocess(data):
    """Fit the feature preprocessing on a training frame and apply it.

    The caller's frame is never modified; a new transformed frame is returned
    together with the statistics collected from the input.

    Transformations applied:
      * ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` are dropped.
      * ``Pclass``, ``SibSp`` and ``Parch`` are cast to ``int8``.
      * ``Sex`` becomes an ``int8`` flag (1 for ``"male"``, 0 otherwise).
      * Missing ``Age`` values are filled with the median of this frame and
        the column is cast to ``int8`` (fractional ages are truncated).
      * ``Fare`` is replaced by ``log(1 + Fare)`` stored as ``float32``.
      * Missing ``Embarked`` values are filled with ``"S"`` and the column is
        replaced by one ``Embarked_<value>`` ``int8`` indicator per observed
        value, in ``value_counts()`` order (most frequent first).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named above.

    Returns
    -------
    tuple
        ``(features, data_info)`` where ``features`` is the transformed frame
        and ``data_info`` is a dict with the keys ``"Pclass.value_counts"``,
        ``"Sex"``, ``"Age_median"``, ``"SibSp"``, ``"Parch"``, ``"Embarked"``
        (counts before filling missing values) and
        ``"Embarked.value_counts"`` (counts after filling).

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    """
    # Gather every statistic from the untouched input first, so the recorded
    # values never depend on the order of the transformations below.
    age_med = data["Age"].median()
    data_info = {
        "Pclass.value_counts": data["Pclass"].value_counts(),
        "Sex": data["Sex"].value_counts(),
        "Age_median": age_med,
        "SibSp": data["SibSp"].value_counts(),
        "Parch": data["Parch"].value_counts(),
        "Embarked": data["Embarked"].value_counts(),
    }

    embarked = data["Embarked"].fillna("S")
    embarked_counts = embarked.value_counts()
    data_info["Embarked.value_counts"] = embarked_counts

    # drop()/assign() return new frames, so the caller's frame stays intact
    # without an explicit copy or any in-place mutation.
    features = data.drop(
        columns=["PassengerId", "Name", "Ticket", "Cabin", "Embarked"]
    ).assign(
        Pclass=data["Pclass"].astype("int8"),
        Sex=(data["Sex"] == "male").astype("int8"),
        Age=data["Age"].fillna(age_med).astype("int8"),
        SibSp=data["SibSp"].astype("int8"),
        Parch=data["Parch"].astype("int8"),
        # log1p is the numerically accurate form of log(1 + x) for small x.
        Fare=np.log1p(data["Fare"]).astype("float32"),
    )

    # One indicator column per embarkation value seen in this frame.
    for port in embarked_counts.keys():
        features["Embarked_" + port] = (embarked == port).astype("int8")

    return features, data_info

In [85]:
# Preprocess the training features; data_info keeps the statistics gathered from them
# (e.g. the Age median and the Embarked categories).
x_train_p, data_info = preprocess(x_train)

In [86]:
x_train_p

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_S,Embarked_C,Embarked_Q
105,3,1,28,0,0,2.185579,1,0,0
68,3,0,17,4,2,2.188856,1,0,0
253,3,1,30,1,0,2.839078,1,0,0
320,3,1,22,0,0,2.110213,1,0,0
706,2,0,45,0,0,2.674149,1,0,0
...,...,...,...,...,...,...,...,...,...
835,1,0,39,1,1,4.432700,0,1,0
192,3,0,19,1,0,2.180892,1,0,0
629,3,1,29,0,0,2.167143,0,0,1
559,3,0,36,1,0,2.912351,1,0,0


In [87]:
def prepreprocess_test(data, data_info):
    """Apply previously fitted preprocessing statistics to a new frame.

    The caller's frame is never modified. No statistic is computed from
    ``data``; the values stored in ``data_info`` are reused instead.

    Transformations applied:
      * ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` are dropped.
      * ``Pclass``, ``SibSp`` and ``Parch`` are cast to ``int8``.
      * ``Sex`` becomes an ``int8`` flag (1 for ``"male"``, 0 otherwise).
      * Missing ``Age`` values are filled with ``data_info["Age_median"]`` and
        the column is cast to ``int8`` (fractional ages are truncated).
      * ``Fare`` is replaced by ``log(1 + Fare)`` stored as ``float32``.
      * Missing ``Embarked`` values are filled with ``"S"`` and the column is
        replaced by one ``Embarked_<value>`` ``int8`` indicator per key of
        ``data_info["Embarked.value_counts"]``, in that order. A value that is
        not among those keys yields 0 in every indicator.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named above.
    data_info : dict
        Must provide ``"Age_median"`` and ``"Embarked.value_counts"`` (an
        object with ``keys()`` yielding the embarkation values).

    Returns
    -------
    pandas.DataFrame
        The transformed frame.

    Raises
    ------
    KeyError
        If an expected column or ``data_info`` entry is missing.
    """
    embarked = data["Embarked"].fillna("S")

    # drop()/assign() return new frames, so the caller's frame stays intact
    # without an explicit copy or any in-place mutation.
    features = data.drop(
        columns=["PassengerId", "Name", "Ticket", "Cabin", "Embarked"]
    ).assign(
        Pclass=data["Pclass"].astype("int8"),
        Sex=(data["Sex"] == "male").astype("int8"),
        Age=data["Age"].fillna(data_info["Age_median"]).astype("int8"),
        SibSp=data["SibSp"].astype("int8"),
        Parch=data["Parch"].astype("int8"),
        # log1p is the numerically accurate form of log(1 + x) for small x.
        Fare=np.log1p(data["Fare"]).astype("float32"),
    )

    # Indicator columns follow the stored category order so the layout does
    # not depend on which values happen to occur in this frame.
    for port in data_info["Embarked.value_counts"].keys():
        features["Embarked_" + port] = (embarked == port).astype("int8")

    return features

In [88]:
# Transform the held-out features with the statistics stored in data_info.
x_test_p = prepreprocess_test(x_test, data_info)

In [89]:
x_test_p

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_S,Embarked_C,Embarked_Q
495,3,1,29,0,0,2.738146,0,1,0
648,3,1,29,0,0,2.145931,1,0,0
278,3,1,7,4,1,3.405355,0,0,1
31,1,0,29,1,0,4.993969,0,1,0
255,3,0,29,0,2,2.787834,0,1,0
...,...,...,...,...,...,...,...,...,...
167,3,0,45,1,4,3.363842,1,0,0
306,1,0,29,0,0,4.717456,0,1,0
379,3,1,19,0,0,2.171907,1,0,0
742,1,0,21,2,2,5.573579,0,1,0


In [90]:
# Fit the min-max scaler on the training features only, then push both splits
# through the fitted scaler, restoring column labels and row index each time.
scaler = MinMaxScaler().fit(x_train_p)
x_train_sc, x_test_sc = (
    pd.DataFrame(scaler.transform(part), columns=part.columns, index=part.index)
    for part in (x_train_p, x_test_p)
)

In [91]:
# Keep the fitted scaler next to the preprocessing statistics.
data_info["scaler"] = scaler

In [92]:
x_train_sc

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_S,Embarked_C,Embarked_Q
105,1.0,1.0,0.3500,0.000,0.000000,0.350202,1.0,0.0,0.0
68,1.0,0.0,0.2125,0.500,0.333333,0.350727,1.0,0.0,0.0
253,1.0,1.0,0.3750,0.125,0.000000,0.454914,1.0,0.0,0.0
320,1.0,1.0,0.2750,0.000,0.000000,0.338125,1.0,0.0,0.0
706,0.5,0.0,0.5625,0.000,0.000000,0.428486,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
835,0.0,0.0,0.4875,0.125,0.166667,0.710264,0.0,1.0,0.0
192,1.0,0.0,0.2375,0.125,0.000000,0.349451,1.0,0.0,0.0
629,1.0,1.0,0.3625,0.000,0.000000,0.347248,0.0,0.0,1.0
559,1.0,0.0,0.4500,0.125,0.000000,0.466654,1.0,0.0,0.0


In [93]:
x_test_sc

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_S,Embarked_C,Embarked_Q
495,1.0,1.0,0.3625,0.000,0.000000,0.438741,0.0,1.0,0.0
648,1.0,1.0,0.3625,0.000,0.000000,0.343849,1.0,0.0,0.0
278,1.0,1.0,0.0875,0.500,0.166667,0.545650,0.0,0.0,1.0
31,0.0,0.0,0.3625,0.125,0.000000,0.800198,0.0,1.0,0.0
255,1.0,0.0,0.3625,0.000,0.333333,0.446703,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
167,1.0,0.0,0.5625,0.125,0.666667,0.538998,1.0,0.0,0.0
306,0.0,0.0,0.3625,0.000,0.000000,0.755892,0.0,1.0,0.0
379,1.0,1.0,0.2375,0.000,0.000000,0.348011,1.0,0.0,0.0
742,0.0,0.0,0.2625,0.250,0.333333,0.893070,0.0,1.0,0.0


In [94]:
# Logistic regression with default constructor arguments, trained on the scaled training features.
model = LogisticRegression()
# fit() is the last expression of the cell, so its return value is rendered as the cell output.
model.fit(x_train_sc, y_train)

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. 
seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [95]:
# Predict labels for the scaled held-out features and display them.
y_pred_test = model.predict(x_test_sc)
y_pred_test

array([0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1])

In [96]:
y_test.values

array([0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 1])

In [97]:
# Accuracy by hand: the fraction of held-out rows whose prediction equals the true label.
np.mean(y_pred_test == y_test.values)

np.float64(0.7937219730941704)

In [98]:
# Accuracy of the held-out predictions computed with scikit-learn's accuracy_score.
accuracy_score(y_test.values, y_pred_test)

0.7937219730941704

In [99]:
# Keep the fitted model in the same dict as the preprocessing statistics.
data_info["model"] = model

In [100]:
data_info

{'Pclass.value_counts': Pclass
 3    367
 1    163
 2    138
 Name: count, dtype: int64,
 'Sex': Sex
 male      437
 female    231
 Name: count, dtype: int64,
 'Age_median': np.float64(29.0),
 'SibSp': SibSp
 0    458
 1    153
 2     23
 4     13
 3     10
 8      7
 5      4
 Name: count, dtype: int64,
 'Parch': Parch
 0    507
 1     85
 2     64
 3      5
 5      4
 4      2
 6      1
 Name: count, dtype: int64,
 'Embarked': Embarked
 S    490
 C    116
 Q     60
 Name: count, dtype: int64,
 'Embarked.value_counts': Embarked
 S    492
 C    116
 Q     60
 Name: count, dtype: int64,
 'scaler': MinMaxScaler(),
 'model': LogisticRegression()}

In [101]:
# Serialize the whole data_info dict to "titanic_data_info.pck" with pickle.
# NOTE: loading a pickle can execute arbitrary code; only unpickle this file
# when it comes from a trusted source.
with open("titanic_data_info.pck", "wb") as f:
    pickle.dump(data_info, f)