# Simple Modeling Pipeline

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import  pickle

## I. Loading the dataset

In [29]:
# Load the Titanic CSV and reduce it to complete, numeric rows in one
# chained expression, so re-running this cell always starts from the file.
df = (
    pd.read_csv('data/titanic_data.csv')
    # Keep numeric and boolean columns via the public select_dtypes API
    # instead of the private DataFrame._get_numeric_data helper.
    .select_dtypes(include=['number', 'bool'])
    # Drop every row that has a missing value in any remaining column.
    .dropna()
)

In [30]:
# Labels: the Survived column.
y = df['Survived']
# Inputs: everything else, minus the label and the PassengerId column.
X = df.drop(columns=['Survived', 'PassengerId'])

In [31]:
# Peek at five randomly drawn feature rows (no seed is set, so the rows vary per run).
X.sample(n=5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
861,2,21.0,1,0,11.5
278,3,7.0,4,1,29.125
174,1,56.0,0,0,30.6958
387,2,36.0,0,0,13.0
24,3,8.0,3,1,21.075


In [32]:
# Summarize the feature frame: column names, non-null counts, dtypes and memory use.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  714 non-null    int64  
 1   Age     714 non-null    float64
 2   SibSp   714 non-null    int64  
 3   Parch   714 non-null    int64  
 4   Fare    714 non-null    float64
dtypes: float64(2), int64(3)
memory usage: 33.5 KB


## II. Modeling
### II.1. Train-test split

In [33]:
# X holds the model inputs and y the matching labels.
# test_size=0.33 holds out roughly a third of the rows for evaluation;
# the fixed random_state makes the shuffle, and so the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

print('Shape of TRAIN set: ', X_train.shape)
print('Shape of TEST set: ', X_test.shape)


Shape of TRAIN set:  (478, 5)
Shape of TEST set:  (236, 5)


`train_test_split` returns the following:
- `X_train`: The training input
- `X_test`: The testing input
- `y_train`: The training labels
- `y_test`: The testing labels


In [34]:
# Show three randomly drawn rows of the training inputs.
X_train.sample(n=3)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
614,3,35.0,0,0,8.05
712,1,48.0,1,0,52.0
441,3,20.0,0,0,9.5


In [35]:
# Show three randomly drawn training labels.
y_train.sample(n=3)

526    1
494    0
216    1
Name: Survived, dtype: int64

### II.2. Train the model
#### 1. Define the classifier

In [36]:
# Logistic-regression classifier; no arguments are passed, so scikit-learn's
# default hyperparameters are used.
clf = LogisticRegression()


#### 2. Fit the classifier to the data 

We fit the model on the training split: the features `X_train` and their labels `y_train`.

In [37]:
# Train the classifier on the training inputs and labels; fit() returns the
# estimator, which the notebook displays as the cell output.
clf.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### II.3. Testing the model's performance (on the test set)
#### 1. Predict results 

In [39]:
# Predict a class label for every row of the held-out test inputs.
prediction = clf.predict(X=X_test)

#### 2. Check the performance 

In [40]:
# Compare the test labels with the predictions. labels=[0,1] fixes the
# row/column order to class 0 then class 1; per scikit-learn's convention,
# rows are true labels and columns are predicted labels.
print(confusion_matrix(y_test, prediction, labels=[0,1]))

[[120  17]
 [ 50  49]]


## III. Save & load model
### III.1. Save model

In [41]:
# Serialize the fitted model to an in-memory bytes object.
s = pickle.dumps(clf)

file_name = 'model_name.sav'
# Persist the model to disk. 'wb' opens the file for writing in binary mode.
# The with-block closes the handle (flushing the pickle to disk) even if
# dump() raises, instead of leaving an unclosed file object behind.
with open(file_name, 'wb') as model_file:
    pickle.dump(clf, model_file)

### III.2. Load model from disk

In [42]:
# Load the model back from disk. 'rb' opens the file for reading in binary
# mode; the with-block closes the handle as soon as unpickling finishes.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files from
# a trusted source.
with open(file_name, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Re-run the evaluation with the restored model and print its confusion matrix
# for comparison with the one produced by the in-memory model.
prediction_loaded = loaded_model.predict(X_test)
print(confusion_matrix(y_test, prediction_loaded, labels=[0,1]))

[[120  17]
 [ 50  49]]
