# Model Evaluation for Classification Problems

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

## I. Loading the dataset

In [2]:
# Load the Titanic passenger data, keep only the numeric columns, and drop
# every row that still contains a missing value.
# `select_dtypes` is the public pandas API for dtype-based column selection
# (the private `_get_numeric_data()` helper is not part of the documented
# interface). Building `df` in a single chained expression also keeps this
# cell idempotent: re-running it always starts again from the CSV file.
df = (
    pd.read_csv('data/titanic_data.csv')
    .select_dtypes(include='number')
    .dropna()
)

In [3]:
# Name of the label column the model will predict
target = 'Survived'
# Feature matrix: all columns of df except the label and PassengerId
X = df.drop(columns=[target, 'PassengerId'])
# Label vector
y = df[target]

In [4]:
# Preview five randomly chosen rows; the fixed seed makes the same rows
# appear on every run so the displayed output is reproducible.
df.sample(5, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
553,554,1,3,22.0,0,0,7.225
405,406,0,2,34.0,1,0,21.0
6,7,0,1,54.0,0,0,51.8625
177,178,0,1,50.0,0,0,28.7125
856,857,1,1,45.0,1,1,164.8667


In [12]:
# Summarize column dtypes and non-null counts of the cleaned frame
# (after dropna() every column should report the same non-null count).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  714 non-null    int64  
 1   Survived     714 non-null    int64  
 2   Pclass       714 non-null    int64  
 3   Age          714 non-null    float64
 4   SibSp        714 non-null    int64  
 5   Parch        714 non-null    int64  
 6   Fare         714 non-null    float64
dtypes: float64(2), int64(5)
memory usage: 44.6 KB


## II. Modeling
### II.1. Train-test split

In [6]:
# Split features (X) and labels (y) into a training part and a held-out
# test part. 33% of the rows go to the test set; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Report how many rows and feature columns ended up in each part
print('Shape of TRAIN set: ', X_train.shape)
print('Shape of TEST set: ', X_test.shape)


Shape of TRAIN set:  (478, 5)
Shape of TEST set:  (236, 5)


### II.2. Train the model

In [9]:
# Fit a logistic-regression classifier (default hyperparameters) on the
# training split. The fit call stays the last expression so the notebook
# displays the fitted estimator.
clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### II.3. Testing the model's performance

In [10]:
# Predict a class label for every row of the held-out test set
prediction = clf.predict(X_test)

#### Confusion Matrix

In [11]:
# Rows are the true classes and columns the predicted classes,
# both in the order given by labels ([0, 1]).
cm = confusion_matrix(y_test, prediction, labels=[0, 1])
print(cm)

[[120  17]
 [ 50  49]]


#### AUC

*TODO: this section is still empty — compute and report the ROC AUC of `clf` on the test set.*


