# Basic Models (Decision Tree, Bagging Regressor, Random Forest)

## One last bit of data cleaning: removing two columns (`Ticket` and `Name`) that won't help here.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Read the working dataset. low_memory=False makes pandas parse the file in a
# single pass instead of in chunks, which avoids mixed-dtype inference.
df = pd.read_csv("updated.csv", low_memory=False)
# Preview the first rows.
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,False,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,True,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,True,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,True,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,False,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [2]:
# Remove the Ticket and Name columns in a single call; neither is used as a
# feature in the rest of the notebook.
df = df.drop(columns=['Ticket', 'Name'])

In [3]:
# Confirm that Ticket and Name no longer appear in the frame.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,False,3,male,22.0,1,0,7.25,S
1,True,1,female,38.0,1,0,71.2833,C
2,True,3,female,26.0,0,0,7.925,S
3,True,1,female,35.0,1,0,53.1,S
4,False,3,male,35.0,0,0,8.05,S


## Double check data

### Dtypes

In [4]:
# Inspect column dtypes; the preprocessing step later selects columns by dtype.
df.dtypes

Survived       bool
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

The boolean `Survived` column is cast to `object` here; it is converted to integer labels (0/1) just before the train/test split.

In [5]:
# Cast the boolean target to object dtype, then re-display dtypes to confirm.
# NOTE(review): the target is cast again to int right before the train/test
# split, so this intermediate cast may be unnecessary — confirm.
df['Survived'] = df['Survived'].astype(object)
df.dtypes

Survived     object
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

In [6]:
# Category counts for Sex before relabelling.
df['Sex'].value_counts()

male      577
female    312
Name: Sex, dtype: int64

In [7]:
# Shorten both category labels with one mapping, then show the counts again
# to verify that only the labels changed.
df['Sex'] = df['Sex'].replace({'male': 'M', 'female': 'F'})
df['Sex'].value_counts()

M    577
F    312
Name: Sex, dtype: int64

In [8]:
# Ensure Sex is object dtype so the dtype-based categorical selector picks it up.
# NOTE(review): the dtypes output recorded earlier already reports Sex as
# object, so this looks like a no-op — confirm before removing.
df['Sex'] = df['Sex'].astype(object)

### NaN Values

In [9]:
# Count missing values per column.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [10]:
# (rows, columns) before removing rows with missing values, for comparison afterwards.
df.shape

(889, 8)

We can afford to drop the 177 rows with a missing `Age` as well; imputing them (by averaging or by modeling) is not worth the effort here.

In [11]:
# Drop every row that contains at least one NaN (dropna defaults), then
# re-count missing values to verify none remain.
df = df.dropna()
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [12]:
# (rows, columns) after the row drop.
df.shape

(712, 8)

In [13]:
# Class balance of the target after the row drop.
df['Survived'].value_counts()

False    424
True     288
Name: Survived, dtype: int64

## Modeling

In [14]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, RocCurveDisplay, accuracy_score, precision_score, recall_score, classification_report
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

### Train Test Split

In [15]:
# Separate the target from the features; the target becomes integer labels.
y = df['Survived'].astype('int')
X = df.drop(columns='Survived')

# Hold-out split using scikit-learn's default test fraction; the fixed seed
# makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Dtype-based column selectors consumed by the preprocessing step.
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

### Pipelines

In [16]:
from sklearn import set_config

# Render estimators as HTML diagrams when they are the last expression in a cell.
set_config(display='diagram')

# Standardises numeric features (zero mean, unit variance on the fitted data).
scaler = StandardScaler()

# Dense one-hot encoding; categories unseen during fit are encoded as all zeros.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older installations.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [17]:
# Selectors by dtype. NOTE(review): these repeat the definitions made in the
# train/test split cell; one of the two copies can probably be removed.
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')
# Calling a selector on a DataFrame returns the list of matching column names.
num_columns = num_selector(X_train)
cat_columns = cat_selector(X_train)

#### Num Pipeline

In [18]:
# Numeric branch: a one-step pipeline wrapping the StandardScaler.
numeric_pipe = make_pipeline(scaler)
# Bare name so the notebook displays the pipeline.
numeric_pipe

#### Obj Pipeline

In [19]:
# Categorical branch: a one-step pipeline wrapping the OneHotEncoder.
categorical_pipe = make_pipeline(ohe)
# Bare name so the notebook displays the pipeline.
categorical_pipe

#### Combine pipelines

In [20]:
# Pair each branch with the selector that decides which columns it receives.
number_tuple = (numeric_pipe, num_selector)
category_tuple = (categorical_pipe, cat_selector)
# Combine both branches; with make_column_transformer's defaults, columns
# matched by neither selector are dropped.
preprocessor = make_column_transformer(number_tuple, category_tuple)
# Bare name so the notebook displays the transformer.
preprocessor

#### Fit to data

In [21]:
# Fit on the training split only, so the test split does not influence the
# learned scaling statistics or category lists.
preprocessor.fit(X_train)

In [22]:
# Apply the fitted preprocessing to both splits.
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

### Function

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, ConfusionMatrixDisplay

In [27]:
def evaluate(model,X_train_processed,X_test_processed,y_train,y_test):
    """Print accuracy, recall and precision of a fitted classifier on both splits.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)`` and ``score(X, y)``.
    X_train_processed, X_test_processed : feature matrices in the form the
        model was trained on.
    y_train, y_test : true labels for the corresponding split.

    Returns
    -------
    None. Six lines are written to stdout, each metric formatted to three
    decimals.

    Notes
    -----
    Only the arguments are used; the function does not read notebook globals.
    Recall and precision come from ``sklearn.metrics`` with their default
    settings (binary problem, positive label 1).
    """
    # Predict once per split and reuse the predictions for both metrics.
    train_pred = model.predict(X_train_processed)
    test_pred = model.predict(X_test_processed)

    # For classifiers, ``score`` is mean accuracy.
    print(f"Training Accuracy: {model.score(X_train_processed, y_train):.3f}")
    print(f"Test Accuracy: {model.score(X_test_processed, y_test):.3f}")
    print(f"Train Recall: {recall_score(y_train, train_pred):.3f}")
    print(f"Test Recall: {recall_score(y_test, test_pred):.3f}")
    print(f"Train Precision: {precision_score(y_train, train_pred):.3f}")
    print(f"Test Precision: {precision_score(y_test, test_pred):.3f}")

In [23]:
# Sanity-check the transformed arrays: total NaN count, dtype and shape of
# each split. Each group of lines is followed by a blank separator.
report_sections = (
    (
        (np.isnan(X_train_processed).sum().sum(), 'missing values in training data'),
        (np.isnan(X_test_processed).sum().sum(), 'missing values in testing data'),
    ),
    (
        ('All data in X_train_processed are', X_train_processed.dtype),
        ('All data in X_test_processed are', X_test_processed.dtype),
    ),
    (
        ('Shape test of data is', X_test_processed.shape),
        ('Shape train of data is', X_train_processed.shape),
    ),
)
for section in report_sections:
    for parts in section:
        print(*parts)
    print('\n')

0 missing values in training data
0 missing values in testing data


All data in X_train_processed are float64
All data in X_test_processed are float64


Shape test of data is (178, 10)
Shape train of data is (534, 10)




### First Model: Logreg

In [36]:
# Fit a logistic regression on the preprocessed features and report train/test
# accuracy.
# NOTE(review): the inputs already went through `preprocessor`, which scales
# the numeric columns, so the StandardScaler step here rescales all columns
# (including the one-hot ones) a second time. It is kept so the recorded
# scores stay comparable — consider dropping it.
logreg = LogisticRegression()
logreg_pipe = make_pipeline(scaler, logreg)
log = logreg_pipe.fit(X_train_processed, y_train)
print(logreg_pipe.score(X_train_processed, y_train))
print(logreg_pipe.score(X_test_processed, y_test))
# Predict through the pipeline so the test features receive the same scaling
# the classifier was trained behind; calling logreg.predict directly would
# skip that step and give predictions inconsistent with the scores above.
log_pred = logreg_pipe.predict(X_test_processed)

0.8127340823970037
0.7808988764044944
