### **Assingment requirment**

- Create a Logistic Regression using the titanic dataset
-  by making a model that only uses numerical features first and predicts the "survived" column.
- Make different models utilizing the feature engineering techniques learned in class
- Compare your results between validation and test cost functions and report the percent difference between them.
- Save your model using Joblib.

### **Group Member**
- Wajd Alturki
- Lama Alzahrani
- Mashael Alhussan
- Zarah Shibli


In [None]:
import seaborn as sns
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Feature Engineering
from sklearn.preprocessing import OneHotEncoder

### Load data

In [None]:
df = sns.load_dataset('titanic')


In [None]:
df.sample(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
490,0,3,male,,1,0,19.9667,S,Third,man,True,,Southampton,no,False
114,0,3,female,17.0,0,0,14.4583,C,Third,woman,False,,Cherbourg,no,True
644,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False


In [None]:
df = df.dropna()

### Split data

In [None]:
# Split Data to train and test 
train, test = train_test_split(
    df, 
    train_size = .80,
    test_size =.20,
    random_state=42
)

# Split train data to train and val
train, val = train_test_split(
    train,
    train_size = .80,
    test_size = .20,
    random_state=42
)

In [None]:
train.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [None]:
train.who.value_counts()

man      56
woman    51
child     9
Name: who, dtype: int64

### Features Selection (Numerical features)

In [None]:
def cleaner(df):
    "Return Only Numeric Features"
    
    # Create boolean array with only columns that are float64
    columns_bool = ((df.dtypes != "category")).values
    # Return only numeric columns
    df = df.iloc[:, columns_bool].dropna()

    # object columns 
    columns_obj = ['alive', 'alone', 'embark_town', 'who', 'adult_male','embarked', 'sex']

    #drop columns 
    df = df.drop(columns_obj,axis=1)


    return df

train_numeric = cleaner(train)
val_numeric = cleaner(val)

In [None]:
train_numeric.sample(3)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
772,0,2,57.0,0,0,10.5
311,1,1,18.0,2,2,262.375
681,1,1,27.0,0,0,76.7292


### split data to X, y

In [None]:
# Create X, y train and validation sets

target = "survived"

X_train = train_numeric.drop(target, axis=1)
y_train = train_numeric[target]

X_val = val_numeric.drop(target, axis=1)
y_val = val_numeric[target]

### Model (Numaric features)

In [None]:
lr = LogisticRegression()
lr.fit(X_train,y_train)
lr.score(X_train,y_train)

0.6724137931034483

In [None]:
pred_val = lr.predict(X_val)


In [None]:
print("Results for val")
print("Accuracy: ",accuracy_score(y_val, pred_val))
print("Recall: ",recall_score(y_val, pred_val))
print("Precision: ",precision_score(y_val, pred_val))

Results for val
Accuracy:  0.7241379310344828
Recall:  1.0
Precision:  0.7142857142857143


### Test model

In [None]:
test_numeric = cleaner(test)

X_test = test_numeric.drop(target, axis=1)
y_test = test_numeric[target]

pred_test = lr.predict(X_test)

print("Results for Test")
print("Accuracy: ",accuracy_score(y_test, pred_test))
print("Recall: ",recall_score(y_test, pred_test))
print("Precision: ",precision_score(y_test, pred_test))

Results for val
Accuracy:  0.7567567567567568
Recall:  1.0
Precision:  0.7272727272727273


### Features Enginering

In [None]:
# Create our One Hot Encoder object
one_hot = OneHotEncoder()

col_names = ["who", "embark_town"]

# One Hot encode the column
one_hot_df = one_hot.fit_transform(train[col_names]).toarray()
one_hot_df_val = one_hot.transform(val[col_names]).toarray()

one_hot_df_test = one_hot.transform(test[col_names]).toarray()

In [None]:
# look at categories
one_hot.categories_

[array(['child', 'man', 'woman'], dtype=object),
 array(['Cherbourg', 'Queenstown', 'Southampton'], dtype=object)]

In [None]:
# Create column names list for one hot encoded features
column_names = []

for y in range(len(one_hot.categories_)):
    for z in range(len(one_hot.categories_[y])):
        column_names.append(col_names[y]+"_"+one_hot.categories_[y][z])


column_names

['who_child',
 'who_man',
 'who_woman',
 'embark_town_Cherbourg',
 'embark_town_Queenstown',
 'embark_town_Southampton']

In [None]:
# create dataframe for oneHot 
oh_df = pd.DataFrame(
    one_hot_df,
    index=train.index,
    columns = column_names
)

oh_df_val = pd.DataFrame(
    one_hot_df_val,
    index=val.index,
    columns = column_names
)


oh_df_test = pd.DataFrame(
    one_hot_df_test,
    index=test.index,
    columns = column_names
)

In [None]:
train_new = X_train.merge(
    oh_df, # Dataframe to merge with train
    on=X_train.index # column to use to merge on
).set_index("key_0")

# Doing the same thing as above but to the validation data
val_new = X_val.merge(oh_df_val, on=X_val.index).set_index("key_0")

test_new = X_test.merge(oh_df_test, on=X_test.index).set_index("key_0")


In [None]:
train_new

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,who_child,who_man,who_woman,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
key_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
581,1,39.0,1,1,110.8833,0.0,0.0,1.0,1.0,0.0,0.0
248,1,37.0,1,1,52.5542,0.0,1.0,0.0,0.0,0.0,1.0
621,1,42.0,1,0,52.5542,0.0,1.0,0.0,0.0,0.0,1.0
309,1,30.0,0,0,56.9292,0.0,0.0,1.0,1.0,0.0,0.0
823,3,27.0,0,1,12.4750,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
689,1,15.0,0,1,211.3375,1.0,0.0,0.0,0.0,0.0,1.0
356,1,22.0,0,1,55.0000,0.0,0.0,1.0,0.0,0.0,1.0
772,2,57.0,0,0,10.5000,0.0,0.0,1.0,0.0,0.0,1.0
205,3,2.0,0,1,10.4625,1.0,0.0,0.0,0.0,0.0,1.0


In [None]:
lr2 = LogisticRegression()
lr2.fit(train_new, y_train)
lr2.score(train_new, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8275862068965517

### Predict

In [None]:
pre_val_2 = lr2.predict(val_new)
print("Results for val - Features Enginering")
print("Accuracy: ",accuracy_score(y_val, pre_val_2))
print("Recall: ",recall_score(y_val, pre_val_2))
print("Precision: ",precision_score(y_val, pre_val_2))

Results for val - Features Enginering
Accuracy:  0.7586206896551724
Recall:  0.85
Precision:  0.8095238095238095


### Testing

In [None]:
pre_test_2 = lr2.predict(test_new)
print("Results for Test - Features Enginering")
print("Accuracy: ",accuracy_score(y_test, pre_test_2))
print("Recall: ",recall_score(y_test, pre_test_2))
print("Precision: ",precision_score(y_test, pre_test_2))

Results for Test - Features Enginering
Accuracy:  0.6486486486486487
Recall:  0.75
Precision:  0.72


### Compare results 

In [None]:
print("Compare results for val")
compare_val = 1 - (accuracy_score(y_val, pre_val_2)/accuracy_score(y_val, pred_val))
print(compare_val)
print(" ")
print("Compare results for Test")
compare_test = 1 - (accuracy_score(y_test, pre_test_2)/accuracy_score(y_test, pred_test))
print(compare_test)

Compare results for val
-0.04761904761904745
 
Compare results for Test
0.1428571428571429


### Save models

In [None]:
# Used for saving out models
import joblib
import datetime

In [None]:
# Todays date as string
today = str(datetime.datetime.today())[:10].replace("-", "_")

joblib.dump(one_hot, f"./models/one_hot_{today}")
joblib.dump(lr, f"./models/model_1_{today}")
joblib.dump(lr2, f"./models/model_2_{today}")

['./models/model_2_2021_06_21']

In [None]:
jl_one_hot = joblib.load(f"./models/one_hot_{today}")
jl_model_1 = joblib.load(f"./models/model_1_{today}")
jl_model_2 = joblib.load(f"./models/model_2_{today}")