In [39]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import classification_report, precision_recall_fscore_support,f1_score,recall_score,precision_score
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import RandomizedSearchCV

%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

import warnings
warnings.filterwarnings(action="ignore")

- Evaluate your model's performance with cross validation and using different metrics.
- Determine the model with the most appropriate parameters by hyperparameter tuning.

In [2]:
train_df = pd.read_csv("C:/Users/Ahmet/Desktop/data/train.csv")
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
def detect_outliers(df,features):
    outlier_indices = []
    
    for c in features:
        # 1st quartile
        Q1 = np.percentile(df[c],25)
        # 3rd quartile
        Q3 = np.percentile(df[c],75)
        # IQR
        IQR = Q3 - Q1
        # Outlier step
        outlier_step = IQR * 1.5
        # Detect outlier and their indeces
        outlier_list_col = df[(df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)].index
        # Store indeces
        outlier_indices.extend(outlier_list_col)
    
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list(i for i, v in outlier_indices.items() if v > 2)
    
    return multiple_outliers

In [4]:
train_df.loc[detect_outliers(train_df,["Age","SibSp","Parch","Fare"])]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,C23 C25 C27,S
88,89,1,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,19950,263.0,C23 C25 C27,S
159,160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S
180,181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S
201,202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S
324,325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S
341,342,1,1,"Fortune, Miss. Alice Elizabeth",female,24.0,3,2,19950,263.0,C23 C25 C27,S
792,793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S
846,847,0,3,"Sage, Mr. Douglas Bullen",male,,8,2,CA. 2343,69.55,,S
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S


In [5]:
# drop outliers
train_df = train_df.drop(detect_outliers(train_df,["Age","SibSp","Parch","Fare"]),axis = 0).reset_index(drop = True)

In [6]:
train_df.columns[train_df.isnull().any()]

Index(['Age', 'Cabin', 'Embarked'], dtype='object')

In [7]:
train_df["Embarked"] = train_df["Embarked"].fillna("C")
train_df[train_df["Embarked"].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [8]:
index_miss_age = list(train_df["Age"][train_df["Age"].isnull()].index)
for i in index_miss_age:
    age_pred = train_df["Age"][((train_df["SibSp"] == train_df.iloc[i]["SibSp"]) &(train_df["Parch"] == train_df.iloc[i]["Parch"])& (train_df["Pclass"] == train_df.iloc[i]["Pclass"]))].median()
    age_med = train_df["Age"].median()
    if not np.isnan(age_pred):
        train_df["Age"].iloc[i] = age_pred
    else:
        train_df["Age"].iloc[i] = age_med

In [9]:
train_df[train_df["Age"].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [10]:
name = train_df["Name"]
train_df["Title"] = [i.split(".")[0].split(",")[-1].strip() for i in name]

In [11]:
# convert to categorical
train_df["Title"] = train_df["Title"].replace(["Lady","the Countess","Capt","Col","Don","Dr","Major","Rev","Sir","Jonkheer","Dona"],"other")
train_df["Title"] = [0 if i == "Master" else 1 if i == "Miss" or i == "Ms" or i == "Mlle" or i == "Mrs" else 2 if i == "Mr" else 3 for i in train_df["Title"]]
train_df["Title"].head(20)

0     2
1     1
2     1
3     1
4     2
5     2
6     2
7     0
8     1
9     1
10    1
11    1
12    2
13    2
14    1
15    1
16    0
17    2
18    1
19    1
Name: Title, dtype: int64

In [12]:
train_df = pd.get_dummies(train_df,columns=["Title"])

In [13]:
train_df.drop(labels = ["Name"], axis = 1, inplace = True)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title_0,Title_1,Title_2,Title_3
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,0,0,1,0
1,2,1,1,female,38.0,1,0,PC 17599,71.283,C85,C,0,1,0,0
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,1,0,0
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,0,1,0,0
4,5,0,3,male,35.0,0,0,373450,8.05,,S,0,0,1,0


In [14]:
train_df["FamSize"] = train_df["SibSp"] + train_df["Parch"] + 1
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title_0,Title_1,Title_2,Title_3,FamSize
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,0,0,1,0,2
1,2,1,1,female,38.0,1,0,PC 17599,71.283,C85,C,0,1,0,0,2
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,1,0,0,1
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,0,1,0,0,2
4,5,0,3,male,35.0,0,0,373450,8.05,,S,0,0,1,0,1


In [15]:
train_df["family_size"] = [1 if i < 5 else 0 for i in train_df["FamSize"]]
train_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title_0,Title_1,Title_2,Title_3,FamSize,family_size
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,0,0,1,0,2,1
1,2,1,1,female,38.0,1,0,PC 17599,71.283,C85,C,0,1,0,0,2,1
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,1,0,0,1,1
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,0,1,0,0,2,1
4,5,0,3,male,35.0,0,0,373450,8.05,,S,0,0,1,0,1,1
5,6,0,3,male,26.0,0,0,330877,8.458,,Q,0,0,1,0,1,1
6,7,0,1,male,54.0,0,0,17463,51.862,E46,S,0,0,1,0,1,1
7,8,0,3,male,2.0,3,1,349909,21.075,,S,1,0,0,0,5,0
8,9,1,3,female,27.0,0,2,347742,11.133,,S,0,1,0,0,3,1
9,10,1,2,female,14.0,1,0,237736,30.071,,C,0,1,0,0,2,1


In [16]:
train_df["Pclass"] = train_df["Pclass"].astype("category")
train_df["Sex"] = train_df["Sex"].astype("category")
train_df = pd.get_dummies(train_df, columns= ["family_size","Embarked","Pclass","Sex"])
train_df.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Ticket,Fare,Cabin,Title_0,Title_1,...,family_size_0,family_size_1,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,1,0,22.0,1,0,A/5 21171,7.25,,0,0,...,0,1,0,0,1,0,0,1,0,1
1,2,1,38.0,1,0,PC 17599,71.283,C85,0,1,...,0,1,1,0,0,1,0,0,1,0
2,3,1,26.0,0,0,STON/O2. 3101282,7.925,,0,1,...,0,1,0,0,1,0,0,1,1,0
3,4,1,35.0,1,0,113803,53.1,C123,0,1,...,0,1,0,0,1,1,0,0,1,0
4,5,0,35.0,0,0,373450,8.05,,0,0,...,0,1,0,0,1,0,0,1,0,1


In [17]:
train_df.drop(labels = ["PassengerId", "Cabin", "Ticket"], axis = 1, inplace = True)
train_df.columns

Index(['Survived', 'Age', 'SibSp', 'Parch', 'Fare', 'Title_0', 'Title_1',
       'Title_2', 'Title_3', 'FamSize', 'family_size_0', 'family_size_1',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Sex_female', 'Sex_male'],
      dtype='object')

In [19]:
X = train_df.drop(["Survived"],axis=1)
y = train_df["Survived"]

In [35]:
log_reg_model = LogisticRegression()

cv = cross_validate(estimator=log_reg_model,
                    X=X,
                    y=y,
                    cv=10,
                    return_train_score=True,
                  
                   )

print('Train Scores :', cv['train_score'], sep = '\n')
print("-"*50)
print('Test Scores     :', cv['test_score'], sep = '\n')
print("-"*50)
print('Mean score of train set : ', cv['train_score'].mean())
print('Mean score of test set  : ', cv['test_score'].mean())


Train Scores :
[0.82449495 0.8259773  0.83228247 0.8184111  0.82849937 0.82723834
 0.82723834 0.83480454 0.8221942  0.82345523]
--------------------------------------------------
Test Scores     :
[0.83146067 0.85227273 0.75       0.88636364 0.80681818 0.76136364
 0.81818182 0.80681818 0.86363636 0.84090909]
--------------------------------------------------
Mean score of train set :  0.8264595832218783
Mean score of test set  :  0.821782431052094


In [43]:
cv = cross_validate(estimator=log_reg_model,
                    X=X,
                    y=y,
                    cv=10,
                    return_train_score=True,
                    scoring = ['accuracy', 'precision', 'r2']
                   )
print('Train Set Mean Accuracy  : {:.2f}  '.format(cv['train_accuracy'].mean()))
print('Train Set Mean R-square  : {:.2f}  '.format(cv['train_r2'].mean()))
print('Train Set Mean Precision : {:.2f}\n'.format(cv['train_precision'].mean()))

print('Test Set Mean Accuracy   : {:.2f}  '.format(cv['test_accuracy'].mean()))
print('Test Set Mean R-square   : {:.2f}  '.format(cv['test_r2'].mean()))
print('Test Set Mean Precision  : {:.2f}  '.format(cv['test_precision'].mean()))

Train Set Mean Accuracy  : 0.83  
Train Set Mean R-square  : 0.27  
Train Set Mean Precision : 0.79

Test Set Mean Accuracy   : 0.82  
Test Set Mean R-square   : 0.25  
Test Set Mean Precision  : 0.79  


- Our model works well, but when we look at the test scores, we see that some values ​​are far from the mean. We can also say that our R-Square values ​​are low. This may be because in logistic regression we try to predict only 0 and 1 values, so some data points are too far from our function.

In [42]:
print(log_reg_model.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [44]:
parameters = {"C": [10 ** x for x in range (-5, 5, 1)],
              "penalty": ['l1', 'l2', 'elasticnet'],
              
             }

In [53]:
rs_cv = RandomizedSearchCV(estimator=log_reg_model,
                           param_distributions = parameters,
                           cv = 10,
                           n_iter = 10,
                           random_state = 111,
                           scoring = 'accuracy'
                      )

rs_cv.fit(X, y)

RandomizedSearchCV(cv=10, estimator=LogisticRegression(),
                   param_distributions={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1,
                                              1, 10, 100, 1000, 10000],
                                        'penalty': ['l1', 'l2', 'elasticnet']},
                   random_state=111, scoring='accuracy')

In [55]:
print("Best parameters      : ", rs_cv.best_params_, "\n")
print("All accuracy values : ", rs_cv.cv_results_['mean_test_score'], "\n")
print("Best accuracy value : ", rs_cv.best_score_)

Best parameters      :  {'penalty': 'l2', 'C': 1} 

All accuracy values :  [       nan        nan        nan        nan        nan        nan
 0.67088866 0.82178243 0.81497702        nan] 

Best accuracy value :  0.821782431052094


- With the hyperparameter tuning we did with RandomizedSearchCV, the best parameters are as above.