# Best Model Selection

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [43]:
df=sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [44]:
df['sex']=df['sex'].map({'male':0,'female':1})
# fill age column with mean
df['age'].fillna(df['age'].mean(),inplace=True)
# Split into X and y
X=df[["pclass","age","fare","sex","parch","sibsp"]]
y=df["survived"]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].mean(),inplace=True)


In [45]:
X.isnull().sum()

pclass    0
age       0
fare      0
sex       0
parch     0
sibsp     0
dtype: int64

In [46]:
X.head()

Unnamed: 0,pclass,age,fare,sex,parch,sibsp
0,3,22.0,7.25,0,0,1
1,1,38.0,71.2833,1,0,1
2,3,26.0,7.925,1,0,0
3,1,35.0,53.1,1,0,1
4,3,35.0,8.05,0,0,0


In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,precision_score,recall_score
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

models=[LogisticRegression(),
        SVC(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        KNeighborsClassifier()]

# make list for naming during print 

models_name=["LogisticRegression",
             "SVC",
             "DecisionTreeClassifier",
             "RandomForestClassifier",
             "KNeighborsClassifier"] # here values must be same as models list
# Blank list 
models_score=[]
for model,models_name in zip(models,models_name):
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    accuracy=accuracy_score(y_test,y_pred)
    f1=f1_score(y_test,y_pred)
    precision=precision_score(y_test,y_pred)
    recall=recall_score(y_test,y_pred)
    models_score.append([models_name,accuracy,f1,precision,recall])
models_score=pd.DataFrame(models_score,columns=["Model","Accuracy","F1","Precision","Recall"]) # convert to dataframe
models_score.sort_values(by="Accuracy",ascending=False,inplace=True) # sort by accuracy
models_score.reset_index(drop=True,inplace=True) # reset index
models_score

Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,LogisticRegression,0.810056,0.757143,0.80303,0.716216
1,RandomForestClassifier,0.793296,0.737589,0.776119,0.702703
2,DecisionTreeClassifier,0.765363,0.712329,0.722222,0.702703
3,KNeighborsClassifier,0.698324,0.597015,0.666667,0.540541
4,SVC,0.653631,0.367347,0.75,0.243243


Whenever i said that this model is best so it will be on its performance here logistic refegression model is perfom well because its accuracy is 0.81,precision,f1 and recall score