In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('/content/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
df['family'] = df.SibSp + df.Parch

In [None]:
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,family
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1


In [None]:
df = df.drop(columns = ['SibSp','Parch','Cabin','Name','PassengerId','Ticket'])

In [None]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,family
0,0,3,male,22.0,7.25,S,1
1,1,1,female,38.0,71.2833,C,1
2,1,3,female,26.0,7.925,S,0
3,1,1,female,35.0,53.1,S,1
4,0,3,male,35.0,8.05,S,0


In [None]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),df.Survived,test_size = 0.2,random_state=42)

In [None]:
x_train['family'].isnull().sum()

0

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [None]:
num_features = ['Age','Fare']
num_pipeline = Pipeline(
    steps=[
        ('KNNImputer',KNNImputer(n_neighbors=5, weights='distance')),
        ('StandardScaler',StandardScaler())
    ]
)

In [None]:
cat_features = ['Embarked', 'Sex']
cat_pipeline = Pipeline(steps=[
    ('SimpleImputer', SimpleImputer(strategy='most_frequent')),
    ('ohe',OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',num_pipeline,num_features),
        ('cat',cat_pipeline,cat_features)
    ]
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf = Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('classifier',LogisticRegression(max_iter=1000, random_state=42))
    ]
)

In [None]:
from sklearn import set_config

set_config(display='diagram')
clf

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'preprocessor__num__KNNImputer__n_neighbors': [3, 5],
    'preprocessor__cat__SimpleImputer__strategy': ['most_frequent', 'constant'],
    'classifier__C': [0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(clf, param_grid, cv=10)
grid_search.fit(x_train, y_train)

In [None]:
print(f"Best params:")
print(grid_search.best_params_)

Best params:
{'classifier__C': 0.1, 'preprocessor__cat__SimpleImputer__strategy': 'most_frequent', 'preprocessor__num__KNNImputer__n_neighbors': 3}


In [None]:
print(f"Internal CV score: {grid_search.best_score_:.3f}")

Internal CV score: 0.784


In [None]:
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score

score = accuracy_score(y_pred,y_test)*100
score

77.6536312849162