In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.metrics import classification_report

In [None]:
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
df.head()

In [None]:
f=df.drop(["Ticket","Cabin","Name"],axis=1)
test=test.drop(["Ticket","Cabin","Name"],axis=1)


In [None]:
test

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
0,892,3,male,34.5,0,0,7.8292
1,893,3,female,47.0,1,0,7.0000
2,894,2,male,62.0,0,0,9.6875
3,895,3,male,27.0,0,0,8.6625
4,896,3,female,22.0,1,1,12.2875
...,...,...,...,...,...,...,...
413,1305,3,male,,0,0,8.0500
414,1306,1,female,39.0,0,0,108.9000
415,1307,3,male,38.5,0,0,7.2500
416,1308,3,male,,0,0,8.0500


In [None]:
average_age_df=df['Age'].mean()
average_age_test=test['Age'].mean()
df['Age']=df['Age'].fillna(average_age_df).astype(int)
test['Age']=test['Age'].fillna(average_age_test).astype(int)

In [None]:
df['Fare']=df['Fare'].astype(int)
test['Fare'].fillna(df['Fare'].mean(), inplace=True)
mapping = {'male': 1, 'female': 0}
df['Sex']=df['Sex'].map(mapping).fillna(1).astype(int)
test['Sex']=test['Sex'].map(mapping).fillna(1).astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Fare'].fillna(df['Fare'].mean(), inplace=True)


In [None]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,1,22,1,0,7
1,2,1,1,0,38,1,0,71
2,3,1,3,0,26,0,0,7
3,4,1,1,0,35,1,0,53
4,5,0,3,1,35,0,0,8


In [None]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
0,892,3,1,34,0,0,7.8292
1,893,3,0,47,1,0,7.0
2,894,2,1,62,0,0,9.6875
3,895,3,1,27,0,0,8.6625
4,896,3,0,22,1,1,12.2875


In [None]:
df=pd.get_dummies(df,columns=['Pclass'])
test=pd.get_dummies(test,columns=['Pclass'])
df['Pclass_1']=df['Pclass_1'].astype(int)
df['Pclass_2']=df['Pclass_2'].astype(int)
df['Pclass_3']=df['Pclass_3'].astype(int)
test['Pclass_1']=test['Pclass_1'].astype(int)
test['Pclass_2']=test['Pclass_2'].astype(int)
test['Pclass_3']=test['Pclass_3'].astype(int)

In [None]:
df=pd.get_dummies(df,columns=['Embarked'])
test=pd.get_dummies(test,columns=['Embarked'])
df['Embarked_S']=df['Embarked_S'].astype(int)
df['Embarked_C']=df['Embarked_C'].astype(int)
df['Embarked_Q']=df['Embarked_Q'].astype(int)
test['Embarked_S']=test['Embarked_S'].astype(int)
test['Embarked_C']=test['Embarked_C'].astype(int)
test['Embarked_Q']=test['Embarked_Q'].astype(int)

In [None]:
test.isnull().sum()

PassengerId    0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Pclass_1       0
Pclass_2       0
Pclass_3       0
dtype: int64

In [None]:
df.dtypes

PassengerId    int64
Survived       int64
Sex            int64
Age            int64
SibSp          int64
Parch          int64
Fare           int64
Pclass_1       int64
Pclass_2       int64
Pclass_3       int64
dtype: object

In [None]:
X = df.drop(['Survived'],axis=1)
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [None]:
model = RandomForestClassifier()
param_dist = {'n_estimators': randint(50,1000),
              'max_depth': randint(1,40)}
best_model=RandomizedSearchCV(model,param_distributions=param_dist,n_iter=5,cv=5)
best_model.fit(X_train,y_train)

In [None]:
predictions = best_model.predict(X_test)
predictions

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0])

In [None]:

print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.81      0.91      0.86       110
           1       0.82      0.67      0.74        69

    accuracy                           0.82       179
   macro avg       0.82      0.79      0.80       179
weighted avg       0.82      0.82      0.81       179



In [None]:
result = best_model.predict(test)

In [None]:
test['Survived'] = result
submission = test[['PassengerId', 'Survived']]
submission.to_csv("final_submission.csv", index=False)
