In [1]:
import pandas as pd

In [2]:
# Location of the training split, relative to the notebook's working directory.
TRAIN_CSV_PATH = 'train.csv'

# Load the raw training data into a DataFrame.
train_data = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# Preview the first rows to confirm the CSV loaded with the expected columns.
print(train_data.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


# Check for missing values

In [4]:
# Per-column count of null entries, used to decide how each column is imputed.
missing_values = train_data.isna().sum()
print(missing_values)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [5]:
# Fill the gaps reported by the missing-value check. Each column is re-assigned
# with the filled result instead of calling `fillna(..., inplace=True)` on
# `train_data[col]`: that form operates on an intermediate Series, which pandas
# warns about (chained assignment) and which does not update the frame under
# Copy-on-Write.

# Impute missing values for Age with the median
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())

# Impute missing values for Embarked with the mode
# (`mode()` returns a Series; `[0]` takes its first entry)
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

# Drop the Cabin column due to too many missing values.
# `errors='ignore'` keeps the cell re-runnable once Cabin is already gone.
train_data = train_data.drop(columns=['Cabin'], errors='ignore')

In [6]:
# Recount nulls after imputation; every column is expected to report zero.
missing_values_after = train_data.isna().sum()
print(missing_values_after)

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [7]:
# Numeric columns that are standardized before modelling.
numerical_features = [
    'Age',
    'Fare',
    'SibSp',
    'Parch',
]

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
# Scaler used to standardize the columns listed in `numerical_features`.
scaler = StandardScaler()

In [11]:
# Standardize the numeric columns and write the result back into the same columns.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in the notebook, so the held-out rows contribute to the
# statistics used for scaling — consider fitting on the training split only.
train_data[numerical_features] = scaler.fit_transform(train_data[numerical_features])

In [12]:
# Re-inspect the frame after scaling: Age, SibSp, Parch and Fare now hold scaled values.
print(train_data.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex       Age  \
0                            Braund, Mr. Owen Harris    male -0.565736   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  0.663861   
2                             Heikkinen, Miss. Laina  female -0.258337   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  0.433312   
4                           Allen, Mr. William Henry    male  0.433312   

      SibSp     Parch            Ticket      Fare Embarked  
0  0.432793 -0.473674         A/5 21171 -0.502445        S  
1  0.432793 -0.473674          PC 17599  0.786845        C  
2 -0.474545 -0.473674  STON/O2. 3101282 -0.488854        S  
3  0.432793 -0.473674            113803  0.420730        S  
4 -0.474545 -0.473674            

In [13]:
# Boolean mask marking rows that repeat an earlier row in full.
duplicate_rows = train_data.duplicated()
duplicate_count = duplicate_rows.sum()
print("Number of duplicate rows: ", duplicate_count)

Number of duplicate rows:  0


In [15]:
# Categorical columns to expand into one indicator column per category.
categorical_features = ['Sex', 'Embarked']

# One-hot encode the categorical features; all other columns pass through unchanged.
train_data_encoded = pd.get_dummies(train_data, columns=categorical_features)

In [16]:
# Preview the encoded frame to confirm the Sex_* and Embarked_* indicator columns exist.
print(train_data_encoded.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name       Age     SibSp  \
0                            Braund, Mr. Owen Harris -0.565736  0.432793   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  0.663861  0.432793   
2                             Heikkinen, Miss. Laina -0.258337 -0.474545   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  0.433312  0.432793   
4                           Allen, Mr. William Henry  0.433312 -0.474545   

      Parch            Ticket      Fare  Sex_female  Sex_male  Embarked_C  \
0 -0.473674         A/5 21171 -0.502445           0         1           0   
1 -0.473674          PC 17599  0.786845           1         0           1   
2 -0.473674  STON/O2. 3101282 -0.488854           1         0           0   
3 -0.473674       

In [17]:
from sklearn.model_selection import train_test_split

In [19]:
# Columns excluded from the feature matrix: the target itself plus the
# identifier/free-text columns.
non_feature_columns = ['Survived', 'PassengerId', 'Name', 'Ticket']

# Target vector and feature matrix.
y = train_data_encoded['Survived']
X = train_data_encoded.drop(columns=non_feature_columns)

In [20]:
# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Logistic Regression

In [23]:
# Candidate values explored by the logistic-regression grid search:
# four values of C crossed with two solvers.
param_grid_lr = dict(
    C=[0.1, 1, 10, 100],
    solver=['liblinear', 'lbfgs'],
)

In [24]:
# Base estimator with default settings; C and solver are varied via `param_grid_lr`.
lr = LogisticRegression()

In [25]:
# 5-fold grid search over `param_grid_lr`, selecting by accuracy.
grid_search_lr = GridSearchCV(
    estimator=lr,
    param_grid=param_grid_lr,
    cv=5,
    scoring='accuracy',
)

Set up the grid search and fit the model

In [26]:
# NOTE(review): identical to the previous cell — this re-creates the same
# unfitted search object; the actual fit happens in a later cell.
grid_search_lr = GridSearchCV(lr, param_grid_lr, cv=5, scoring='accuracy')

In [28]:
# Sanity check: the feature matrix and the target must have the same row count.
x_train_shape, y_train_shape = X_train.shape, y_train.shape
print("Shape of X_train:", x_train_shape)
print("Shape of y_train:", y_train_shape)

Shape of X_train: (712, 10)
Shape of y_train: (712,)


In [35]:
# Fit the grid search on the training split. The fit and evaluation are
# deliberately not wrapped in a blanket try/except: if anything fails, the
# full traceback should surface in this cell instead of being reduced to a
# one-line message while the notebook carries on.
grid_search_lr.fit(X_train, y_train)
best_params_lr = grid_search_lr.best_params_
print(f"Best parameters for Logistic Regression: {best_params_lr}")

# Predict on the test set
y_pred_lr = grid_search_lr.predict(X_test)

# Evaluate the model on the held-out split
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
print(f"Logistic Regression - Precision: {precision_lr}, Recall: {recall_lr}")


Best parameters for Logistic Regression: {'C': 0.1, 'solver': 'lbfgs'}
Logistic Regression - Precision: 0.7714285714285715, Recall: 0.7297297297297297


In [36]:
from sklearn.ensemble import RandomForestClassifier

In [37]:
# Candidate values explored by the random-forest grid search
# (3 x 4 x 3 = 36 combinations).
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

In [38]:
# Base estimator; the fixed seed makes the forest reproducible across runs.
rf = RandomForestClassifier(random_state=42)

In [39]:
# 5-fold grid search over `param_grid_rf`, selecting by accuracy.
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    cv=5,
    scoring='accuracy',
)

In [41]:
# Run the random-forest grid search on the training split and report the winner.
grid_search_rf.fit(X_train, y_train)
best_params_rf = grid_search_rf.best_params_
print(f"Best parameters for Random Forest: {best_params_rf}")

# Score the selected configuration on the held-out split.
y_pred_rf = grid_search_rf.predict(X_test)
precision_rf, recall_rf = (
    precision_score(y_test, y_pred_rf),
    recall_score(y_test, y_pred_rf),
)
print(f"Random Forest - Precision: {precision_rf}, Recall: {recall_rf}")

Best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Random Forest - Precision: 0.8412698412698413, Recall: 0.7162162162162162


In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Support Vector Machine (SVM)

In [44]:
# Candidate values explored by the SVM grid search
# (4 x 2 x 2 = 16 combinations).
param_grid_svm = dict(
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto'],
    kernel=['linear', 'rbf'],
)

In [45]:
# Build the SVM grid search (5-fold, selected by accuracy) and fit it on the
# training split.
svm = SVC()
grid_search_svm = GridSearchCV(
    estimator=svm,
    param_grid=param_grid_svm,
    cv=5,
    scoring='accuracy',
)
grid_search_svm.fit(X_train, y_train)

# Report the winning configuration.
best_params_svm = grid_search_svm.best_params_
print(f"Best parameters for SVM: {best_params_svm}")

# Score the selected configuration on the held-out split.
y_pred_svm = grid_search_svm.predict(X_test)
precision_svm, recall_svm = (
    precision_score(y_test, y_pred_svm),
    recall_score(y_test, y_pred_svm),
)
print(f"SVM - Precision: {precision_svm}, Recall: {recall_svm}")

Best parameters for SVM: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
SVM - Precision: 0.8059701492537313, Recall: 0.7297297297297297


# Gradient Boosting Classifier

In [46]:
# Candidate values explored by the gradient-boosting grid search
# (3 x 3 x 3 = 27 combinations).
param_grid_gb = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.5],
    max_depth=[3, 5, 10],
)

In [47]:
# Build the gradient-boosting grid search (5-fold, selected by accuracy) and
# fit it on the training split. The fixed seed keeps the boosting reproducible.
gb = GradientBoostingClassifier(random_state=42)
grid_search_gb = GridSearchCV(
    estimator=gb,
    param_grid=param_grid_gb,
    cv=5,
    scoring='accuracy',
)
grid_search_gb.fit(X_train, y_train)

# Report the winning configuration.
best_params_gb = grid_search_gb.best_params_
print(f"Best parameters for Gradient Boosting: {best_params_gb}")

# Score the selected configuration on the held-out split.
y_pred_gb = grid_search_gb.predict(X_test)
precision_gb, recall_gb = (
    precision_score(y_test, y_pred_gb),
    recall_score(y_test, y_pred_gb),
)
print(f"Gradient Boosting - Precision: {precision_gb}, Recall: {recall_gb}")

Best parameters for Gradient Boosting: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200}
Gradient Boosting - Precision: 0.8666666666666667, Recall: 0.7027027027027027


In [49]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# Cross-Validation for Logistic Regression

In [50]:
# Cross-validate the grid-search object itself on the full X/y: the search is
# passed as the estimator, so it is refitted for each of the 5 outer folds.
lr_cv_scores = cross_val_score(
    estimator=grid_search_lr,
    X=X,
    y=y,
    cv=5,
    scoring='accuracy',
)
print(f"Cross-validated Accuracy for Logistic Regression: {lr_cv_scores.mean()}")

Cross-validated Accuracy for Logistic Regression: 0.78788525516289


# Cross-Validation for Random Forest

In [51]:
# Cross-validate the random-forest grid search on the full X/y; the search is
# passed as the estimator, so it is refitted for each of the 5 outer folds.
rf_cv_scores = cross_val_score(
    estimator=grid_search_rf,
    X=X,
    y=y,
    cv=5,
    scoring='accuracy',
)
print(f"Cross-validated Accuracy for Random Forest: {rf_cv_scores.mean()}")

Cross-validated Accuracy for Random Forest: 0.8283033080158182


# Cross-Validation for SVM

In [52]:
# Cross-validate the SVM grid search on the full X/y; the search is passed as
# the estimator, so it is refitted for each of the 5 outer folds.
svm_cv_scores = cross_val_score(
    estimator=grid_search_svm,
    X=X,
    y=y,
    cv=5,
    scoring='accuracy',
)
print(f"Cross-validated Accuracy for SVM: {svm_cv_scores.mean()}")

Cross-validated Accuracy for SVM: 0.8204130311970372


# Cross-Validation for Gradient Boosting

In [53]:
# Cross-validate the gradient-boosting grid search on the full X/y; the search
# is passed as the estimator, so it is refitted for each of the 5 outer folds.
gb_cv_scores = cross_val_score(
    estimator=grid_search_gb,
    X=X,
    y=y,
    cv=5,
    scoring='accuracy',
)
print(f"Cross-validated Accuracy for Gradient Boosting: {gb_cv_scores.mean()}")

Cross-validated Accuracy for Gradient Boosting: 0.8148389931579938


In [54]:
from sklearn.pipeline import Pipeline

In [55]:
# Scaling step for the pipeline below.
# NOTE(review): Age, Fare, SibSp and Parch were already standardized earlier in
# the notebook, so inside the pipeline they are scaled a second time, and the
# one-hot indicator columns in X are scaled as well — confirm this is intended.
preprocessor = StandardScaler()

In [56]:
# Define the model: a fresh, unfitted gradient-boosting estimator for the pipeline.
# This rebinds `gb`, which an earlier cell created with the same settings.
gb = GradientBoostingClassifier(random_state=42)

# Pipeline

In [57]:
# Chain scaling and the classifier into one estimator. The step names are the
# prefixes used to address hyperparameters (e.g. `model__max_depth`).
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', gb),
]
pipeline = Pipeline(steps=pipeline_steps)

In [58]:
# Hyperparameters to tune. The `model__` prefix routes each entry to the
# pipeline step named 'model'.
param_grid = dict(
    model__n_estimators=[50, 100, 200],
    model__learning_rate=[0.01, 0.1, 0.5],
    model__max_depth=[3, 5, 10],
)

In [59]:
# Tune the whole pipeline with a 5-fold grid search selected by accuracy, so
# the scaling step is refitted together with the model for every candidate.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

In [60]:
# Best parameters found by the pipeline grid search (keys carry the `model__` step prefix)
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

Best parameters: {'model__learning_rate': 0.01, 'model__max_depth': 5, 'model__n_estimators': 200}


In [61]:
# Score the tuned pipeline on the held-out split.
y_pred = grid_search.predict(X_test)
precision, recall = (
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
)
print(f"Precision: {precision}, Recall: {recall}")

Precision: 0.8666666666666667, Recall: 0.7027027027027027


I chose Gradient Boosting because, for the Titanic dataset, it is very important to control the proportion of false positives. Gradient Boosting is the most effective of the models tried at minimizing false positives, as shown by its having the highest precision (about 0.87). Even though Random Forest has a slightly higher recall (about 0.72 versus 0.70), that matters less here, since Gradient Boosting still has a reasonable recall.