In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Load the dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')


In [3]:
test_df.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# Print the first few rows of the dataset
print(train_df.head())

# Check the number of rows and columns in the dataset
print(train_df.shape)

# Check the data types of the columns
print(train_df.info())

# Check the summary statistics of the numeric columns
print(train_df.describe())

# Check the missing values in the dataset
print(train_df.isnull().sum())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
(8

In [5]:
# Fill missing values in the Age column with the median age
train_df['Age'].fillna(train_df['Age'].median(), inplace=True)

# Fill missing values in the Embarked column with the mode value
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)

# Drop the Cabin column since it has too many missing values
train_df.drop('Cabin', axis=1, inplace=True)

# Create a new feature FamilySize by combining SibSp and Parch columns
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1

# Create a new feature IsAlone to indicate if the passenger is traveling alone
train_df['IsAlone'] = np.where(train_df['FamilySize'] == 1, 1, 0)

# Encode categorical variables using one-hot encoding
train_df = pd.get_dummies(train_df, columns=['Sex', 'Embarked'])

# Drop irrelevant columns
train_df.drop(['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch'], axis=1, inplace=True)

# Split the dataset into features and target
X = train_df.drop('Survived', axis=1)
y = train_df['Survived']


In [6]:
test_df.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
# Split the dataset into a training set and a validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Train and evaluate the logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_val)
acc_logreg = accuracy_score(y_val, y_pred)
print("Logistic Regression accuracy:", acc_logreg)

# Train and evaluate the decision tree model
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_val)
acc_dt = accuracy_score(y_val, y_pred)
print("Decision Tree accuracy:", acc_dt)

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
acc_rf = accuracy_score(y_val, y_pred)
print("Random Forest accuracy:", acc_rf)

from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_val)
acc_svc = accuracy_score(y_val, y_pred)
print("Support Vector Machine accuracy:", acc_svc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression accuracy: 0.7932960893854749
Decision Tree accuracy: 0.770949720670391
Random Forest accuracy: 0.8156424581005587
Support Vector Machine accuracy: 0.659217877094972


In [9]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [100, 200, 300],
              'max_depth': [5, 10, 15],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Train and evaluate the random forest model with the best hyperparameters
rf = RandomForestClassifier(n_estimators=best_params['n_estimators'],
                             max_depth=best_params['max_depth'],
                             min_samples_split=best_params['min_samples_split'],
                             min_samples_leaf=best_params['min_samples_leaf'])
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
acc_rf = accuracy_score(y_val, y_pred)
print("Random Forest accuracy:", acc_rf)

Best parameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Random Forest accuracy: 0.8044692737430168


In [10]:
test_df.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [11]:
# Preprocess the test set in the same way as the training set
test_df['Age'].fillna(test_df['Age'].median(), inplace=True)
test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1
test_df['IsAlone'] = np.where(test_df['FamilySize'] == 1, 1, 0)
test_df = pd.get_dummies(test_df, columns=['Sex', 'Embarked'])
test_df.drop(['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch', 'Cabin'], axis=1, inplace=True)

# Make predictions on the test set using the random forest model with the best hyperparameters
test_pred = rf.predict(test_df)

# Create a submission file in the correct format for Kaggle
submission_df = pd.read_csv('gender_submission.csv')
submission_df['Survived'] = test_pred
submission_df.to_csv('submission.csv', index=False)