In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [3]:
# Read the labelled training set and the unlabelled competition test set.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

In [4]:
# Group the train and test frames in a list so that every preprocessing step below
# can be applied to both with a single loop. The frames are NOT concatenated: each
# loop mutates train_data and test_data in place, which keeps their engineered
# columns consistent with each other.
combined_data = [train_data, test_data]

In [5]:
### 1. Handling Missing Data:

# Each frame is imputed with its own statistics (train with train's, test with
# test's). The filled column is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on `dataset[col]`: that chained in-place form
# operates on an intermediate Series, raises a FutureWarning on recent
# pandas 2.x releases, and leaves the frame untouched once copy-on-write is
# active (the default from pandas 3.0).
for dataset in combined_data:
    # Age and Fare are numeric: fill gaps with the column median.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

    # Embarked is categorical: fill gaps with the most frequent value.
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])

    # HasCabin is 1 when a Cabin value is recorded and 0 when it is missing.
    # `notna()` tests for missingness directly rather than relying on NaN
    # happening to be a Python float.
    dataset['HasCabin'] = dataset['Cabin'].notna().astype(int)


In [6]:

### 2. Feature Transformation and Creation:

# Infrequent titles that are collapsed into the single category "Rare".
RARE_TITLES = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Alternate spellings folded into their common equivalent.
TITLE_ALIASES = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
# Integer codes assigned to the grouped titles.
TITLE_CODES = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

# NOTE: this cell re-codes Sex and Embarked in place, so it is not idempotent.
# Re-running it without reloading the data maps the already-numeric values to
# NaN and the integer casts below raise.
for dataset in combined_data:
    # Title is the first run of letters in Name that follows a space and ends
    # with a period. The pattern is a raw string so that `\.` reaches the regex
    # engine unchanged instead of being an invalid string escape.
    titles = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    titles = titles.replace(RARE_TITLES, 'Rare').replace(TITLE_ALIASES)

    # A title outside the known groups (or a name with no title at all) maps
    # to NaN, which would make the integer cast raise; treat it as "Rare".
    dataset['Title'] = titles.map(TITLE_CODES).fillna(TITLE_CODES['Rare']).astype(int)

    # Encode the remaining categorical columns as integers.
    dataset['Sex'] = dataset['Sex'].map({'female': 1, 'male': 0}).astype(int)
    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    # Fare is cast to int, which drops the fractional part (e.g. 7.25 -> 7).
    dataset['Fare'] = dataset['Fare'].astype(int)

    # FamilySize counts the passenger plus siblings/spouses and parents/children.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # IsAlone is 1 when the passenger travels with no family members, else 0.
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype(int)

In [7]:
# Keep the test-set passenger IDs for the submission file; the column itself
# is removed from the feature frames below.
test_passenger_id = test_data["PassengerId"].copy()

# Columns that are not fed to the model: identifiers, free text, and the raw
# family counts that FamilySize was derived from.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
train_data = train_data.drop(columns=drop_elements)
test_data = test_data.drop(columns=drop_elements)

In [8]:
# Preview the first five rows of the processed training frame.
print(train_data.head())

   Survived  Pclass  Sex   Age  Fare  Embarked  HasCabin  Title  FamilySize  \
0         0       3    0  22.0     7         0         0      1           2   
1         1       1    1  38.0    71         1         1      3           2   
2         1       3    1  26.0     7         0         0      2           1   
3         1       1    1  35.0    53         0         1      3           2   
4         0       3    0  35.0     8         0         0      1           1   

   IsAlone  
0        0  
1        0  
2        1  
3        0  
4        1  


In [9]:
from sklearn.svm import SVC

# The competition test set has no labels, so it is only used for the final
# predictions that go into the Kaggle submission.
X_test = test_data

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_data.drop(columns="Survived"),
    train_data["Survived"],
    test_size=0.2,
    random_state=42,
)

# Fit a linear-kernel SVM on the training portion.
svm = SVC(kernel='linear', C=1).fit(X_train, y_train)

# Score the model on the held-out validation rows.
y_pred = svm.predict(X_val)
acc_svm = accuracy_score(y_val, y_pred)
print(f"SVM Accuracy: {acc_svm}")

# Predictions for the Kaggle submission.
submission_predictions = svm.predict(X_test)

SVM Accuracy: 0.7988826815642458


In [10]:
# Pair each stored test-set PassengerId with its predicted label and write
# the result as the submission file (without the index column).
submission = pd.DataFrame({"PassengerId": test_passenger_id})
submission["Survived"] = submission_predictions
submission.to_csv('submission.csv', index=False)