# **Titanic Survival Prediction**

**Import essential libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

**Set visualization style**

In [2]:
# Apply the seaborn-flavoured dark-grid matplotlib style sheet to all later figures.
# NOTE(review): the 'seaborn-v0_8-*' style names only exist in newer matplotlib releases — confirm the runtime version.
plt.style.use('seaborn-v0_8-darkgrid')
# Make seaborn's "husl" palette the default colour cycle.
sns.set_palette("husl")
# IPython magic (not plain Python): render figures inline in the cell output.
%matplotlib inline

**Load the dataset**

In [3]:
# Read the passenger CSV into the raw frame, then report its shape and column names.
DATA_PATH = '/kaggle/input/test-file/tested.csv'
df = pd.read_csv(DATA_PATH)
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

Dataset shape: (418, 12)
Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


**Initial Data Exploration**

In [4]:
# Preview the top five rows of the raw frame with the notebook's rich table rendering.
print("First 5 rows of the dataset:")
preview = df.head(5)
display(preview)

First 5 rows of the dataset:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Column-level overview of the raw frame: dtypes, non-null counts and memory usage.
info_heading = "\nDataset Info:"
print(info_heading)
df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [6]:
# Count the missing entries in each column of the raw frame.
print("\nMissing Values:")
df.isna().sum()


Missing Values:


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [7]:
# Summary statistics for every column, numeric and non-numeric alike.
print("\nStatistical Summary:")
summary = df.describe(include='all')
display(summary)


Statistical Summary:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,418.0,418.0,418.0,418,418,332.0,418.0,418.0,418,417.0,91,418
unique,,,,418,2,,,,363,,76,3
top,,,,"Peter, Master. Michael J",male,,,,PC 17608,,B57 B59 B63 B66,S
freq,,,,1,266,,,,5,,3,270
mean,1100.5,0.363636,2.26555,,,30.27259,0.447368,0.392344,,35.627188,,
std,120.810458,0.481622,0.841838,,,14.181209,0.89676,0.981429,,55.907576,,
min,892.0,0.0,1.0,,,0.17,0.0,0.0,,0.0,,
25%,996.25,0.0,1.0,,,21.0,0.0,0.0,,7.8958,,
50%,1100.5,0.0,3.0,,,27.0,0.0,0.0,,14.4542,,
75%,1204.75,1.0,3.0,,,39.0,1.0,0.0,,31.5,,


**Data Cleaning & Preprocessing**

In [8]:
# Clean an independent deep copy so the raw frame `df` stays untouched.
df_clean = df.copy(deep=True)

In [9]:
# Baseline per-column null counts, taken before any imputation.
print("Missing values before cleaning:")
nulls_before = df_clean.isna().sum()
print(nulls_before)

Missing values before cleaning:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [10]:
# Missing-value strategy
# Age: fill each gap with the median age of the passengers that share the same
# Pclass and Sex; ages that are already present are left unchanged.
age_group_median = df_clean.groupby(['Pclass', 'Sex'])['Age'].transform('median')
df_clean['Age'] = df_clean['Age'].fillna(age_group_median)

In [11]:
# Deck: first character of the cabin code (e.g. 'B57' -> 'B'); passengers with
# no recorded cabin get the placeholder 'Unknown'.
# Read from the untouched raw frame `df` rather than from `df_clean`: a later
# cell drops `Cabin` from `df_clean`, and re-running this cell after that would
# otherwise silently overwrite every Deck value with 'Unknown'.
if 'Cabin' in df.columns:
    df_clean['Deck'] = df['Cabin'].str[0].fillna('Unknown')
else:
    # No cabin information at all: keep the column so downstream code still finds it.
    df_clean['Deck'] = 'Unknown'

In [12]:
# Re-check per-column null counts after the Age and Deck steps.
df_clean.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Deck             0
dtype: int64

In [13]:
# Fare: fill the missing fare with the median fare of the passenger's Pclass.
fare_class_median = df_clean.groupby('Pclass')['Fare'].transform('median')
df_clean['Fare'] = df_clean['Fare'].fillna(fare_class_median)

In [14]:
# Re-check per-column null counts after the Fare imputation.
df_clean.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
Deck             0
dtype: int64

In [15]:
# Drop Cabin when more than this share of its values is missing.
CABIN_MISSING_THRESHOLD = 0.7
# The membership check makes the cell safe to re-run: once Cabin has been
# dropped, indexing it again would raise a KeyError.
if 'Cabin' in df_clean.columns and df_clean['Cabin'].isnull().mean() > CABIN_MISSING_THRESHOLD:
    df_clean = df_clean.drop('Cabin', axis=1)

In [16]:
# Final null audit of the cleaned frame.
print("\nMissing values after cleaning:")
nulls_after = df_clean.isna().sum()
print(nulls_after)


Missing values after cleaning:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Deck           0
dtype: int64


**Feature Engineering**

In [17]:
# Engineer additional, potentially predictive features.
# FamilySize: SibSp + Parch + 1 (the +1 presumably counts the passenger themself).
df_clean['FamilySize'] = df_clean['SibSp'].add(df_clean['Parch']).add(1)

In [18]:
# IsAlone: 1 when FamilySize equals 1, otherwise 0.
df_clean['IsAlone'] = df_clean['FamilySize'].eq(1).astype(int)

In [19]:
# AgeGroup: bucket Age into labelled bands. Intervals are closed on the right,
# so the bands are [0, 12], (12, 18], (18, 35], (35, 60] and (60, 100].
# include_lowest=True keeps an age of exactly 0 in 'Child'; without it that
# value falls outside the first (0, 12] interval and becomes NaN.
AGE_BINS = [0, 12, 18, 35, 60, 100]
AGE_LABELS = ['Child', 'Teen', 'Young Adult', 'Adult', 'Senior']
df_clean['AgeGroup'] = pd.cut(df_clean['Age'],
                              bins=AGE_BINS,
                              labels=AGE_LABELS,
                              include_lowest=True)