In [1]:
# Title and Introduction
# Name: Austin Githinji
# Title: Exploratory Data Analysis (EDA) of the Titanic dataset
# CyberShujaa ID:CS-EH03-25417


In [2]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure display options
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.set_option('display.max_columns', None)
# sns.set() is only a legacy alias; set_theme() is the preferred seaborn entry
# point for the same style/palette/font-scale configuration.
sns.set_theme(style='whitegrid', palette='muted', font_scale=1.1)

In [3]:
# Read the Titanic training split from the Kaggle input directory
df = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/train.csv')

# Preview the leading rows (five is also head()'s default)
df.head(5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Basic dataset information: dimensions, column labels, dtypes and null counts
print("Dataset Shape:", df.shape)
print("\nColumn Names:\n", df.columns)
print("\nData Types and Non-Null Counts:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

Dataset Shape: (891, 12)

Column Names:
 Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

Data Types and Non-Null Counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [5]:
# Summary statistics restricted to the numeric columns
numeric_summary = df.describe(include=[np.number])
numeric_summary

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Count / unique / top / freq for the object-typed (text) columns
categorical_summary = df.describe(include=[object])
categorical_summary


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [7]:
# Tally missing entries per column and express them as a share of all rows
row_count = len(df)
missing_values = df.isna().sum()
missing_percent = (missing_values / row_count) * 100

# Put counts and percentages side by side, worst-affected columns first
missing_data = pd.concat(
    [
        missing_values.rename('Missing Values'),
        missing_percent.rename('Percentage (%)'),
    ],
    axis=1,
)
missing_data.sort_values(by='Missing Values', ascending=False)

Unnamed: 0,Missing Values,Percentage (%)
Cabin,687,77.104377
Age,177,19.86532
Embarked,2,0.224467
PassengerId,0,0.0
Survived,0,0.0
Pclass,0,0.0
Name,0,0.0
Sex,0,0.0
SibSp,0,0.0
Parch,0,0.0


In [8]:
# Remove the 'Cabin' column; errors='ignore' makes a re-run of this cell harmless
df.drop(columns=['Cabin'], inplace=True, errors='ignore')

# Work out the replacement values first: median age and most frequent port
age_median = df['Age'].median()
most_common_port = df['Embarked'].mode()[0]

# Impute the gaps in 'Age' and 'Embarked' with those values
df['Age'] = df['Age'].fillna(age_median)
df['Embarked'] = df['Embarked'].fillna(most_common_port)

# Re-count missing values per column to confirm the result
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [None]:
# Side-by-side boxplots to eyeball outliers in the numeric columns
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

for ax, col in zip(axes, ['Age', 'Fare']):
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')

fig.tight_layout()
plt.show()

In [None]:
# Detect and cap outliers in 'Fare' with the IQR (interquartile range) rule:
# anything beyond 1.5 * IQR outside the quartiles is pulled back to the fence.
Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Series.clip performs the two-sided cap in one call (replacing the nested
# np.where) and keeps the result a Series aligned on df's index. Missing
# values, if any, pass through unchanged.
df['Fare'] = df['Fare'].clip(lower=lower_bound, upper=upper_bound)

In [None]:
# Categorical features to summarise
categorical_cols = ['Sex', 'Pclass', 'Embarked']
section_break = "\n-----------------------------------\n"

# Print the frequency of each category, one feature at a time
for col in categorical_cols:
    frequencies = df[col].value_counts()
    print(f"Value counts for {col}:\n")
    print(frequencies)
    print(section_break)

In [None]:
# Count plots for the categorical variables, one panel per (column, title) pair
count_panels = [
    ('Sex', 'Gender Distribution'),
    ('Pclass', 'Passenger Class Distribution'),
    ('Embarked', 'Port of Embarkation Distribution'),
]

fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for ax, (column, panel_title) in zip(axes, count_panels):
    sns.countplot(x=column, data=df, ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

In [None]:
# Summary statistics for the two continuous features
continuous_features = ['Age', 'Fare']
df[continuous_features].describe()

In [None]:
# Swap any +/-inf in the numeric columns for NaN so plotting cannot choke on them
hist_features = ['Age', 'Fare']
df[hist_features] = df[hist_features].replace([np.inf, -np.inf], np.nan)

# Histogram with a KDE overlay for each numeric feature
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, feature in zip(axes, hist_features):
    sns.histplot(df[feature].dropna(), kde=True, bins=30, ax=ax)
    ax.set_title(f'{feature} Distribution')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations across the numeric columns only
corr_matrix = df.corr(numeric_only=True)

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap of Numerical Features')
plt.show()

In [None]:
# Mean of 'Survived' (i.e. survival rate) per category, one panel per feature
rate_panels = [
    ('Pclass', 'Survival Rate by Passenger Class'),
    ('Sex', 'Survival Rate by Gender'),
    ('Embarked', 'Survival Rate by Embarkation Port'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, (column, panel_title) in zip(axes, rate_panels):
    sns.barplot(x=column, y='Survived', data=df, ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

In [None]:
# Compare the spread of each numeric feature between the two survival outcomes
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

for ax, feature in zip(axes, ['Age', 'Fare']):
    sns.boxplot(x='Survived', y=feature, data=df, ax=ax)
    ax.set_title(f'{feature} vs Survival')

fig.tight_layout()
plt.show()

In [None]:
# Mean survival per (class, sex) pair, pivoted so each sex becomes a column
survival_by_class_sex = df.groupby(['Pclass', 'Sex'])['Survived'].mean()
multi_group = survival_by_class_sex.unstack()

# Grouped bar chart: classes on the x-axis, one bar per sex
ax = multi_group.plot.bar(figsize=(8, 5), rot=0)
ax.set_title('Survival Rate by Class and Gender')
ax.set_ylabel('Average Survival Rate')
ax.set_xlabel('Passenger Class')
ax.legend(title='Sex')
plt.show()

In [None]:
# Age against fare, coloured by survival outcome and marked by passenger class
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x='Age',
    y='Fare',
    hue='Survived',
    style='Pclass',
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Age, Fare, and Survival by Passenger Class')
plt.show()

In [None]:
# Passenger counts per (port, class) row, split into survival-outcome columns
row_keys = [df['Embarked'], df['Pclass']]
crosstab = pd.crosstab(index=row_keys, columns=df['Survived'])

# Annotated heatmap of those integer counts
ax = sns.heatmap(crosstab, annot=True, cmap='Blues', fmt='d')
ax.set_title('Survival Counts by Embarkation Port and Passenger Class')
plt.show()

In [None]:
# Number of passengers in each survival outcome
ax = sns.countplot(data=df, x='Survived')
ax.set_title('Distribution of Survival')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Survived (0)', 'Survived (1)'])
plt.show()

# 'Survived' is 0/1, so its mean times 100 is the survival percentage
survival_rate = df['Survived'].mean() * 100
print(f"Overall survival rate: {survival_rate:.2f}%")

In [None]:
# Survival-rate bars split by a second feature, to expose interaction effects
interaction_panels = [
    ('Sex', 'Pclass', 'Survival by Gender and Class'),
    ('Embarked', 'Sex', 'Survival by Embarkation and Gender'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 6))
for ax, (x_col, hue_col, panel_title) in zip(axes, interaction_panels):
    sns.barplot(x=x_col, y='Survived', hue=hue_col, data=df, ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

In [None]:
# Report remaining missing values per column
null_counts = df.isna().sum()
print("Missing values after cleaning:\n", null_counts)

# Report the dtype of every column
print("\nData types:\n", df.dtypes)

# Report how many fully duplicated rows are present
duplicate_count = df.duplicated().sum()
print("\nDuplicate rows:", duplicate_count)

# Sanity-check the value ranges of the numeric features
for label, column in (("\nAge range:", 'Age'), ("Fare range:", 'Fare')):
    print(label, df[column].min(), "-", df[column].max())

In [11]:
# Destination for the cleaned data inside Kaggle's writable working directory
output_path = '/kaggle/working/cleaned_titanic.csv'

# Renumber rows from zero, then write the frame without the index column
df_cleaned = df.reset_index(drop=True)
df_cleaned.to_csv(output_path, index=False)

print("✅ Cleaned Titanic dataset saved successfully.")

✅ Cleaned Titanic dataset saved successfully.
