In [1]:
# Part 1: Load a Dataset & Check Missing Values

# Task 1: Titanic Dataset
# - Load the dataset using Pandas.
# - Check for missing values in the 'Age' column.







# Part 2: Identify Duplicates & Inconsistencies

# Task 2: Duplicate Rows in Titanic Dataset
# - Identify any duplicate rows in the dataset.










# Part 3: Generate a Data Quality Report

# Task 3: Titanic Dataset Overview
# - Create a simple report of missing values, duplicates, and some basic statistics for the Titanic dataset.

import pandas as pd
import numpy as np

# Part 1: Load Dataset and Check Missing Values
# --------------------------------------------
# Task 1: fetch the Titanic CSV and measure how much of 'Age' is missing
print(
    "=" * 50,
    "PART 1: LOADING DATA & MISSING VALUES CHECK",
    "=" * 50,
    sep="\n",
)

# Read the CSV straight from its URL into a DataFrame (needs network access)
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic = pd.read_csv(url)

# Number of null 'Age' entries, and that number as a percentage of all rows
age_missing = titanic['Age'].isna().sum()
age_missing_pct = age_missing / titanic.shape[0] * 100

print(f"\nMissing values in 'Age' column: {age_missing} ({age_missing_pct:.1f}%)")

# Part 2: Identify Duplicates and Inconsistencies
# ----------------------------------------------
# Task 2: look for repeated rows
print("\n" + "=" * 50, "PART 2: DUPLICATE CHECK", "=" * 50, sep="\n")

# Rows that match an earlier row in every column
duplicates = titanic.duplicated().sum()
print(f"\nTotal duplicate rows: {duplicates}")

# Looser check: rows that match an earlier row on Name and Ticket only
partial_dupes = titanic.duplicated(['Name', 'Ticket']).sum()
print(f"Rows with same Name and Ticket: {partial_dupes}")

# Part 3: Generate Data Quality Report
# ----------------------------------
# Task 3: assemble and print an overall report
print("\n" + "=" * 50, "PART 3: DATA QUALITY REPORT", "=" * 50, sep="\n")

def generate_titanic_report(df):
    """Build a data quality summary for a Titanic-style DataFrame.

    The frame must provide 'Survived', 'Pclass' and 'Sex' columns; a
    missing one surfaces as a KeyError from pandas.

    Returns a dict with four entries, in this order:
      'overview'       -- dict with row count, column count and the number
                          of fully duplicated rows
      'missing_values' -- Series of per-column null counts, largest first
      'survival_stats' -- dict with the overall survival rate as a
                          one-decimal percentage string, plus the mean of
                          'Survived' grouped by 'Pclass' and by 'Sex'
      'numeric_stats'  -- the result of df.describe()
    """

    def mean_survival_by(column):
        # Mean of 'Survived' within each distinct value of `column`
        return df.groupby(column)['Survived'].mean()

    overview = {
        'Total Passengers': len(df),
        'Total Features': len(df.columns),
        'Duplicate Rows': df.duplicated().sum(),
    }

    null_counts = df.isna().sum()
    missing_values = null_counts.sort_values(ascending=False)

    overall_pct = df['Survived'].mean() * 100
    survival_stats = {
        'Overall Survival Rate': f"{overall_pct:.1f}%",
        'By Class': mean_survival_by('Pclass'),
        'By Sex': mean_survival_by('Sex'),
    }

    return {
        'overview': overview,
        'missing_values': missing_values,
        'survival_stats': survival_stats,
        'numeric_stats': df.describe(),
    }

# Build the report once, then print it one section at a time
report = generate_titanic_report(titanic)

# Overview: one "label: value" line per entry, labels padded to 20 columns
print("\nDATASET OVERVIEW:")
for label, value in report['overview'].items():
    print(f"{label:<20}: {value}")

# Missing values: show only the columns that have at least one null
missing_counts = report['missing_values']
print("\nMISSING VALUES:")
print(missing_counts[missing_counts > 0])

# Survival statistics: overall rate, then the per-class and per-sex means
survival = report['survival_stats']
print("\nSURVIVAL STATISTICS:")
print(f"Overall: {survival['Overall Survival Rate']}")
print("\nBy Passenger Class:", survival['By Class'], sep="\n")
print("\nBy Gender:", survival['By Sex'], sep="\n")

# Numeric summary: the describe() table stored in the report
print("\nNUMERIC FEATURES SUMMARY:", report['numeric_stats'], sep="\n")






PART 1: LOADING DATA & MISSING VALUES CHECK

Missing values in 'Age' column: 177 (19.9%)

PART 2: DUPLICATE CHECK

Total duplicate rows: 0
Rows with same Name and Ticket: 0

PART 3: DATA QUALITY REPORT

DATASET OVERVIEW:
Total Passengers    : 891
Total Features      : 12
Duplicate Rows      : 0

MISSING VALUES:
Cabin       687
Age         177
Embarked      2
dtype: int64

SURVIVAL STATISTICS:
Overall: 38.4%

By Passenger Class:
Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

By Gender:
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

NUMERIC FEATURES SUMMARY:
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.00