# Table of Contents:
# 1. Introduction to Pandas
# 2. Creating Series and DataFrames
# 3. Loading and Exploring Data
# 4. Data Selection and Indexing
# 5. Data Cleaning and Handling Missing Values
# 6. Data Filtering and Querying
# 7. Data Aggregation and Grouping
# 8. Data Merging and Joining
# 9. Pivot Tables and Reshaping
# 10. Advanced Operations

In [2]:
import pandas as pd
import numpy as np
print("Pandas version:", pd.__version__)

Pandas version: 2.3.2


# ===========================================================
# 1. INTRODUCTION TO PANDAS
# ===========================================================


Pandas is a powerful data manipulation and analysis library for Python.
It provides two main data structures:
- Series: 1-dimensional labeled array
- DataFrame: 2-dimensional labeled data structure (like a spreadsheet)

Key Features:
- Easy handling of missing data
- Data alignment and merging
- Flexible reshaping and pivoting
- Powerful grouping functionality
- Time series functionality


# =================================================================
# 2. CREATING SERIES AND DATAFRAMES
# =================================================================

In [3]:
# Section banner: rule, title, rule.
print("=" * 60, "2. CREATING SERIES AND DATAFRAMES", "=" * 60, sep="\n")

# --- 2.1 Series -------------------------------------------------------------
print("\n2.1 Creating Series:", "-" * 30, sep="\n")

# A list yields a Series with the default 0..n-1 integer index.
ages = pd.Series(data=[25, 30, 35, 28, 45], name='Age')
print("Series from list:", ages, sep="\n")

# A dictionary yields a Series whose index labels are the dictionary keys.
person_ages = pd.Series(dict(Alice=25, Bob=30, Charlie=35, Diana=28))
print("\nSeries from dictionary:", person_ages, sep="\n")

# Basic attributes of the list-backed Series.
print(
    f"\nSeries shape: {ages.shape}",
    f"Series dtype: {ages.dtype}",
    f"Series index: {ages.index.tolist()}",
    f"Series values: {ages.values}",
    sep="\n",
)

2. CREATING SERIES AND DATAFRAMES

2.1 Creating Series:
------------------------------
Series from list:
0    25
1    30
2    35
3    28
4    45
Name: Age, dtype: int64

Series from dictionary:
Alice      25
Bob        30
Charlie    35
Diana      28
dtype: int64

Series shape: (5,)
Series dtype: int64
Series index: [0, 1, 2, 3, 4]
Series values: [25 30 35 28 45]


In [4]:
# --- 2.2 DataFrames ---------------------------------------------------------
print("\n2.2 Creating DataFrames:", "-" * 30, sep="\n")

# Column-oriented input: each key becomes a column, each list its values.
data_dict = dict(
    Name=['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    Age=[25, 30, 35, 28, 45],
    City=['New York', 'London', 'Paris', 'Tokyo', 'Sydney'],
    Salary=[50000, 60000, 75000, 55000, 80000],
)

df_sample = pd.DataFrame(data_dict)
print("DataFrame from dictionary:", df_sample, sep="\n")

# Parallel lists, paired up as column name -> values.
names = ['Alice', 'Bob', 'Charlie']
ages = [25, 30, 35]
df_from_lists = pd.DataFrame(dict(Name=names, Age=ages))
print("\nDataFrame from lists:", df_from_lists, sep="\n")

# Basic attributes of the five-row sample frame.
print(
    f"\nDataFrame shape: {df_sample.shape}",
    f"DataFrame columns: {df_sample.columns.tolist()}",
    f"DataFrame index: {df_sample.index.tolist()}",
    f"DataFrame dtypes:\n{df_sample.dtypes}",
    sep="\n",
)


2.2 Creating DataFrames:
------------------------------
DataFrame from dictionary:
      Name  Age      City  Salary
0    Alice   25  New York   50000
1      Bob   30    London   60000
2  Charlie   35     Paris   75000
3    Diana   28     Tokyo   55000
4      Eve   45    Sydney   80000

DataFrame from lists:
      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35

DataFrame shape: (5, 4)
DataFrame columns: ['Name', 'Age', 'City', 'Salary']
DataFrame index: [0, 1, 2, 3, 4]
DataFrame dtypes:
Name      object
Age        int64
City      object
Salary     int64
dtype: object


# =======================================
# 3. LOADING AND EXPLORING DATA
# =======================================



In [5]:
# Load the Titanic dataset from CSV into the `titanic` DataFrame, which the
# later cells read from.
# NOTE(review): the '/content/...' location looks like a Google Colab upload
# path — the recorded run of this cell raised FileNotFoundError, so point the
# path at wherever Titanic-Dataset.csv lives in your environment before running.
titanic=pd.read_csv('/content/Titanic-Dataset.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/content/Titanic-Dataset.csv'

In [None]:
titanic

In [None]:
# Basic information about the dataset
print(f"Dataset shape: {titanic.shape}")
print(f"Number of rows: {len(titanic)}")
print(f"Number of columns: {len(titanic.columns)}")

In [None]:
# Display first few rows
print("\nFirst 5 rows:")
titanic.head()

In [None]:
# Display last few rows
print("\nLast 5 rows:")
titanic.tail()


In [None]:
# Display random sample
print("\nRandom sample of 3 rows:")
titanic.sample(3)


In [None]:
# Dataset info: column names, non-null counts, dtypes and memory usage.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would append a stray "None" line.
titanic.info()

In [None]:
# Describe numerical columns
titanic.describe()

In [None]:
# Check for missing values
print(titanic.isnull().sum())

In [None]:
# Check data types
print(titanic.dtypes)

# =====================================================
# 4. DATA SELECTION AND INDEXING
# =====================================================


In [None]:
print("4. DATA SELECTION AND INDEXING")
print("="*60)

# Column selection
print("\n4.1 Column Selection:")
print("-" * 30)

# Single column (returns Series)
ages_series = titanic['Age']
print(f"Type of single column selection: {type(ages_series)}")
print(f"First 5 ages: {ages_series.head().tolist()}")

In [None]:
# Multiple columns (returns DataFrame)
passenger_info = titanic[['Age', 'Sex', 'Fare']]
print(f"\nType of multiple column selection: {type(passenger_info)}")
print("First 3 rows of passenger info:")
print(passenger_info.head(3))


In [None]:
# Row selection using iloc (integer-location based)
print("\n4.2 Row Selection with iloc:")
print("-" * 30)

# Single row
first_passenger = titanic.iloc[0]
print(f"First passenger:\n{first_passenger}")
print(f"\nType of row selection: {type(first_passenger)}")


In [None]:
# Multiple rows
first_five = titanic.iloc[0:5]
print(f"\nFirst 5 passengers (shape: {first_five.shape}):")
print(first_five)
print(first_five[['Age', 'Sex', 'Fare']])

In [None]:
# Specific rows
specific_rows = titanic.iloc[[0, 5, 10]]
print(f"\nSpecific rows (0, 5, 10):")
print(specific_rows[['Age', 'Sex', 'Fare']])

# Row and column selection
print("\n4.3 Row and Column Selection:")
print("-" * 30)

# Select specific rows and columns
subset = titanic.iloc[0:5, 1:4]  # First 5 rows, columns 1-3
print("Subset (first 5 rows, columns 1-3):")
print(subset)

In [None]:
# Using column names with loc
age_sex_subset = titanic.loc[0:4, ['Age', 'Sex']]
print(f"\nUsing loc with column names:")
print(age_sex_subset)

# =====================================================
# 5. DATA CLEANING AND HANDLING MISSING VALUES
# =====================================================

In [None]:
print("\n" + "="*60)
print("5. DATA CLEANING AND HANDLING MISSING VALUES")
print("="*60)

print("\n5.1 Identifying Missing Values:")
print("-" * 30)

# Check for missing values
missing_counts = titanic.isnull().sum()
print("Missing values per column:")
print(missing_counts[missing_counts > 0])

In [None]:
# Percentage of missing values
missing_percentage = (titanic.isnull().sum() / len(titanic)) * 100
print("\nPercentage of missing values:")
print(missing_percentage[missing_percentage > 0])

In [None]:
print("\n5.2 Handling Missing Values:")
print("-" * 30)

# Create a copy for cleaning
titanic_clean = titanic.copy()

# Method 1: Drop rows with missing values
print(f"Original shape: {titanic_clean.shape}")
titanic_no_na = titanic_clean.dropna()
print(f"After dropping all NAs: {titanic_no_na.shape}")

In [None]:
# Method 2: Drop every column that contains at least one missing value
# (axis=1 makes dropna work on columns; no particular columns are named here,
# and titanic_clean itself is left unchanged because the result is a new frame)
titanic_drop_cols = titanic_clean.dropna(axis=1)
print(f"After dropping columns with NAs: {titanic_drop_cols.shape}")

In [None]:
# Method 3: Fill missing values
# Fill numerical missing values with median
if 'Age' in titanic_clean.columns:
    median_age = titanic_clean['Age'].median()
    # Assign the filled column back instead of calling fillna(..., inplace=True)
    # on the column selection: that chained in-place form operates on an
    # intermediate object, triggers a FutureWarning on pandas 2.x, and leaves
    # titanic_clean untouched once Copy-on-Write is enabled.
    titanic_clean['Age'] = titanic_clean['Age'].fillna(median_age)
    print(f"Filled missing ages with median: {median_age:.1f}")

In [None]:
# Fill categorical missing values with mode
if 'Embarked' in titanic_clean.columns:
    # mode() returns a Series (there can be ties); take its first entry.
    mode_embarked = titanic_clean['Embarked'].mode()[0]
    # Assign the filled column back instead of calling fillna(..., inplace=True)
    # on the column selection: that chained in-place form operates on an
    # intermediate object, triggers a FutureWarning on pandas 2.x, and leaves
    # titanic_clean untouched once Copy-on-Write is enabled.
    titanic_clean['Embarked'] = titanic_clean['Embarked'].fillna(mode_embarked)
    print(f"Filled missing embarked with mode: {mode_embarked}")


In [None]:
# Check missing values after cleaning
print(f"\nMissing values after cleaning: {titanic_clean.isnull().sum()}")

# =====================================================
# 6. DATA FILTERING AND QUERYING
# =====================================================


In [None]:
print("6. DATA FILTERING AND QUERYING")

print("\n6.1 Basic Filtering:")

# Single condition
adults = titanic_clean[titanic_clean['Age'] >= 18]
print(f"Number of adults (Age >= 18): {len(adults)}")

In [None]:
# Display first few adult passengers
print("First 3 adult passengers:")
print(adults[['Age', 'Sex', 'Survived']].head(3))


In [None]:
# Multiple conditions using & (and) and | (or)

young_females = titanic_clean[(titanic_clean['Age'] < 30) & (titanic_clean['Sex'] == 'female')]
print(f"\nYoung females (Age < 30): {len(young_females)}")


In [None]:
# High fare or first class
luxury_passengers = titanic_clean[(titanic_clean['Fare'] > 50) | (titanic_clean['Pclass'] == 1)]
print(f"Luxury passengers (fare > 50 OR first class): {len(luxury_passengers)}")


In [None]:
print("\n6.2 Using isin() Method:")
print("-" * 30)

# Filter using isin(): keep the rows whose 'Embarked' code is in the given list.
# In the Kaggle Titanic data dictionary C = Cherbourg, Q = Queenstown and
# S = Southampton. All three are European ports, so the printed label names the
# two ports actually selected rather than calling them "European".
european_ports = titanic_clean[titanic_clean['Embarked'].isin(['C', 'S'])]
print(f"Passengers who embarked at Cherbourg or Southampton (C, S): {len(european_ports)}")

In [None]:
print("\n6.3 Query Method:")
print("-" * 30)

# Using query method for complex conditions
query_result = titanic_clean.query('Age > 30 and Fare < 20')
print(f"Passengers with age > 30 and fare < 20: {len(query_result)}")

# Query with string conditions
if 'Sex' in titanic_clean.columns:
    female_survivors = titanic_clean.query('Sex == "female" and Survived == 1')
    print(f"Female survivors: {len(female_survivors)}")

In [None]:
# =============================================================================
# 7. DATA AGGREGATION AND GROUPING
# =============================================================================

print("\n" + "="*60)
print("7. DATA AGGREGATION AND GROUPING")
print("="*60)

print("\n7.1 Basic Aggregations:")
print("-" * 30)

# Basic statistics
if 'Age' in titanic_clean.columns:
    print(f"Mean age: {titanic_clean['Age'].mean():.1f}")
    print(f"Median age: {titanic_clean['Age'].median():.1f}")
    print(f"Standard deviation of age: {titanic_clean['Age'].std():.1f}")

if 'Fare' in titanic_clean.columns:
    print(f"Average fare: ${titanic_clean['Fare'].mean():.2f}")
    print(f"Maximum fare: ${titanic_clean['Fare'].max():.2f}")

In [None]:
print("\n7.2 Value Counts:")
print("-" * 30)

# Count unique values
if 'Sex' in titanic_clean.columns:
    print("Sex distribution:")
    print(titanic_clean['Sex'].value_counts())

if 'Pclass' in titanic_clean.columns:
    print("\nClass distribution:")
    print(titanic_clean['Pclass'].value_counts().sort_index())

# Proportions
if 'Survived' in titanic_clean.columns:
    print("\nSurvival rate:")
    survival_rate = titanic_clean['Survived'].value_counts(normalize=True)
    print(survival_rate)

In [None]:
print("\n7.3 GroupBy Operations:")
print("-" * 30)

# Group by single column: mean age within each sex.
if {'Sex', 'Age'}.issubset(titanic_clean.columns):
    age_by_sex = titanic_clean.groupby('Sex')['Age'].mean()
    print("Average age by sex:")
    print(age_by_sex)

# Group by multiple columns: mean of 'Survived' (i.e. the survival rate) for
# each (sex, class) pair. The guard lists every column the expression reads,
# including 'Survived', so a frame without it is skipped instead of raising
# KeyError.
if {'Sex', 'Pclass', 'Survived'}.issubset(titanic_clean.columns):
    survival_by_sex_class = titanic_clean.groupby(['Sex', 'Pclass'])['Survived'].mean()
    print("\nSurvival rate by sex and class:")
    print(survival_by_sex_class)

In [None]:
# Multiple aggregations
print("\n7.4 Multiple Aggregation Functions:")
print("-" * 30)

if 'Fare' in titanic_clean.columns and 'Pclass' in titanic_clean.columns:
    fare_stats = titanic_clean.groupby('Pclass')['Fare'].agg(['mean', 'median', 'std', 'count'])
    print("Fare statistics by class:")
    print(fare_stats)

In [None]:
# =============================================================================
# 8. DATA MERGING AND JOINING
# =============================================================================

print("\n" + "=" * 60, "8. DATA MERGING AND JOINING", "=" * 60, sep="\n")

# Two small frames that share a 'passenger_id' key but only partly overlap
# (ids 1-3 appear in both), so each join type gives a different result.
print("\n8.1 Creating Sample Data for Merging:", "-" * 30, sep="\n")

# Passenger details: one row per passenger, ids 1-5.
passenger_details = pd.DataFrame(
    [
        (1, 'Alice Johnson', 1),
        (2, 'Bob Smith', 2),
        (3, 'Charlie Brown', 3),
        (4, 'Diana Prince', 1),
        (5, 'Eve Wilson', 2),
    ],
    columns=['passenger_id', 'name', 'ticket_class'],
)

# Ticket information: ids 1-3 match the passengers above, 6 and 7 do not.
ticket_info = pd.DataFrame(
    [
        (1, 'A123', 100),
        (2, 'B456', 75),
        (3, 'C789', 50),
        (6, 'D012', 120),
        (7, 'E345', 80),
    ],
    columns=['passenger_id', 'ticket_number', 'fare_paid'],
)

print("Passenger Details:", passenger_details, sep="\n")
print("\nTicket Information:", ticket_info, sep="\n")

In [None]:
print("\n8.2 Different Types of Merges:", "-" * 30, sep="\n")

# Inner join — what merge() does when `how` is not given: matching keys only.
inner_merge = pd.merge(left=passenger_details, right=ticket_info, on='passenger_id')
print("Inner Join (only matching records):", inner_merge, sep="\n")

# Left join: every passenger row is kept; ticket columns are NaN where no match.
left_merge = pd.merge(left=passenger_details, right=ticket_info, how='left', on='passenger_id')
print("\nLeft Join (all records from left table):", left_merge, sep="\n")

# Right join: every ticket row is kept; passenger columns are NaN where no match.
right_merge = pd.merge(left=passenger_details, right=ticket_info, how='right', on='passenger_id')
print("\nRight Join (all records from right table):", right_merge, sep="\n")

# Outer join: the union of keys from both tables.
outer_merge = pd.merge(left=passenger_details, right=ticket_info, how='outer', on='passenger_id')
print("\nOuter Join (all records from both tables):", outer_merge, sep="\n")

In [None]:
print("\n8.3 Merge with Different Column Names:", "-" * 30, sep="\n")

# The key is called 'id' on the left frame and 'passenger_id' on the right one.
passenger_info = pd.DataFrame(
    [(1, 'Alice'), (2, 'Bob'), (3, 'Charlie')],
    columns=['id', 'passenger_name'],
)

booking_info = pd.DataFrame(
    [(1, '2024-01-01'), (2, '2024-01-02'), (4, '2024-01-03')],
    columns=['passenger_id', 'booking_date'],
)

# left_on / right_on name the key column on each side; both key columns are
# kept in the result.
merge_diff_names = pd.merge(
    left=passenger_info,
    right=booking_info,
    how='inner',
    left_on='id',
    right_on='passenger_id',
)
print("Merge with different column names:", merge_diff_names, sep="\n")

In [None]:
print("\n8.4 Concatenation:", "-" * 30, sep="\n")

# Two frames with identical columns A and B, built row by row.
df1 = pd.DataFrame([[1, 3], [2, 4]], columns=['A', 'B'])
df2 = pd.DataFrame([[5, 7], [6, 8]], columns=['A', 'B'])

# Stack rows; ignore_index renumbers the result 0..n-1 instead of repeating 0, 1.
concat_vertical = pd.concat([df1, df2], axis=0, ignore_index=True)
print("Vertical concatenation:", concat_vertical, sep="\n")

# Place side by side, aligned on the row index; column labels A and B repeat.
concat_horizontal = pd.concat([df1, df2], axis=1)
print("\nHorizontal concatenation:", concat_horizontal, sep="\n")

In [None]:
# =============================================================================
# 9. PIVOT TABLES AND RESHAPING
# =============================================================================

print("\n" + "="*60)
print("9. PIVOT TABLES AND RESHAPING")
print("="*60)

print("\n9.1 Pivot Tables:")
print("-" * 30)

# Create a pivot table
if 'Sex' in titanic_clean.columns and 'Pclass' in titanic_clean.columns and 'Survived' in titanic_clean.columns:
    pivot_survival = pd.pivot_table(
        titanic_clean,
        values='Survived',
        index='Sex',
        columns='Pclass',
        aggfunc='mean',
        fill_value=0
    )
    print("Survival rate by sex and class (Pivot Table):")
    print(pivot_survival)


In [None]:

# Pivot table with multiple values: mean Age and mean Fare for each sex (rows)
# and passenger class (columns); empty cells are shown as 0.
# The guard covers all four columns the pivot reads — the aggregated values
# ('Age', 'Fare') as well as the row and column keys ('Sex', 'Pclass') — so a
# frame missing any of them is skipped instead of raising KeyError.
if {'Age', 'Fare', 'Sex', 'Pclass'}.issubset(titanic_clean.columns):
    pivot_multiple = pd.pivot_table(
        titanic_clean,
        values=['Age', 'Fare'],
        index='Sex',
        columns='Pclass',
        aggfunc='mean',
        fill_value=0,
    )
    print("\nAge and Fare by sex and class:")
    print(pivot_multiple)

In [None]:
# =============================================================================
# 10. ADVANCED OPERATIONS
# =============================================================================

print("\n" + "="*60)
print("10. ADVANCED OPERATIONS")
print("="*60)

print("\n10.1 Index Operations:")
print("-" * 30)

# Set index
if 'Age' in titanic_clean.columns:
    df_with_index = titanic_clean.copy()
    df_with_index = df_with_index.set_index('Age')
    print(f"Shape after setting age as index: {df_with_index.shape}")
    print("First few rows with age as index:")
    print(df_with_index.head(3))

    # Reset index
    df_reset = df_with_index.reset_index()
    print(f"\nShape after resetting index: {df_reset.shape}")

In [None]:
print("\n10.2 Apply Lambda Functions:")
print("-" * 30)

# Apply function to create new columns
if 'Age' in titanic_clean.columns:
    titanic_clean['Age_group'] = titanic_clean['Age'].apply(
        lambda x: 'Child' if x < 18 else ('Adult' if x < 60 else 'Senior')
    )

    print("Age group distribution:")
    print(titanic_clean['Age_group'].value_counts())

# Apply function to multiple columns
def categorize_passenger(row):
    """Return the display name of a passenger's ticket class.

    Args:
        row: Any mapping-like object supporting ``row['Pclass']`` (for
            example a DataFrame row passed by ``apply(..., axis=1)``).

    Returns:
        'First Class' when Pclass equals 1, 'Second Class' when it equals 2,
        and 'Third Class' for every other value.
    """
    pclass = row['Pclass']
    if pclass == 1:
        return 'First Class'
    if pclass == 2:
        return 'Second Class'
    # Anything that is neither 1 nor 2 falls through to the third class.
    return 'Third Class'

titanic_clean['class_name'] = titanic_clean.apply(categorize_passenger, axis=1)
print("\nClass name distribution:")
print(titanic_clean['class_name'].value_counts())

In [None]:
print("\n10.3 Working with Dates:", "-" * 30, sep="\n")

# Ten consecutive calendar days, each paired with a random draw.
date_range = pd.date_range('2024-01-01', '2024-01-10', freq='D')
random_values = np.random.randn(len(date_range))
date_df = pd.DataFrame({'date': date_range, 'value': random_values})

print("Sample date data:", date_df.head(), sep="\n")

# The .dt accessor exposes datetime components of the 'date' column.
date_parts = date_df['date'].dt
date_df['year'] = date_parts.year
date_df['month'] = date_parts.month
date_df['dayofweek'] = date_parts.dayofweek  # Monday is 0
date_df['weekday_name'] = date_parts.day_name()

component_columns = ['date', 'year', 'month', 'dayofweek', 'weekday_name']
print("\nDate with extracted components:", date_df[component_columns].head(), sep="\n")

In [None]:
print("\n10.4 Data Validation and Quality Checks:")
print("-" * 30)

# Check for duplicates
duplicates = titanic_clean.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Check data ranges
if 'Age' in titanic_clean.columns:
    invalid_ages = titanic_clean[(titanic_clean['Age'] < 0) | (titanic_clean['Age'] > 150)]
    print(f"Invalid ages (< 0 or > 150): {len(invalid_ages)}")

# Memory usage
print(f"\nDataset memory usage: {titanic_clean.memory_usage(deep=True).sum() / 1024:.2f} KB")

In [None]:
print("\n" + "="*60)
print("SUMMARY AND BEST PRACTICES")
print("="*60)

print("""
Key Pandas Concepts Covered:
1. Series and DataFrame creation and manipulation
2. Data loading and exploration
3. Data selection and indexing (iloc, loc)
4. Missing value handling (dropna, fillna)
5. Data filtering and querying (isin, query)
6. Aggregation and grouping operations
7. Data merging and joining
8. Pivot tables and data reshaping
9. Advanced operations (apply, dates)

Next Steps:
- Practice with different datasets
""")

# Final dataset summary
print(f"\nFinal cleaned dataset shape: {titanic_clean.shape}")
print(f"Columns: {titanic_clean.columns.tolist()}")
print("\nData cleaning complete! Dataset ready for analysis.")