In [1]:
# Pandas Indexing Exercise Notebook
# This notebook contains comprehensive exercises for mastering Pandas indexing techniques

import pandas as pd
import seaborn as sns
import numpy as np

In [2]:
# Load the three seaborn sample datasets used by every exercise below
print("Loading datasets...")


def _describe_dataset(name, frame):
    """Print the shape, column list and first rows of `frame`, labelled with `name`."""
    title = name.capitalize()
    print(f"{title} dataset shape:", frame.shape)
    print(f"{title} dataset columns:", frame.columns.tolist())
    print(f"\nFirst few rows of {name}:")
    print(frame.head())


# Blank line followed by a 50-character rule, printed between dataset summaries
section_rule = "\n" + "=" * 50

# Dataset 1: Tips dataset
tips = sns.load_dataset('tips')
_describe_dataset('tips', tips)

# Dataset 2: Flights dataset
flights = sns.load_dataset('flights')
print(section_rule)
_describe_dataset('flights', flights)

# Dataset 3: Titanic dataset
titanic = sns.load_dataset('titanic')
print(section_rule)
_describe_dataset('titanic', titanic)

# Banner that introduces the exercise cells
banner_rule = "=" * 70
print("\n" + banner_rule)
print("INDEXING EXERCISES")
print(banner_rule)

Loading datasets...
Tips dataset shape: (244, 7)
Tips dataset columns: ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

First few rows of tips:
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4

Flights dataset shape: (144, 3)
Flights dataset columns: ['year', 'month', 'passengers']

First few rows of flights:
   year month  passengers
0  1949   Jan         112
1  1949   Feb         118
2  1949   Mar         132
3  1949   Apr         129
4  1949   May         121

Titanic dataset shape: (891, 15)
Titanic dataset columns: ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']

First few rows of titanic:
   survived

In [None]:
# =============================================================================
# EXERCISE 1: BASIC INTEGER INDEXING (.iloc)
# =============================================================================
# Position-based selection. Every placeholder below starts as None; replace it
# with an .iloc expression on the DataFrame named in the TODO.
print("\nEXERCISE 1: Basic Integer Indexing with .iloc")
print("-" * 50)

# TODO: Select the first 5 rows of the tips dataset
first_five_tips = None  # Replace with your code using .iloc

# TODO: Select the last 3 rows of the flights dataset
# (negative positions count from the end)
last_three_flights = None  # Replace with your code using .iloc

# TODO: Select the rows at positions 10-15 (inclusive) from the titanic dataset
# (.iloc slices exclude the stop position, so the result should have 6 rows)
middle_titanic = None  # Replace with your code using .iloc

# TODO: Select every 10th row from the tips dataset (positions 0, 10, 20, ...)
every_tenth_tips = None  # Replace with your code using .iloc

# TODO: Select the 2nd, 5th, and 8th rows from the flights dataset
# (positions are zero-based, so these are positions 1, 4 and 7)
specific_flights = None  # Replace with your code using .iloc


In [None]:
# =============================================================================
# EXERCISE 2: BASIC LABEL INDEXING (.loc)
# =============================================================================
# Label-based selection. Every placeholder below starts as None; replace it
# with a .loc expression on the indexed DataFrames built in the setup step.
print("\nEXERCISE 2: Basic Label Indexing with .loc")
print("-" * 50)

# First, let's set up some DataFrames with meaningful indices
# tips_indexed: row labels are the (repeating) day names
tips_indexed = tips.set_index('day')
# flights_pivot: one row per year, one column per month label ('Jan', 'Feb', ...).
# observed=False is spelled out because 'month' is a categorical column in the
# seaborn flights data: recent pandas versions emit a FutureWarning when the
# argument is omitted, and the default is scheduled to change. Passing it
# explicitly keeps today's result (all month categories as columns).
flights_pivot = flights.pivot_table(
    values='passengers',
    index='year',
    columns='month',
    aggfunc='sum',
    observed=False,
)

# TODO: Select all rows where day is 'Sat' from tips_indexed
saturday_tips = None  # Replace with your code using .loc

# TODO: Select data for years 1950-1955 from flights_pivot
# (label slices with .loc include both endpoints)
early_years_flights = None  # Replace with your code using .loc

# TODO: Select multiple specific indices from tips_indexed (e.g., 'Sat' and 'Sun')
weekend_tips = None  # Replace with your code using .loc

In [None]:
# =============================================================================
# EXERCISE 3: COLUMN SELECTION
# =============================================================================
# Column-oriented selection by name, by label slice, by position and by
# name pattern. Every placeholder below starts as None.
print("\nEXERCISE 3: Column Selection")
print("-" * 50)

# TODO: Select only the 'total_bill' and 'tip' columns from tips
# (pass a list of column names to get a DataFrame back)
bill_and_tip = None  # Replace with your code

# TODO: Select columns from 'survived' to 'sex' from titanic (inclusive)
# (label slices with .loc include both endpoints)
titanic_subset = None  # Replace with your code using .loc

# TODO: Select every other column from the flights dataset
# (keep all rows; step through the column positions)
alternate_columns = None  # Replace with your code using .iloc

# TODO: Select columns that contain 'bill' in their name from tips
bill_columns = None  # Replace with your code (hint: use filter or list comprehension)


In [None]:
# =============================================================================
# EXERCISE 4: BOOLEAN INDEXING
# =============================================================================
# Row filtering with boolean masks. Combine conditions with & (and) / | (or)
# and wrap each individual condition in parentheses.
print("\nEXERCISE 4: Boolean Indexing")
print("-" * 50)

# TODO: Select all rows where total_bill > 30 from tips
expensive_meals = None  # Replace with your code

# TODO: Select all rows where sex is 'female' and survived is 1 from titanic
# (titanic stores sex in lowercase - 'male' / 'female' - unlike tips)
female_survivors = None  # Replace with your code

# TODO: Select flights data where passengers > 400
busy_flights = None  # Replace with your code

# TODO: Select tips where (day is 'Sat' or 'Sun') AND tip > 5
weekend_big_tips = None  # Replace with your code

# TODO: Select titanic passengers who are either in first class OR children (age < 16)
# (first class is pclass == 1)
first_class_or_children = None  # Replace with your code

In [None]:
# =============================================================================
# EXERCISE 5: MULTI-LEVEL INDEXING
# =============================================================================
# Selection on a two-level MultiIndex. Labels are case-sensitive and must match
# the data exactly.
print("\nEXERCISE 5: Multi-level Indexing")
print("-" * 50)

# Create a multi-index DataFrame
# tips_multi is keyed by (day, time); titanic_multi is keyed by (class, sex)
tips_multi = tips.set_index(['day', 'time'])
titanic_multi = titanic.set_index(['class', 'sex'])

print("Tips multi-index structure:")
print(tips_multi.index.names)
print("\nFirst few rows of tips_multi:")
print(tips_multi.head())

# TODO: Select all 'Dinner' time entries for 'Sat' from tips_multi
# (a tuple addresses one label per index level)
sat_dinner = None  # Replace with your code using .loc

# TODO: Select all 'First' class passengers from titanic_multi
# (the class labels are capitalised: 'First', 'Second', 'Third')
first_class = None  # Replace with your code using .loc

# TODO: Select 'male' passengers from 'Third' class in titanic_multi
third_class_males = None  # Replace with your code using .loc

# TODO: Select multiple combinations: ('Sat', 'Dinner') and ('Thur', 'Lunch') from tips_multi
# (pass a list of tuples; every requested combination must exist in the index -
# tips has no Sunday lunch rows, so ('Sun', 'Lunch') cannot be selected)
specific_combinations = None  # Replace with your code using .loc

In [None]:
# =============================================================================
# EXERCISE 6: ADVANCED INDEXING COMBINATIONS
# =============================================================================
# Row and column selection in a single step. Relies on flights_pivot from the
# Exercise 2 cell, so run that cell first.
print("\nEXERCISE 6: Advanced Indexing Combinations")
print("-" * 50)

# TODO: From tips, select rows where total_bill > 20 and only 'total_bill', 'tip', 'day' columns
filtered_tips = None  # Replace with your code combining boolean indexing and column selection

# TODO: From titanic, select the first 100 rows and only numeric columns
titanic_numeric_subset = None  # Replace with your code (.iloc + select_dtypes or specific columns)

# TODO: From flights_pivot, select years 1955-1960 and months 'Jun' through 'Sep' (summer months)
# (the pivot's columns are month-name labels, not the numbers 6-9)
summer_flights = None  # Replace with your code using .loc with both row and column selection

# TODO: Create a new DataFrame with tips where size >= 4, then select every other row
# The filter step is already done; only the positional selection is left to write.
large_parties = tips[tips['size'] >= 4]
large_parties_alternate = None  # Replace with your code using .iloc

In [None]:
# =============================================================================
# EXERCISE 7: QUERY METHOD
# =============================================================================
# String-based filtering with DataFrame.query(). Column names are written bare
# inside the query string; Python variables are referenced with an @ prefix.
print("\nEXERCISE 7: Using the .query() method")
print("-" * 50)

# TODO: Use .query() to select tips where total_bill > 25 and day == 'Sat'
query_tips = None  # Replace with your code using .query()

# TODO: Use .query() to select titanic passengers where age > 30 and fare < 50
query_titanic = None  # Replace with your code using .query()

# TODO: Use .query() with variables - the threshold min_passengers = 300 is
# defined below; query flights where passengers > min_passengers
min_passengers = 300
query_flights = None  # Replace with your code using .query() with @min_passengers

In [None]:
# =============================================================================
# EXERCISE 8: SETTING VALUES WITH INDEXING
# =============================================================================
# Assignment through .loc[row_selector, column]. The row selectors are named
# placeholders so the cell can be run before the exercise is solved: while a
# mask is still None the assignment is skipped. Passing None straight into
# .loc would not filter anything - pandas treats it as a (missing) row label
# and the assignment would add a junk row instead of updating existing ones.
print("\nEXERCISE 8: Setting Values with Indexing")
print("-" * 50)

# Create copies to avoid modifying original data
tips_copy = tips.copy()
titanic_copy = titanic.copy()

# TODO: Set all tip values to 0 where total_bill < 10 in tips_copy
low_bill_mask = None  # Replace None with your boolean condition on tips_copy
if low_bill_mask is not None:
    tips_copy.loc[low_bill_mask, 'tip'] = 0

# TODO: Create a new column 'tip_percentage' in tips_copy and calculate it
# (tip / total_bill * 100)
tips_copy.loc[:, 'tip_percentage'] = None  # Replace None with calculation

# TODO: Set age to the median age for all passengers where age is missing in titanic_copy
# The median is taken before filling, so it is based on the known ages only.
median_age = titanic_copy['age'].median()
missing_age_mask = None  # Replace None with your boolean condition on titanic_copy
if missing_age_mask is not None:
    titanic_copy.loc[missing_age_mask, 'age'] = median_age

In [None]:
# =============================================================================
# EXERCISE 9: CONDITIONAL SELECTION WITH .where()
# =============================================================================
# .where(cond, other) keeps the values where cond is True and substitutes
# `other` (NaN when omitted) everywhere else; the result keeps the original shape.
print("\nEXERCISE 9: Conditional Selection with .where()")
print("-" * 50)

# TODO: Use .where() to show only tip values greater than 3, others should be NaN
tips_where = None  # Replace with your code using tips['tip'].where()

# TODO: Use .where() with an alternative value: show fare values, but replace values > 100 with 100
# (the condition describes the values to KEEP, i.e. fares that are not above 100)
titanic_fare_capped = None  # Replace with your code using .where() with other parameter

# =============================================================================
# EXERCISE 10: SAMPLING AND RANDOM SELECTION
# =============================================================================
# Random row selection with .sample(). Every placeholder below starts as None.
print("\nEXERCISE 10: Sampling and Random Selection")
print("-" * 50)

# Set seed for reproducibility
# This seeds NumPy's global generator, which .sample() falls back to when no
# random_state is given. Results then depend on how many random draws happened
# since the seed was set; passing random_state=42 to each .sample() call is the
# order-independent alternative.
np.random.seed(42)

# TODO: Sample 10 random rows from tips
random_tips = None  # Replace with your code using .sample()

# TODO: Sample 5% of rows from titanic
sample_titanic = None  # Replace with your code using .sample() with frac parameter

# TODO: Sample 3 rows from each day in tips (grouped sampling)
grouped_sample = None  # Replace with your code using .groupby() and .sample()


In [None]:
# =============================================================================
# VALIDATION SECTION (Don't modify - this will check your answers)
# =============================================================================
# Section banner: a blank line, then the title framed by two 70-character rules
validation_rule = "=" * 70
print(f"\n{validation_rule}")
print("VALIDATION SECTION")
print(validation_rule)

def validate_exercise(exercise_name, result, expected_condition, description):
    """Check one exercise answer and print a one-line verdict.

    Args:
        exercise_name: Label shown at the start of the line, e.g. "Exercise 1.1".
        result: The learner's answer; None means "not attempted yet".
        expected_condition: Callable applied to `result`; a truthy return
            value means the answer is accepted.
        description: Short text describing what the exercise asked for.

    Prints "CORRECT" when the answer is accepted, "NEEDS WORK" when `result`
    is None or the condition is falsy, and "ERROR: <message>" when evaluating
    the condition raises. Nothing is returned.
    """
    label = f"{exercise_name}: {description}"
    try:
        # bool() stays inside the try so that an ambiguous truth value
        # (e.g. a Series returned by the condition) is reported as an error.
        accepted = bool(result is not None and expected_condition(result))
    except Exception as e:
        print(f"❌ {label} - ERROR: {str(e)}")
        return

    if accepted:
        print(f"✅ {label} - CORRECT")
        return
    print(f"❌ {label} - NEEDS WORK")

# Add validation calls here. The sample calls below are disabled by being wrapped
# in a triple-quoted string (they are not comments): delete the two lines holding
# the triple quotes after completing the exercises to run them.
# Note: the Exercise 4.1 check also passes for an empty result, because all() of
# an empty sequence is True; the Exercise 4.2 check guards against that with len(x) > 0.
"""
validate_exercise("Exercise 1.1", first_five_tips, lambda x: len(x) == 5, "First 5 rows")
validate_exercise("Exercise 1.2", last_three_flights, lambda x: len(x) == 3, "Last 3 rows")
validate_exercise("Exercise 4.1", expensive_meals, lambda x: all(x['total_bill'] > 30), "Total bill > 30")
validate_exercise("Exercise 4.2", female_survivors, lambda x: len(x) > 0 and all((x['sex'] == 'female') & (x['survived'] == 1)), "Female survivors")
"""

# Closing banner followed by a quick-reference summary of the techniques practised above
print("\n" + "="*70)
print("EXERCISE COMPLETION TIPS:")
print("="*70)
print("""
1. .iloc is for integer-position based indexing (0, 1, 2, ...)
2. .loc is for label-based indexing (using actual index values/column names)
3. Boolean indexing uses conditions like df[df['column'] > value]
4. .query() provides a string-based way to filter data
5. Multi-level indexing uses tuples for selection: .loc[('level1', 'level2')]
6. Combine row and column selection with .loc[rows, columns]
7. Use .where() for conditional replacement with NaN or other values
8. .sample() is great for getting random subsets of your data

Remember to run each section and check your results!
Good luck with your indexing practice!
""")