# Introduction to Pandas

## Learning Objectives
By the end of this notebook, you will be able to:
- Understand pandas DataFrames and Series
- Create DataFrames from various data sources
- Perform basic data exploration and inspection
- Select, filter, and manipulate data using pandas
- Apply fundamental data transformation operations

## 1. Introduction to Pandas Data Structures

In [None]:
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Optional
from datetime import datetime, date

# Lift the column-count and width limits so wide frames print in full
# (only these two options are changed; row truncation keeps its default)
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)

# Report the library versions this notebook is running against
print(
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    sep="\n",
)

# What is pandas? Headline first, then one line per capability
print("\nPandas is a powerful data manipulation library that provides:")
print(
    "✓ DataFrame and Series data structures",
    "✓ Data cleaning and transformation tools",
    "✓ File I/O operations (CSV, JSON, Excel, etc.)",
    "✓ Data aggregation and grouping capabilities",
    "✓ Time series analysis tools",
    "✓ Integration with other data science libraries",
    sep="\n",
)

In [None]:
# Creating pandas Series (1-dimensional labeled array)
print("=== Pandas Series ===")

# All three example Series are built first; the reports below print them
# in the same order as they are introduced.

# Series from list (no index given, so pandas assigns the default one)
temperatures: pd.Series = pd.Series(data=[22.5, 24.1, 23.8, 25.2, 24.7])

# Series with custom index: one temperature reading per city label
cities: List[str] = ['New York', 'London', 'Tokyo', 'Sydney', 'Paris']
temperatures_indexed: pd.Series = pd.Series(
    data=[22.5, 15.3, 28.1, 18.9, 20.4],
    index=cities,
)

# Series from dictionary: keys become the index, values become the data
population_data: Dict[str, int] = {
    'New York': 8_400_000,
    'London': 9_000_000,
    'Tokyo': 14_000_000,
    'Sydney': 5_300_000,
    'Paris': 2_200_000
}
population_series: pd.Series = pd.Series(data=population_data)

# --- Report: list-based Series ---
print(f"Temperature Series:\n{temperatures}")
print(f"Data type: {temperatures.dtype}")
print(f"Shape: {temperatures.shape}")

# --- Report: labelled Series ---
print(f"\nTemperatures by city:\n{temperatures_indexed}")

# --- Report: dictionary-based Series ---
print(f"\nCity populations:\n{population_series}")

# Basic Series operations: aggregation plus a boolean-mask filter
print(f"\nSeries operations:")
print(f"Mean temperature: {temperatures_indexed.mean():.2f}°C")
print(f"Max population: {population_series.max():,}")
print(f"Cities with population > 8M: {population_series.loc[population_series.gt(8_000_000)].index.tolist()}")

In [None]:
# Creating pandas DataFrames (2-dimensional labeled data structure)
print("=== Pandas DataFrames ===")

# DataFrame from dictionary: each key is a column name, each list a column
employee_data: Dict[str, List[Any]] = {
    'employee_id': ['EMP001', 'EMP002', 'EMP003', 'EMP004', 'EMP005'],
    'name': ['Alice Johnson', 'Bob Smith', 'Charlie Brown', 'Diana Davis', 'Eve Wilson'],
    'department': ['Engineering', 'Marketing', 'Sales', 'Engineering', 'HR'],
    'salary': [75000, 65000, 70000, 80000, 60000],
    # hire_date is kept as plain strings here; this cell does not convert it
    'hire_date': ['2022-01-15', '2021-06-01', '2023-03-10', '2020-11-20', '2022-08-05'],
    'is_active': [True] * 5
}

df_employees: pd.DataFrame = pd.DataFrame(data=employee_data)
print(f"Employee DataFrame:\n{df_employees}")

# DataFrame info: dimensions, column labels, row labels and column dtypes
print(
    f"\nDataFrame shape: {df_employees.shape}",
    f"Column names: {df_employees.columns.tolist()}",
    f"Index: {df_employees.index.tolist()}",
    f"Data types:\n{df_employees.dtypes}",
    sep="\n",
)

In [None]:
# DataFrame from list of dictionaries (common in data engineering):
# every dict is one row, and its keys supply the column names
sales_records: List[Dict[str, Any]] = [
    {'transaction_id': 'TXN001', 'customer_id': 'CUST001', 'product': 'Laptop', 'quantity': 1, 'amount': 999.99, 'date': '2024-01-15'},
    {'transaction_id': 'TXN002', 'customer_id': 'CUST002', 'product': 'Mouse', 'quantity': 2, 'amount': 59.98, 'date': '2024-01-15'},
    {'transaction_id': 'TXN003', 'customer_id': 'CUST001', 'product': 'Keyboard', 'quantity': 1, 'amount': 79.99, 'date': '2024-01-16'},
    {'transaction_id': 'TXN004', 'customer_id': 'CUST003', 'product': 'Monitor', 'quantity': 1, 'amount': 299.99, 'date': '2024-01-16'},
    {'transaction_id': 'TXN005', 'customer_id': 'CUST002', 'product': 'Webcam', 'quantity': 1, 'amount': 89.99, 'date': '2024-01-17'}
]

df_sales: pd.DataFrame = pd.DataFrame(data=sales_records)
print(f"Sales DataFrame:\n{df_sales}")

# Convert date column to datetime (the column starts out as plain strings)
df_sales['date'] = df_sales['date'].pipe(pd.to_datetime)
print(f"\nAfter date conversion:\n{df_sales.dtypes}")

# DataFrame from numpy array; seeding first makes the random values repeatable
np.random.seed(42)
random_data: np.ndarray = np.random.randn(5, 3)
df_random: pd.DataFrame = pd.DataFrame(
    data=random_data,
    index=[f'Row_{i}' for i in range(1, 6)],
    columns=['A', 'B', 'C'],
)
print(f"\nDataFrame from NumPy array:\n{df_random}")

## 2. Data Exploration and Inspection

In [None]:
# Create a more comprehensive dataset for exploration
def create_sample_dataset(
    n_records: int = 100,
    seed: int = 42,
    n_missing: int = 5,
) -> pd.DataFrame:
    """
    Create a reproducible synthetic customer-purchase dataset for exploration.

    Called with no arguments this produces exactly the same 100-row frame as
    before; the parameters only expose what used to be hard-coded.

    Args:
        n_records: Number of rows to generate (one customer per row).
        seed: Value passed to ``np.random.seed``. Note that this reseeds
            NumPy's *global* random state, so random draws made after this
            call are affected too.
        n_missing: How many ``discount_percent`` values to replace with NaN
            to simulate incomplete data. Capped at ``n_records``.

    Returns:
        DataFrame with the columns ``customer_id``, ``age``, ``gender``,
        ``city``, ``product_category``, ``purchase_amount``,
        ``discount_percent``, ``is_premium`` and ``purchase_date``.

    Raises:
        ValueError: If ``n_records`` or ``n_missing`` is negative.
    """
    if n_records < 0 or n_missing < 0:
        raise ValueError("n_records and n_missing must be non-negative")

    np.random.seed(seed)

    # Generate sample data. The order of the random draws below determines the
    # generated values, so keep the keys in this order.
    data = {
        'customer_id': [f'CUST{i:04d}' for i in range(1, n_records + 1)],
        'age': np.random.randint(18, 80, n_records),
        'gender': np.random.choice(['M', 'F', 'Other'], n_records, p=[0.45, 0.45, 0.1]),
        'city': np.random.choice(['New York', 'London', 'Tokyo', 'Sydney', 'Paris'], n_records),
        'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Home', 'Sports'], n_records),
        'purchase_amount': np.round(np.random.uniform(10, 1000, n_records), 2),
        'discount_percent': np.round(np.random.uniform(0, 25, n_records), 1),
        'is_premium': np.random.choice([True, False], n_records, p=[0.3, 0.7]),
        # periods=n_records already yields exactly n_records consecutive days
        'purchase_date': pd.date_range('2024-01-01', periods=n_records, freq='D')
    }

    df = pd.DataFrame(data)

    # Add some missing values for realistic data. Sampling without replacement
    # cannot pick more rows than exist, so cap the request for small datasets.
    n_missing = min(n_missing, n_records)
    if n_missing > 0:
        missing_indices = np.random.choice(df.index, size=n_missing, replace=False)
        df.loc[missing_indices, 'discount_percent'] = np.nan

    return df

# Create sample dataset (defaults reproduce the original 100-row frame)
df_sample: pd.DataFrame = create_sample_dataset()
print(f"Sample dataset created with {len(df_sample)} records")
print("\nFirst 5 rows:")
print(df_sample.head())

In [None]:
# Essential DataFrame inspection methods
print("=== DataFrame Inspection ===")

# Basic information: dimensions, column labels, then per-column dtypes
print(
    f"Shape: {df_sample.shape}",
    f"\nColumn names: {df_sample.columns.tolist()}",
    f"\nData types:",
    sep="\n",
)
print(df_sample.dtypes)

# Memory usage per column; deep=True asks pandas to also measure the
# contents of object columns rather than just their references
print(f"\nMemory usage:")
memory_by_column = df_sample.memory_usage(deep=True)
print(memory_by_column)

# Info method - comprehensive overview (info() writes its report itself,
# so it is called directly rather than wrapped in print)
print(f"\nDataFrame info:")
df_sample.info()

# Statistical summary (describe() with no arguments covers numeric columns)
print(f"\nStatistical summary:")
numeric_summary = df_sample.describe()
print(numeric_summary)

# Summary for non-numeric columns
# NOTE(review): on newer pandas, text columns may use a dedicated string dtype
# instead of 'object' -- confirm this selector still picks them up there.
print(f"\nNon-numeric column summary:")
object_summary = df_sample.describe(include=['object'])
print(object_summary)

In [None]:
# Missing data analysis
print("=== Missing Data Analysis ===")

# Null count per column; only columns that actually contain nulls are printed
missing_counts = df_sample.isnull().sum()
print(f"Missing values per column:")
print(missing_counts.loc[missing_counts > 0])

# The same counts expressed as a percentage of all rows
missing_percentage = (missing_counts / len(df_sample)) * 100
print(f"\nMissing value percentages:")
print(missing_percentage.loc[missing_percentage > 0])

# Unique values analysis: number of distinct values in every column
print(f"\n=== Unique Values Analysis ===")
for column, unique_count in df_sample.nunique().items():
    print(f"{column}: {unique_count} unique values")

# Value counts for categorical columns: frequency of each distinct value
categorical_columns = ['gender', 'city', 'product_category']
for col in categorical_columns:
    distribution = df_sample.loc[:, col].value_counts()
    print(f"\n{col} distribution:")
    print(distribution)

## 3. Data Selection and Filtering

In [None]:
# Column selection
print("=== Column Selection ===")

# A single column label selects a Series
customer_ids: pd.Series = df_sample.loc[:, 'customer_id']
print(f"Customer IDs (first 5): {customer_ids.head().tolist()}")

# A one-element list of labels selects a one-column DataFrame instead
customer_ids_df: pd.DataFrame = df_sample.loc[:, ['customer_id']]
print(f"\nCustomer IDs as DataFrame shape: {customer_ids_df.shape}")

# A list of several labels selects those columns, in the order listed
customer_info: pd.DataFrame = df_sample.loc[:, ['customer_id', 'age', 'gender', 'city']]
print(f"\nCustomer info (first 3 rows):")
print(customer_info.head(3))

# Select columns by data type rather than by name
numeric_columns: pd.DataFrame = df_sample.select_dtypes(include='number')
print(f"\nNumeric columns: {numeric_columns.columns.tolist()}")

# NOTE(review): on newer pandas, text columns may use a dedicated string dtype
# instead of 'object' -- confirm this selector still picks them up there.
categorical_columns_df: pd.DataFrame = df_sample.select_dtypes(include='object')
print(f"Categorical columns: {categorical_columns_df.columns.tolist()}")

In [None]:
# Row selection and filtering
print("=== Row Selection and Filtering ===")

# Select rows by integer position (iloc is positional, not label-based)
first_5_rows: pd.DataFrame = df_sample.iloc[:5]
print(f"First 5 rows shape: {first_5_rows.shape}")

# Select specific rows by a list of integer positions
specific_rows: pd.DataFrame = df_sample.iloc[[0, 2, 4, 6, 8]]
print("\nSpecific rows (0,2,4,6,8):")
print(specific_rows[['customer_id', 'age', 'purchase_amount']])

# Boolean filtering
print("\n=== Boolean Filtering ===")

# Filter by single condition
high_value_customers: pd.DataFrame = df_sample[df_sample['purchase_amount'] > 500]
print(f"High-value customers (>$500): {len(high_value_customers)} records")

# Filter by multiple conditions. Each condition is wrapped in parentheses
# because & binds tighter than ==. A boolean column is already a valid mask,
# so it is used directly instead of being compared with True.
premium_electronics: pd.DataFrame = df_sample[
    df_sample['is_premium'] &
    (df_sample['product_category'] == 'Electronics')
]
print(f"Premium electronics customers: {len(premium_electronics)} records")

# Filter using isin() method
major_cities: pd.DataFrame = df_sample[df_sample['city'].isin(['New York', 'London', 'Tokyo'])]
print(f"Customers in major cities: {len(major_cities)} records")

# Filter by string prefix. startswith() does a literal match anchored at the
# start of the id, whereas str.contains() treats its pattern as a regular
# expression and matches anywhere in the string.
# Note: despite the variable name, the 'CUST000' prefix selects the ids
# reported in the message below (CUST0001-CUST0009).
cust_001_to_010: pd.DataFrame = df_sample[df_sample['customer_id'].str.startswith('CUST000')]
print(f"Customers CUST0001-CUST0009: {len(cust_001_to_010)} records")

In [None]:
# Advanced selection with loc and iloc
print("=== Advanced Selection with loc and iloc ===")

# loc: label-based selection
# The row slice is by label and includes its end label, so :4 covers 0..4
subset_loc: pd.DataFrame = df_sample.loc[:4, ['customer_id', 'age', 'purchase_amount']]
print(f"Using loc (rows 0-4, specific columns):")
print(subset_loc)

# iloc: integer position-based selection
# The row slice is by position and excludes its end, so :5 covers 0..4;
# the columns are picked by position as well (0, 1 and 5)
subset_iloc: pd.DataFrame = df_sample.iloc[:5, [0, 1, 5]]
print(f"\nUsing iloc (rows 0-4, columns 0,1,5):")
print(subset_iloc)

# Conditional selection with loc: boolean row mask plus a column list
young_customers: pd.DataFrame = df_sample.loc[
    df_sample['age'].lt(30),
    ['customer_id', 'age', 'city', 'purchase_amount'],
]
print(f"\nYoung customers (<30 years): {len(young_customers)} records")
print(young_customers.head())

# Query method for complex filtering: the condition is written as a string
# that refers to columns by name
query_result: pd.DataFrame = df_sample.query('age >= 25 and age <= 45 and purchase_amount > 100')
print(f"\nQuery result (age 25-45, amount >$100): {len(query_result)} records")
print(query_result.loc[:, ['customer_id', 'age', 'purchase_amount']].head())

## 4. Basic Data Manipulation

In [None]:
# Adding new columns
print("=== Adding New Columns ===")

# Work on a copy so df_sample itself is left untouched, and derive the two
# calculated columns in a single assign() call. The callables receive the
# frame being built, so final_amount can read the discount_amount column
# defined just before it.
df_work: pd.DataFrame = df_sample.copy().assign(
    discount_amount=lambda frame: frame['purchase_amount'] * (frame['discount_percent'] / 100),
    final_amount=lambda frame: frame['purchase_amount'] - frame['discount_amount'],
)

print(f"Added discount_amount and final_amount columns")
print(df_work.loc[:, ['purchase_amount', 'discount_percent', 'discount_amount', 'final_amount']].head())
# Add categorical column based on conditions
def categorize_age(age: int) -> str:
    """
    Map an age to a coarse age-group label.

    Args:
        age: Age to classify.

    Returns:
        'Young' for ages below 25, 'Middle' for ages from 25 up to (but not
        including) 45, and 'Senior' for everything else.
    """
    if age < 25:
        return 'Young'
    return 'Middle' if age < 45 else 'Senior'

# Label every customer by running categorize_age on each age value
df_work['age_group'] = df_work['age'].map(categorize_age)

print(f"\nAge group distribution:")
print(df_work['age_group'].value_counts())

# Add column using numpy select: conditions are tested in order and the first
# match wins, so premium customers are always 'Premium', non-premium customers
# with a purchase above 200 are 'Gold', and everyone else is 'Standard'
df_work['customer_tier'] = np.select(
    [df_work['is_premium'], df_work['purchase_amount'] > 200],
    ['Premium', 'Gold'],
    default='Standard',
)

print(f"\nCustomer tier distribution:")
print(df_work['customer_tier'].value_counts())

In [None]:
# Modifying existing columns
print("=== Modifying Existing Columns ===")

# String operations: upper-cased city and a shortened customer code
df_work = df_work.assign(
    city_upper=df_work['city'].str.upper(),
    customer_code=df_work['customer_id'].str.replace('CUST', 'C'),
)

print(f"String transformations:")
print(df_work.loc[:, ['city', 'city_upper', 'customer_id', 'customer_code']].head())

# Numeric operations: whole-number rounding and min-max scaling of age
# (the youngest age maps to 0 and the oldest to 1)
df_work = df_work.assign(
    purchase_amount_rounded=df_work['purchase_amount'].round(0),
    age_normalized=df_work['age'].sub(df_work['age'].min()).div(
        df_work['age'].max() - df_work['age'].min()
    ),
)

print(f"\nNumeric transformations:")
print(df_work.loc[:, ['purchase_amount', 'purchase_amount_rounded', 'age', 'age_normalized']].head())

# Date operations: break the purchase date into year, month and weekday name
df_work = df_work.assign(
    purchase_year=lambda frame: frame['purchase_date'].dt.year,
    purchase_month=lambda frame: frame['purchase_date'].dt.month,
    purchase_day_name=lambda frame: frame['purchase_date'].dt.day_name(),
)

print(f"\nDate transformations:")
print(df_work.loc[:, ['purchase_date', 'purchase_year', 'purchase_month', 'purchase_day_name']].head())

In [None]:
# Sorting and ranking
print("=== Sorting and Ranking ===")

# Sort by single column, largest purchase first
df_sorted_amount: pd.DataFrame = df_work.sort_values(by='purchase_amount', ascending=False)
print(f"Top 5 purchases by amount:")
print(df_sorted_amount.loc[:, ['customer_id', 'purchase_amount', 'product_category']].head())

# Sort by multiple columns; each column gets its own direction flag
df_sorted_multi: pd.DataFrame = df_work.sort_values(
    by=['city', 'purchase_amount'],
    ascending=[True, False],
)
print(f"\nSorted by city (asc) then amount (desc):")
print(df_sorted_multi.loc[:, ['customer_id', 'city', 'purchase_amount']].head(10))

# Add ranking columns:
#   amount_rank    - rank 1 is the largest purchase (descending, dense ranks)
#   age_percentile - rank of the age expressed as a fraction (pct=True)
df_work = df_work.assign(
    amount_rank=df_work['purchase_amount'].rank(method='dense', ascending=False),
    age_percentile=df_work['age'].rank(pct=True),
)

print(f"\nRanking examples:")
top_purchases = df_work.nsmallest(n=5, columns='amount_rank')
print(top_purchases.loc[:, ['customer_id', 'purchase_amount', 'amount_rank', 'age', 'age_percentile']])

# Group ranking: the same descending dense rank, computed within each city
df_work['city_amount_rank'] = (
    df_work
    .groupby('city')['purchase_amount']
    .rank(method='dense', ascending=False)
)
print(f"\nTop purchase in each city:")
top_by_city = df_work.loc[df_work['city_amount_rank'].eq(1)]
print(top_by_city.loc[:, ['customer_id', 'city', 'purchase_amount', 'city_amount_rank']])

## 5. Handling Missing Data

In [None]:
# Working with missing data
print("=== Handling Missing Data ===")

# Check current missing data
print(f"Missing values in discount_percent: {df_work['discount_percent'].isnull().sum()}")

# Different strategies for handling missing data.
# Each strategy works on its own copy so df_work keeps its NaNs and the
# results can be compared side by side at the end.

# 1. Fill with a constant value
df_fill_zero = df_work.copy()
df_fill_zero['discount_percent'] = df_fill_zero['discount_percent'].fillna(0)
print(f"\nAfter filling with 0: {df_fill_zero['discount_percent'].isnull().sum()} missing values")

# 2. Fill with mean/median (mean() skips NaN values by default)
df_fill_mean = df_work.copy()
mean_discount = df_fill_mean['discount_percent'].mean()
df_fill_mean['discount_percent'] = df_fill_mean['discount_percent'].fillna(mean_discount)
print(f"After filling with mean ({mean_discount:.2f}): {df_fill_mean['discount_percent'].isnull().sum()} missing values")

# 3. Forward fill (use previous value)
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
# A missing value in the very first row has no previous value to copy,
# so it would remain NaN under this strategy.
df_ffill = df_work.copy()
df_ffill['discount_percent'] = df_ffill['discount_percent'].ffill()
print(f"After forward fill: {df_ffill['discount_percent'].isnull().sum()} missing values")

# 4. Fill based on group statistics: each gap gets the mean discount of
# its own product category
df_group_fill = df_work.copy()
df_group_fill['discount_percent'] = df_group_fill.groupby('product_category')['discount_percent'].transform(
    lambda x: x.fillna(x.mean())
)
print(f"After group-based fill: {df_group_fill['discount_percent'].isnull().sum()} missing values")

# 5. Drop rows with missing values (only rows missing discount_percent)
df_dropna = df_work.dropna(subset=['discount_percent'])
print(f"\nOriginal rows: {len(df_work)}, After dropping NaN: {len(df_dropna)}")

# Show comparison of different strategies
comparison_data = {
    'Original': df_work['discount_percent'].iloc[:10],
    'Fill_Zero': df_fill_zero['discount_percent'].iloc[:10],
    'Fill_Mean': df_fill_mean['discount_percent'].iloc[:10],
    'Group_Fill': df_group_fill['discount_percent'].iloc[:10]
}

comparison_df = pd.DataFrame(comparison_data)
print("\nComparison of missing data strategies (first 10 rows):")
print(comparison_df)