## Imports

In [2]:
import pandas as pd
import numpy as np

## DataFrame Merge Key Mismatches

In [3]:
# Build two small frames whose join key is spelled differently on each side:
# 'customer_id' in sales_df versus 'CustomerID' in customer_df. The id lists
# also differ (104 appears only in sales, 105 only in customers).
sales_df = pd.DataFrame(
    {
        'customer_id': [101, 102, 103, 104],
        'sale_amount': [1500, 2300, 1800, 3200],
    }
)
customer_df = pd.DataFrame(
    {
        'CustomerID': [101, 102, 103, 105],
        'customer_name': ['Alice', 'Bob', 'Charlie', 'Eve'],
    }
)

# Deliberately name a right-hand key that customer_df does not contain so the
# resulting KeyError can be caught and displayed.
try:
    merged_df = pd.merge(
        sales_df, customer_df, left_on='customer_id', right_on='customer_id'
    )
except KeyError as err:
    print("KeyError:", err)


KeyError: 'customer_id'


In [4]:
# Correct solution: name each side's own key column explicitly. The default
# inner join keeps only ids present in both frames, and both key columns
# appear in the merged result.
merged_df = pd.merge(
    sales_df,
    customer_df,
    left_on='customer_id',
    right_on='CustomerID',
)
print(merged_df)

   customer_id  sale_amount  CustomerID customer_name
0          101         1500         101         Alice
1          102         2300         102           Bob
2          103         1800         103       Charlie


## Mixed Data Types in Operations

In [5]:
# A single column of numbers stored as text, including a non-numeric 'NA'
mixed_df = pd.DataFrame(['100', '200', 'NA', '400', '500'], columns=['value'])

# Averaging text values fails; catch the TypeError so it can be displayed
try:
    result = mixed_df['value'].mean()
except TypeError as err:
    print("TypeError:", err)

TypeError: Could not convert string '100200NA400500' to numeric


In [6]:
# Correct solution: convert the text column to numbers before aggregating.
# errors='coerce' turns entries that cannot be parsed (here 'NA') into NaN
# instead of raising.
mixed_df['value'] = pd.to_numeric(mixed_df['value'], errors='coerce')
# mean() skips NaN by default, so only the four parsed values are averaged.
result = mixed_df['value'].mean()
result

300.0


## The SettingWithCopyWarning

In [8]:
# Create sample dataframe
data = pd.DataFrame({
    'category': ['A', 'A', 'B', 'B', 'C'],
    'value': [1, 2, 3, 4, 5]
})

# This will trigger a warning: subset_data is obtained by boolean-filtering
# `data`, and the assignment on the next line then writes into that filtered
# result, which is the pattern pandas flags with SettingWithCopyWarning.
# NOTE(review): whether the warning is emitted may depend on the pandas
# version / copy-on-write settings in use - confirm on the target environment.
subset_data = data[data['category'] == 'A']
subset_data['value'] = subset_data['value'] * 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_data['value'] = subset_data['value'] * 2


In [9]:
# Correct solution: take an explicit, independent copy of the filtered rows so
# the assignment below unambiguously targets subset_data only.
subset_data = data.loc[data['category'] == 'A'].copy()
subset_data['value'] = subset_data['value'].mul(2)

## NaN Propagation in Calculations

In [15]:
# Sample finance data with one missing revenue and one missing cost
finance_df = pd.DataFrame(
    data={
        'revenue': [1000, 2000, np.nan, 4000],
        'costs': [500, np.nan, 1500, 2000],
    }
)

# A NaN on either side of the subtraction makes that row's profit NaN,
# which is the unexpected result being demonstrated.
profit = finance_df['revenue'].sub(finance_df['costs'])
print(profit)

0     500.0
1       NaN
2       NaN
3    2000.0
dtype: float64


In [16]:
# Replace missing revenue/costs with 0 before subtracting so that every row
# gets a numeric profit.
# NOTE(review): this treats an unknown amount as zero - confirm that is the
# intended meaning for the data at hand.
profit = finance_df['revenue'].fillna(0).sub(finance_df['costs'].fillna(0))
print(profit)

0     500.0
1    2000.0
2   -1500.0
3    2000.0
dtype: float64


## Index Alignment Issues

In [24]:
# Create sample dataframes with different indices
df_1 = pd.DataFrame({'value': [1, 2, 3]}, index=['A', 'B', 'C'])
df_2 = pd.DataFrame({'value': [4, 5, 6]}, index=['B', 'C', 'D'])

# Adding two Series aligns them on index labels rather than on position.
# This does not raise: labels present on only one side ('A' and 'D') silently
# become NaN, which is the unexpected result being demonstrated.
# No try/except is used here, so a genuine failure would surface immediately
# instead of being swallowed and leaving `result` unset (or stale from an
# earlier cell) for the print below.
result = df_1['value'] + df_2['value']

print(result)

A    NaN
B    6.0
C    8.0
D    NaN
Name: value, dtype: float64


In [25]:
# Let .add() substitute 0 for a label that is missing on one side, so 'A' and
# 'D' keep their single available value instead of turning into NaN.
result = df_1['value'].add(other=df_2['value'], fill_value=0)
print(result)

A    1.0
B    6.0
C    8.0
D    6.0
Name: value, dtype: float64


## Memory Issues with Large DataFrames

In [23]:
def simulate_memory_issue(n_rows=1_000_000, n_cols=10, seed=None):
    """Filter a random frame one column at a time (the inefficient pattern).

    Builds an ``n_rows`` x ``n_cols`` DataFrame of standard-normal values and
    keeps only the rows that are positive in every column. The filtering is
    deliberately done with one boolean-indexing step per column, each of
    which builds a new intermediate DataFrame.

    Parameters
    ----------
    n_rows : int, default 1_000_000
        Number of rows to generate.
    n_cols : int, default 10
        Number of columns to generate.
    seed : int or None, default None
        Seed for the random generator. ``None`` draws fresh data on every
        call; pass an integer for reproducible output.

    Returns
    -------
    pandas.DataFrame
        The rows whose values are > 0 in all columns, keeping their original
        index labels.
    """
    # A local generator keeps the draw reproducible when a seed is given
    rng = np.random.default_rng(seed)
    big_df = pd.DataFrame(rng.standard_normal((n_rows, n_cols)))

    # Inefficient way (creates multiple copies): each pass over a column
    # produces another filtered DataFrame.
    processed_df = big_df
    for col in big_df.columns:
        processed_df = processed_df[processed_df[col] > 0]

    return processed_df

# Run the column-by-column filtering once and keep the resulting frame
proc_df = simulate_memory_issue()

In [None]:
def a_better_processing_func(n_rows=1_000_000, n_cols=10, seed=None):
    """Filter a random frame with a single vectorized mask.

    Builds an ``n_rows`` x ``n_cols`` DataFrame of standard-normal values and
    keeps only the rows that are positive in every column, using one boolean
    mask computed across all columns followed by one indexing step.

    Parameters
    ----------
    n_rows : int, default 1_000_000
        Number of rows to generate.
    n_cols : int, default 10
        Number of columns to generate.
    seed : int or None, default None
        Seed for the random generator. ``None`` draws fresh data on every
        call; pass an integer for reproducible output.

    Returns
    -------
    pandas.DataFrame
        The rows whose values are > 0 in all columns, keeping their original
        index labels.
    """
    # A local generator keeps the draw reproducible when a seed is given
    rng = np.random.default_rng(seed)
    big_df = pd.DataFrame(rng.standard_normal((n_rows, n_cols)))

    # Efficient solution: build one row mask over all columns, filter once
    mask = (big_df > 0).all(axis=1)
    processed_df = big_df[mask]

    return processed_df