In [10]:
import numpy as np
import pandas as pd

pd.set_option("display.width", 120)
pd.set_option("display.max_columns", 20)
pd.set_option("display.precision", 4)

rng = np.random.default_rng(42)

# 1.1 Create a Series of stock prices with DatetimeIndex
today = pd.to_datetime('today').normalize()
dates = pd.date_range(end=today, periods=7, freq='D')
stock_prices = pd.Series([100, 102, 101, 105, 103, 106, 108], index=dates)

print("Stock Prices:")
display(stock_prices)

daily_returns = stock_prices.pct_change()
print("\nDaily Percentage Change:")
display(daily_returns)

max_increase_day = daily_returns.idxmax()
max_drop_day = daily_returns.idxmin()

print(f"\nDay with Maximum Increase: {max_increase_day.date()}")
print(f"Day with Maximum Drop: {max_drop_day.date()}")

# 1.2 Demonstrate .loc and .iloc
print("\nDemonstrating .loc and .iloc:")
# Label-based selection with .loc
loc_selection = stock_prices.loc[dates[-2]]
print(f"\n.loc selection (by label {dates[-2].date()}): {loc_selection}")

# Position-based selection with .iloc
iloc_selection = stock_prices.iloc[-2]
print(f".iloc selection (by position -2): {iloc_selection}")

# 2.1 Construct an employee DataFrame
employee_data = {
    'Department': ['Sales', 'Marketing', 'Sales', 'Engineering', 'Marketing', 'Engineering', 'Sales'],
    'Salary': [50000, 60000, 55000, 80000, 62000, 85000, 53000],
    'Years_of_Experience': [2.5, 5.0, 3.0, 10.0, 4.5, 12.0, 2.0]
}
employees_df = pd.DataFrame(employee_data, index=['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace'])

print("Employee DataFrame:")
display(employees_df)

print("\nDescription of numeric columns:")
display(employees_df.describe())

dept_mean_salary = employees_df.groupby('Department')['Salary'].transform('mean')
employees_above_mean = employees_df[employees_df['Salary'] > dept_mean_salary]

print("\nEmployees above their department's mean salary:")
display(employees_above_mean)


# 2.2 Create a synthetic DataFrame for restaurant bills
bills_data = {
    'total_bill': rng.uniform(10, 50, 12),
    'tip': rng.uniform(1, 10, 12),
    'sex': rng.choice(['Female', 'Male'], 12),
    'smoker': rng.choice(['Yes', 'No'], 12),
    'day': rng.choice(['Thur', 'Fri', 'Sat', 'Sun'], 12),
    'time': rng.choice(['Lunch', 'Dinner'], 12),
    'size': rng.integers(1, 6, 12)
}
bills_df = pd.DataFrame(bills_data)

print("\nRestaurant Bills DataFrame:")
display(bills_df.head())
display(bills_df.tail())
display(bills_df.info())
display(bills_df.shape)

bills_df_renamed = bills_df.rename(columns={'total_bill': 'bill', 'time': 'meal'})
print("\nRestaurant Bills DataFrame with renamed columns:")
display(bills_df_renamed.head())

# 3.1 Select rows based on conditions
# Method 1: Boolean masking
condition_mask = (bills_df['tip'] >= 0.15 * bills_df['total_bill']) & \
                 (bills_df['day'] == "Sun") & \
                 (bills_df['size'] >= 3)
filtered_bills_mask = bills_df[condition_mask]

print("Filtered rows using boolean masking:")
display(filtered_bills_mask)

# Method 2: .query()
filtered_bills_query = bills_df.query('tip >= 0.15 * total_bill and day == "Sun" and size >= 3')

print("\nFiltered rows using .query():")
display(filtered_bills_query)

num_rows_satisfied = len(filtered_bills_mask)
print(f"\nNumber of rows satisfying the condition: {num_rows_satisfied}")

# 3.2 Demonstrate two different ways to select multiple columns
# Method 1: Using a list of column names
selected_columns_list = bills_df[['total_bill', 'tip', 'size']]
print("\nSelecting multiple columns using a list:")
display(selected_columns_list.head())

# Method 2: Using the .loc accessor with slicing
selected_columns_filter = bills_df.filter(items=['total_bill', 'tip', 'size'])
print("\nSelecting multiple columns using .filter():")
display(selected_columns_filter.head())

# 4.1 Introduce and handle missing data
# Introduce NaN in 'tip' for 20% of rows
np.random.seed(42) # for reproducibility
mask = np.random.choice([False, True], size=bills_df.shape[0], p=[0.8, 0.2])
bills_miss = bills_df.copy()
bills_miss.loc[mask, 'tip'] = np.nan

print("DataFrame with introduced missing values:")
display(bills_miss)

print("\nMissing values per column:")
display(bills_miss.isna().sum())

print("\nRows affected by missing 'tip' values:")
display(bills_miss[bills_miss['tip'].isna()])

mean_tip = bills_miss['tip'].mean()
bills_miss['tip_imputed_mean'] = bills_miss['tip'].fillna(mean_tip)

print("\nDataFrame with 'tip' imputed by mean:")
display(bills_miss)

bills_dropped = bills_miss.dropna(subset=['tip']).copy()

print("\nDataFrame with rows with NaN in 'tip' dropped:")
display(bills_dropped)

print(f"\nSize of imputed dataset: {bills_miss.shape[0]}")
print(f"Size of dropped dataset: {bills_dropped.shape[0]}")

# 4.2 Create a small inconsistency and standardize
bills_inconsistent = bills_df.copy()

bills_inconsistent.loc[bills_inconsistent['day'] == 'Sun', 'day'] = 'sun'

print("\nDataFrame with inconsistent 'day' values (before standardization):")
display(bills_inconsistent['day'].unique())

bills_inconsistent['day_standardized'] = bills_inconsistent['day'].str.title()

print("\nDataFrame with standardized 'day' values (after standardization):")
display(bills_inconsistent['day_standardized'].unique())

# 5.1 Group by 'day' and compute mean, std of numeric columns, and count of rows per group
grouped_stats = bills_miss.groupby('day')[['total_bill', 'tip_imputed_mean', 'size']].agg(['mean', 'std'])
grouped_count = bills_miss.groupby('day').size().rename('count')

# Combine stats and count into a single DataFrame
result_5_1 = grouped_stats.copy()
result_5_1['count'] = grouped_count

print("5.1 Grouped stats (mean, std) and count per day:")
print(result_5_1)

# 5.2 Define custom aggregation: peak-to-peak (ptp)
def ptp(x):
    return x.max() - x.min()

result_5_2 = bills_miss.groupby('day').agg({
    'total_bill': ptp,
    'tip_imputed_mean': ptp
})

print("\n5.2 Peak-to-peak range for total_bill and tip_imputed_mean per day:")
print(result_5_2)



# 6.1 Add a normalized column bill_norm = min-max scaling of total_bill to [0,1]
min_bill = bills_miss['total_bill'].min()
max_bill = bills_miss['total_bill'].max()
bills_miss['bill_norm'] = (bills_miss['total_bill'] - min_bill) / (max_bill - min_bill)

print("DataFrame with 'bill_norm' column:")
display(bills_miss)

# 6.2 Sort by day (ascending) and tip_imputed_mean (descending)
bills_sorted = bills_miss.sort_values(by=['day', 'tip_imputed_mean'], ascending=[True, False])

print("\nDataFrame sorted by 'day' and 'tip_imputed_mean':")
display(bills_sorted.head())


Stock Prices:


Unnamed: 0,0
2025-10-07,100
2025-10-08,102
2025-10-09,101
2025-10-10,105
2025-10-11,103
2025-10-12,106
2025-10-13,108



Daily Percentage Change:


Unnamed: 0,0
2025-10-07,
2025-10-08,0.02
2025-10-09,-0.0098
2025-10-10,0.0396
2025-10-11,-0.019
2025-10-12,0.0291
2025-10-13,0.0189



Day with Maximum Increase: 2025-10-10
Day with Maximum Drop: 2025-10-11

Demonstrating .loc and .iloc:

.loc selection (by label 2025-10-12): 106
.iloc selection (by position -2): 106
Employee DataFrame:


Unnamed: 0,Department,Salary,Years_of_Experience
Alice,Sales,50000,2.5
Bob,Marketing,60000,5.0
Charlie,Sales,55000,3.0
David,Engineering,80000,10.0
Eve,Marketing,62000,4.5
Frank,Engineering,85000,12.0
Grace,Sales,53000,2.0



Description of numeric columns:


Unnamed: 0,Salary,Years_of_Experience
count,7.0,7.0
mean,63571.4286,5.5714
std,13624.2081,3.899
min,50000.0,2.0
25%,54000.0,2.75
50%,60000.0,4.5
75%,71000.0,7.5
max,85000.0,12.0



Employees above their department's mean salary:


Unnamed: 0,Department,Salary,Years_of_Experience
Charlie,Sales,55000,3.0
Eve,Marketing,62000,4.5
Frank,Engineering,85000,12.0
Grace,Sales,53000,2.0



Restaurant Bills DataFrame:


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,40.9582,6.7948,Male,No,Fri,Dinner,2
1,27.5551,8.4049,Male,No,Thur,Dinner,4
2,44.3439,4.9907,Male,Yes,Sat,Lunch,4
3,37.8947,3.0451,Female,No,Fri,Lunch,1
4,13.7671,5.9913,Female,Yes,Fri,Dinner,5


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
7,41.4426,6.685,Female,Yes,Sat,Dinner,1
8,15.1245,7.8228,Male,Yes,Sun,Dinner,4
9,28.0154,4.1907,Female,Yes,Fri,Lunch,4
10,24.8319,9.7363,Male,No,Thur,Dinner,4
11,47.0706,9.0381,Male,Yes,Sun,Lunch,4


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  12 non-null     float64
 1   tip         12 non-null     float64
 2   sex         12 non-null     object 
 3   smoker      12 non-null     object 
 4   day         12 non-null     object 
 5   time        12 non-null     object 
 6   size        12 non-null     int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 804.0+ bytes


None

(12, 7)


Restaurant Bills DataFrame with renamed columns:


Unnamed: 0,bill,tip,sex,smoker,day,meal,size
0,40.9582,6.7948,Male,No,Fri,Dinner,2
1,27.5551,8.4049,Male,No,Thur,Dinner,4
2,44.3439,4.9907,Male,Yes,Sat,Lunch,4
3,37.8947,3.0451,Female,No,Fri,Lunch,1
4,13.7671,5.9913,Female,Yes,Fri,Dinner,5


Filtered rows using boolean masking:


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
8,15.1245,7.8228,Male,Yes,Sun,Dinner,4
11,47.0706,9.0381,Male,Yes,Sun,Lunch,4



Filtered rows using .query():


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
8,15.1245,7.8228,Male,Yes,Sun,Dinner,4
11,47.0706,9.0381,Male,Yes,Sun,Lunch,4



Number of rows satisfying the condition: 2

Selecting multiple columns using a list:


Unnamed: 0,total_bill,tip,size
0,40.9582,6.7948,2
1,27.5551,8.4049,4
2,44.3439,4.9907,4
3,37.8947,3.0451,1
4,13.7671,5.9913,5



Selecting multiple columns using .filter():


Unnamed: 0,total_bill,tip,size
0,40.9582,6.7948,2
1,27.5551,8.4049,4
2,44.3439,4.9907,4
3,37.8947,3.0451,1
4,13.7671,5.9913,5


DataFrame with introduced missing values:


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,40.9582,6.7948,Male,No,Fri,Dinner,2
1,27.5551,,Male,No,Thur,Dinner,4
2,44.3439,4.9907,Male,Yes,Sat,Lunch,4
3,37.8947,3.0451,Female,No,Fri,Lunch,1
4,13.7671,5.9913,Female,Yes,Fri,Dinner,5
5,49.0249,1.5744,Female,Yes,Thur,Dinner,1
6,40.4456,8.4487,Female,No,Sat,Lunch,5
7,41.4426,,Female,Yes,Sat,Dinner,1
8,15.1245,7.8228,Male,Yes,Sun,Dinner,4
9,28.0154,4.1907,Female,Yes,Fri,Lunch,4



Missing values per column:


Unnamed: 0,0
total_bill,0
tip,3
sex,0
smoker,0
day,0
time,0
size,0



Rows affected by missing 'tip' values:


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,27.5551,,Male,No,Thur,Dinner,4
7,41.4426,,Female,Yes,Sat,Dinner,1
11,47.0706,,Male,Yes,Sun,Lunch,4



DataFrame with 'tip' imputed by mean:


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_imputed_mean
0,40.9582,6.7948,Male,No,Fri,Dinner,2,6.7948
1,27.5551,,Male,No,Thur,Dinner,4,5.8439
2,44.3439,4.9907,Male,Yes,Sat,Lunch,4,4.9907
3,37.8947,3.0451,Female,No,Fri,Lunch,1,3.0451
4,13.7671,5.9913,Female,Yes,Fri,Dinner,5,5.9913
5,49.0249,1.5744,Female,Yes,Thur,Dinner,1,1.5744
6,40.4456,8.4487,Female,No,Sat,Lunch,5,8.4487
7,41.4426,,Female,Yes,Sat,Dinner,1,5.8439
8,15.1245,7.8228,Male,Yes,Sun,Dinner,4,7.8228
9,28.0154,4.1907,Female,Yes,Fri,Lunch,4,4.1907



DataFrame with rows with NaN in 'tip' dropped:


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_imputed_mean
0,40.9582,6.7948,Male,No,Fri,Dinner,2,6.7948
2,44.3439,4.9907,Male,Yes,Sat,Lunch,4,4.9907
3,37.8947,3.0451,Female,No,Fri,Lunch,1,3.0451
4,13.7671,5.9913,Female,Yes,Fri,Dinner,5,5.9913
5,49.0249,1.5744,Female,Yes,Thur,Dinner,1,1.5744
6,40.4456,8.4487,Female,No,Sat,Lunch,5,8.4487
8,15.1245,7.8228,Male,Yes,Sun,Dinner,4,7.8228
9,28.0154,4.1907,Female,Yes,Fri,Lunch,4,4.1907
10,24.8319,9.7363,Male,No,Thur,Dinner,4,9.7363



Size of imputed dataset: 12
Size of dropped dataset: 9

DataFrame with inconsistent 'day' values (before standardization):


array(['Fri', 'Thur', 'Sat', 'sun'], dtype=object)


DataFrame with standardized 'day' values (after standardization):


array(['Fri', 'Thur', 'Sat', 'Sun'], dtype=object)

5.1 Grouped stats (mean, std) and count per day:
     total_bill          tip_imputed_mean            size         count
           mean      std             mean     std    mean     std      
day                                                                    
Fri     30.1589  12.2441           5.0055  1.7010  3.0000  1.8257     4
Sat     42.0774   2.0252           6.4278  1.8014  3.3333  2.0817     3
Sun     31.0976  22.5893           6.8333  1.3993  4.0000  0.0000     2
Thur    33.8040  13.2518           5.7182  4.0824  3.0000  1.7321     3

5.2 Peak-to-peak range for total_bill and tip_imputed_mean per day:
      total_bill  tip_imputed_mean
day                               
Fri      27.1911            3.7496
Sat       3.8983            3.4580
Sun      31.9461            1.9789
Thur     24.1930            8.1619
DataFrame with 'bill_norm' column:


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_imputed_mean,bill_norm
0,40.9582,6.7948,Male,No,Fri,Dinner,2,6.7948,0.7712
1,27.5551,,Male,No,Thur,Dinner,4,5.8439,0.3911
2,44.3439,4.9907,Male,Yes,Sat,Lunch,4,4.9907,0.8672
3,37.8947,3.0451,Female,No,Fri,Lunch,1,3.0451,0.6843
4,13.7671,5.9913,Female,Yes,Fri,Dinner,5,5.9913,0.0
5,49.0249,1.5744,Female,Yes,Thur,Dinner,1,1.5744,1.0
6,40.4456,8.4487,Female,No,Sat,Lunch,5,8.4487,0.7567
7,41.4426,,Female,Yes,Sat,Dinner,1,5.8439,0.7849
8,15.1245,7.8228,Male,Yes,Sun,Dinner,4,7.8228,0.0385
9,28.0154,4.1907,Female,Yes,Fri,Lunch,4,4.1907,0.4041



DataFrame sorted by 'day' and 'tip_imputed_mean':


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_imputed_mean,bill_norm
0,40.9582,6.7948,Male,No,Fri,Dinner,2,6.7948,0.7712
4,13.7671,5.9913,Female,Yes,Fri,Dinner,5,5.9913,0.0
9,28.0154,4.1907,Female,Yes,Fri,Lunch,4,4.1907,0.4041
3,37.8947,3.0451,Female,No,Fri,Lunch,1,3.0451,0.6843
6,40.4456,8.4487,Female,No,Sat,Lunch,5,8.4487,0.7567
