# More on Missing Data

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(3)

In [None]:
# Load the example wage dataset (fabricated for this demo)
wage_csv_path = 'gender_wage_full_data.csv'
df = pd.read_csv(wage_csv_path)

In [None]:
# Make the missing data: three copies of the complete frame, each with salaries
# removed under a different missingness mechanism.
# NOTE: the three np.random.rand(n) draws happen in the same order as before,
# so the generated masks are unchanged for a given seed.
n = df.shape[0]

# MCAR: every salary has the same 15% chance of being removed
data_random_missing = df.copy()
mcar_mask = np.random.rand(n) < 0.15
data_random_missing.loc[mcar_mask, 'salary'] = np.nan

# MAR: females under 30 skip the salary question with probability 0.5
missing_based_on_gender_age = df.copy()
mar_mask = ((df['gender'] == 'female')
            & (df['age'] < 30)
            & (np.random.rand(n) < 0.5))
missing_based_on_gender_age.loc[mar_mask, 'salary'] = np.nan

# MNAR: salaries below 55000 are removed with probability 0.7
missing_based_on_salary = df.copy()
mnar_mask = (df['salary'] < 55000) & (np.random.rand(n) < 0.7)
missing_based_on_salary.loc[mnar_mask, 'salary'] = np.nan

# Preview the MCAR frame. Only the last expression of a cell is rendered, so the
# preview has to sit here rather than in the middle of the cell.
data_random_missing.head(10)

### Checking for patterns in missingness

In [None]:
# Create a boolean "missing salary" indicator column in each dataset
# (True where the salary is NaN, False where it is observed)
data_random_missing['missing_salary'] = data_random_missing['salary'].isna()
missing_based_on_gender_age['missing_salary'] = missing_based_on_gender_age['salary'].isna()
missing_based_on_salary['missing_salary'] = missing_based_on_salary['salary'].isna()

In [None]:
# check relationship between missingness and gender, MCAR


In [None]:
# check relationship between missingness and age, MCAR


In [None]:
# check relationship between missingness and gender, MAR


In [None]:
# check relationship between missingness and age, MAR


In [None]:
# check relationship between missingness and gender, MNAR


In [None]:
# check relationship between missingness and age, MNAR


Use `missingno` for specialized visualizations.

In [None]:
# Can visualize the proportion of non-missing values in each column


In [None]:
# Another way to make the null map with the msno module 


In [None]:
# Plot the matrix again sorted by age


This makes it clear that only younger individuals skipped the salary question.

This reveals that missingness is correlated with gender too.

### Imputing the data

In [None]:
# Fill MCAR with the mean
# A dict passed to fillna restricts the fill to the named column, so only
# missing salaries are replaced (with the mean of the observed salaries).
MCAR_filled = data_random_missing.fillna(
    {'salary': data_random_missing['salary'].mean()})

In [None]:
# Compare qualitative differences between the complete data and the filled data
# Left panel: complete data. Right panel: MCAR data after mean imputation.
fig, ax = plt.subplots(1,2, figsize = (10,4))
sns.scatterplot(data = df, x = 'age', y = 'salary', hue = 'gender', ax=ax[0])
ax[0].set_title('Complete data')
sns.scatterplot(data = MCAR_filled, x = 'age', y = 'salary', hue = 'gender', ax=ax[1])
ax[1].set_title('MCAR, filled with the mean');


In [None]:
# Fill MAR with the mean
# Pass a dict so that only the salary column is filled; a bare scalar would
# write the salary mean into a NaN in any column of the frame.
MAR_filled = missing_based_on_gender_age.fillna(
    {'salary': missing_based_on_gender_age['salary'].mean()})

In [None]:
# Compare qualitative differences between the complete data and the filled data
# Left panel: complete data. Right panel: MAR data after mean imputation.
fig, ax = plt.subplots(1,2, figsize = (10,4))
sns.scatterplot(data = df, x = 'age', y = 'salary', hue = 'gender', ax=ax[0])
ax[0].set_title('Complete data')
sns.scatterplot(data = MAR_filled, x = 'age', y = 'salary', hue = 'gender', ax=ax[1])
ax[1].set_title('MAR, filled with the mean');

In [None]:
# Fill MNAR with the mean
# Pass a dict so that only the salary column is filled; a bare scalar would
# write the salary mean into a NaN in any column of the frame.
MNAR_filled = missing_based_on_salary.fillna(
    {'salary': missing_based_on_salary['salary'].mean()})

In [None]:
# Compare qualitative differences between the complete data and the filled data
# Left panel: complete data. Right panel: MNAR data after mean imputation.
fig, ax = plt.subplots(1,2, figsize = (10,4))
sns.scatterplot(data = df, x = 'age', y = 'salary', hue = 'gender', ax=ax[0])
ax[0].set_title('Complete data')
sns.scatterplot(data = MNAR_filled, x = 'age', y = 'salary', hue = 'gender', ax=ax[1])
ax[1].set_title('MNAR, filled with the mean');

### Imputing time series data
Imputing with the mean, median, or mode is not always the best option, especially for sequential data. 

In [None]:
# Load the air-quality dataset used for the time-series imputation examples.
# NOTE(review): the filename is an assumption (the notebook does not state it) -
# confirm the path of the CSV that ships with this lesson.
airquality = pd.read_csv('airquality.csv')
airquality

In [None]:
# Build a proper datetime column to use as the x-axis of the time-series plots.
# NOTE(review): assumes the file has integer 'Month' and 'Day' columns and that the
# readings are from 1973, as in R's classic `airquality` dataset - TODO confirm.
airquality['Date'] = pd.to_datetime(pd.DataFrame({
    'year': 1973,
    'month': airquality['Month'],
    'day': airquality['Day'],
}))
airquality

In [None]:
# Look at missing Ozone data
# NaN readings are not drawn, so missing stretches show up as gaps in the line
plt.figure(figsize=(10,4))
plt.plot(airquality["Date"], airquality["Ozone"], marker='o');
plt.ylabel('Ozone')
plt.show()

The time series varies a lot. If we fill everything with the mean or median, we might get strange results. 

In [None]:
# Fill nans with the mean datapoint
# Only Ozone is filled (with the mean of its observed values); the dict form of
# fillna leaves every other column untouched.
mean_fill = airquality.fillna({'Ozone': airquality['Ozone'].mean()})

# Filled series in red, original series drawn over it
plt.figure(figsize=(10,4))
plt.plot(mean_fill["Date"], mean_fill["Ozone"], marker='o', color = 'r');
plt.plot(airquality["Date"], airquality["Ozone"], marker='o');
plt.ylabel('Ozone')
plt.show()

In [None]:
# Fill nans with the previous datapoint
# ffill propagates the last observed value forward; NaNs before the first
# observation in a column have nothing to copy from and stay NaN.
forward_fill = airquality.ffill()

In [None]:
# Compare the forward-filled Ozone series (red) with the original series
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(forward_fill["Date"], forward_fill["Ozone"], marker='o', color='r')
ax.plot(airquality["Date"], airquality["Ozone"], marker='o')
ax.set_ylabel('Ozone')
plt.show()

In [None]:
# Fill nans with the next datapoint
# bfill propagates the next observed value backward; NaNs after the last
# observation in a column have nothing to copy from and stay NaN.
back_fill = airquality.bfill()

In [None]:
# Compare the back-filled Ozone series (red) with the original series
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(back_fill["Date"], back_fill["Ozone"], marker='o', color='r')
ax.plot(airquality["Date"], airquality["Ozone"], marker='o')
ax.set_ylabel('Ozone')
plt.show()

In [None]:
# Linear interpolation
# Work on a copy so the original frame keeps its NaNs for later comparisons
interpolated_oz = airquality.copy()
# Each gap is filled along a straight line between the observations on either side
interpolated_oz['Ozone'] = airquality['Ozone'].interpolate(method='linear')

In [None]:
# Compare the interpolated Ozone series (red) with the original series
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(interpolated_oz["Date"], interpolated_oz["Ozone"], marker='o', color='r')
ax.plot(airquality["Date"], airquality["Ozone"], marker='o')
ax.set_xlabel('Day')
ax.set_ylabel('Ozone')
plt.show()

This looks like a reasonable guess for how to fill in the datapoints.

*NOTE:* There are other methods that can be used to interpolate.

In [None]:
# Quadratic interpolation
# Interpolate from the original series (which still has its NaNs), not from the
# already-filled interpolated_oz column. method='quadratic' requires SciPy.
interpolated_oz['Ozone'] = airquality['Ozone'].interpolate(method='quadratic')

In [None]:
# Compare the interpolated Ozone series (red) with the original series.
# Both are plotted against the frame's index rather than the Date column.
fig, ax = plt.subplots(figsize=(10, 4))
interpolated_oz['Ozone'].plot(ax=ax, color='r', marker='o')
airquality['Ozone'].plot(ax=ax, marker='o')
ax.set_xlabel('Day')
ax.set_ylabel('Ozone')
plt.show()

This method substantially overshoots the range of the observed data, so it is not a good option here.

One other option is to fill null values with a random sample. This could work when the data is MCAR or MAR and could also work for categorical data.

In [None]:
# Impute missing Ozone readings by sampling, with replacement, from the observed ones
ozone_is_missing = airquality['Ozone'].isna()

# Observed (non-NaN) Ozone values that form the sampling pool
non_nan_values = airquality.loc[~ozone_is_missing, 'Ozone']

# Number of Ozone readings that need a value
nan_count = ozone_is_missing.sum()

# One random draw from the pool per missing reading
random_samples = np.random.choice(non_nan_values, size=nan_count, replace=True)

# Write the draws into a copy so that `airquality` keeps its NaNs
random_fill = airquality.copy()
random_fill.loc[ozone_is_missing, 'Ozone'] = random_samples

# Randomly filled series in red, original series drawn over it
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(random_fill["Date"], random_fill["Ozone"], marker='o', color='r')
ax.plot(airquality["Date"], airquality["Ozone"], marker='o')
ax.set_xlabel('Day')
ax.set_ylabel('Ozone')
plt.show()

## Activity
**Activity 1:** Look at the "Hours.Of.Daylight" column in the airquality dataset. Fill in the missing values with the most appropriate method.

**Activity 2:** Run the following cells to import and clean the Lake Mendocino Data from last time. Use an appropriate imputation method to fill in the null values in this dataset. Plot your results to evaluate if your imputation was reasonable.

In [None]:
# Load the Lake Mendocino dataset and display it
lake_csv_path = 'coy_wy2024_csvdata.csv'
lake = pd.read_csv(lake_csv_path)
lake

In [None]:
# Clean the Lake Mendocino frame in place.
# NOTE: this cell mutates `lake` and drops the columns it reads from, so running it
# a second time without re-loading the CSV raises a KeyError.

# Replace every cell equal to '-' with NaN (presumably the file's placeholder for
# a missing reading - confirm against the data source)
lake[lake=='-'] = np.nan
# Columns whose name contains 'notes'
notes_columns = [col for col in lake.columns if 'notes' in col]
# In those columns, treat a value of 0 as missing
lake[notes_columns] = lake[notes_columns].replace(0, np.nan)
# Keep only the columns that have at least 230 non-missing values
lake.dropna(axis = 1, thresh = 230,inplace=True)
# Add float-typed copies of four columns under shorter names
lake = lake.assign(cons_high = lake['Top of Conservation High (ac-ft)'].astype(float),
              cons = lake['Top of Conservation (ac-ft)'].astype(float),
              gross_pool = lake['Gross Pool'].astype(float),
              gross_pool_elev = lake['Gross Pool(elev)'].astype(float))
# Parse the first 10 characters of the timestamp string into a datetime column
lake['date'] = pd.to_datetime(lake['ISO 8601 Date Time'].str[:10])
# Drop the original columns now that the float copies exist
lake.drop(columns = ['Top of Conservation High (ac-ft)', 'Top of Conservation (ac-ft)','Gross Pool','Gross Pool(elev)'],inplace=True)
lake

In [None]:
# Fill nans


In [None]:
# Check if it worked 


In [None]:
# Plot 
