In [12]:
# Question: Introduction to Missing Data in a DataFrame
# Description: Load a simple CSV file into a DataFrame and identify missing values.

# Steps to follow:
# 1. Load the data: Use the pandas library to read a CSV file.
# 2. Check for missing values: Use the isnull() method to find missing values.
# 3. Summarize missing data: Use the sum() function to count the number of missing values in each column.

import io

import pandas as pd

# Step 1: Load the data with read_csv.
# The CSV content is kept inline and wrapped in StringIO so the cell runs on a
# fresh kernel without depending on an external file. Empty fields (e.g. the
# missing Name on row 3) are parsed by read_csv as NaN.
csv_text = """ID,Name,Age,Salary
1,Alice,25,50000
2,Bob,,60000
3,,22,
4,David,30,70000
5,Eve,,80000
"""

df = pd.read_csv(io.StringIO(csv_text))

# Step 2: Check for missing values
# isnull() returns a boolean frame of the same shape: True where a cell is missing.
missing_values = df.isnull()

# Step 3: Summarize missing data
# Summing the boolean frame column-wise counts the True cells per column.
missing_summary = missing_values.sum()

# Display the summary of missing values
print("Summary of missing data in each column:")
print(missing_summary)


Summary of missing data in each column:
ID        0
Name      1
Age       2
Salary    1
dtype: int64


In [13]:
# Question: Dropping Rows with Missing Values
# Description: Listwise deletion - discard every row that holds at least one missing value.
#
# Steps to follow:
# 1. Call dropna() on the DataFrame to remove the incomplete rows.
import pandas as pd

# Sample records; None marks a missing entry
data = dict(
    ID=[1, 2, 3, 4, 5],
    Name=['Alice', 'Bob', None, 'David', 'Eve'],
    Age=[25, None, 22, 30, None],
    Salary=[50000, 60000, None, 70000, 80000],
)
df = pd.DataFrame(data)

# Step 1: keep only the complete rows.
# axis=0 and how='any' are dropna's defaults, spelled out here for readability.
df_cleaned = df.dropna(axis=0, how='any')

# Show the surviving rows (they keep their original index labels)
print("Dataset after dropping rows with missing values:", df_cleaned, sep="\n")


Dataset after dropping rows with missing values:
   ID   Name   Age   Salary
0   1  Alice  25.0  50000.0
3   4  David  30.0  70000.0


In [14]:
# Question: Dropping Columns with Missing Values
# Description: Remove whole columns as soon as they contain a missing value.
#
# Steps to follow:
# 1. Call dropna() along the column axis (axis=1) so incomplete columns are discarded.
import pandas as pd

# Sample records; None marks a missing entry
data = dict(
    ID=[1, 2, 3, 4, 5],
    Name=['Alice', 'Bob', None, 'David', 'Eve'],
    Age=[25, None, 22, 30, None],
    Salary=[50000, 60000, None, 70000, 80000],
)
df = pd.DataFrame(data)

# Step 1: drop every column with at least one missing value.
# axis='columns' is the named spelling of axis=1.
df_cleaned = df.dropna(axis='columns')

# Only 'ID' is fully populated, so it is the single column left
print("Dataset after dropping columns with missing values:", df_cleaned, sep="\n")



Dataset after dropping columns with missing values:
   ID
0   1
1   2
2   3
3   4
4   5


In [15]:
# Question: Mean Imputation for Numerical Data
# Description: Replace the gaps in a numeric column with that column's mean.
#
# Steps to follow:
# 1. Compute the column mean with mean(), then pass it to fillna().

import pandas as pd

# Numeric sample data; None marks a missing entry
data = dict(
    ID=[1, 2, 3, 4, 5],
    Age=[25, None, 22, 30, None],
    Salary=[50000, 60000, None, 70000, 80000],
)
df = pd.DataFrame(data)

# Step 1: mean of the observed ages (mean() skips NaN by default)
mean_age = df['Age'].mean()

# Step 2: substitute the mean for each missing age.
# Only 'Age' is imputed; the gap in 'Salary' is left as NaN.
df = df.assign(Age=df['Age'].fillna(mean_age))

# Show the frame after imputation
print("Dataset after mean imputation:", df, sep="\n")


Dataset after mean imputation:
   ID        Age   Salary
0   1  25.000000  50000.0
1   2  25.666667  60000.0
2   3  22.000000      NaN
3   4  30.000000  70000.0
4   5  25.666667  80000.0


In [16]:
# Question: Mode Imputation for Categorical Data
# Description: Replace the gaps in a categorical column with its most frequent value.
#
# Steps to follow:
# 1. Find the most frequent value with mode(), then pass it to fillna().

import pandas as pd

# Categorical sample data; None marks a missing entry
data = dict(
    ID=[1, 2, 3, 4, 5],
    Gender=['Male', None, 'Female', 'Female', None],
    City=['New York', 'Los Angeles', 'New York', 'Chicago', 'New York'],
)
df = pd.DataFrame(data)

# Step 1: most frequent gender.
# mode() yields a Series (there can be several modes); take its first entry.
mode_gender = df['Gender'].mode().iloc[0]

# Step 2: substitute the mode for each missing gender
df = df.assign(Gender=df['Gender'].fillna(mode_gender))

# Show the frame after imputation
print("Dataset after mode imputation:", df, sep="\n")


Dataset after mode imputation:
   ID  Gender         City
0   1    Male     New York
1   2  Female  Los Angeles
2   3  Female     New York
3   4  Female      Chicago
4   5  Female     New York


In [17]:
# Question: Median Imputation for Skewed Data
# Description: For a skewed numeric column, fill the gaps with the median rather than the mean.
#
# Steps to follow:
# 1. Compute the column median with median(), then pass it to fillna().

import pandas as pd

# Skewed sample data: one salary (120000) sits far above the others; None marks a gap
data = dict(
    ID=[1, 2, 3, 4, 5],
    Salary=[50000, 60000, None, 120000, None],
)
df = pd.DataFrame(data)

# Step 1: median of the observed salaries (NaN entries are skipped)
median_salary = df['Salary'].median()

# Step 2: substitute the median for each missing salary
df = df.assign(Salary=df['Salary'].fillna(median_salary))

# Show the frame after imputation
print("Dataset after median imputation:", df, sep="\n")


Dataset after median imputation:
   ID    Salary
0   1   50000.0
1   2   60000.0
2   3   60000.0
3   4  120000.0
4   5   60000.0


In [18]:
# Question: KNN Imputation
# Description: Use K-Nearest Neighbors to impute missing values in a dataset.

# Steps to follow:
# 1. Install and import required libraries: Use pip install scikit-learn if not already installed
#    (the PyPI distribution is named "scikit-learn"; "sklearn" is only the import name).
# 2. KNN Imputer: Use KNNImputer to fill in missing values.

import pandas as pd
from sklearn.impute import KNNImputer

# Create a sample DataFrame with missing values (None marks a missing entry)
data = {
    'Age': [25, 30, 35, None, 45],
    'Income': [50000, 60000, None, 80000, 120000],
    'Experience': [2, 5, 10, 3, None]
}

df = pd.DataFrame(data)

# Initialize KNNImputer; each gap is filled from the 2 nearest rows.
# NOTE: the imputer's default distance is a NaN-aware Euclidean metric on the raw
# values, so the large-magnitude 'Income' column dominates the neighbour search.
# For real data, consider scaling the features before imputing.
knn_imputer = KNNImputer(n_neighbors=2)

# Apply KNNImputer to the dataset.
# fit_transform returns a bare NumPy array, so re-attach both the column labels
# and the original row index; without index= a non-default index would be lost.
df_imputed = pd.DataFrame(
    knn_imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

# Display the dataset after KNN imputation
print("Dataset after KNN Imputation:")
print(df_imputed)


Dataset after KNN Imputation:
    Age    Income  Experience
0  25.0   50000.0         2.0
1  30.0   60000.0         5.0
2  35.0   70000.0        10.0
3  32.5   80000.0         3.0
4  45.0  120000.0         6.5


In [19]:
# Question: Detecting and Handling Missing Categorical Data
# Description: Locate the gaps in a categorical column and fill them with a frequent category.
#
# Steps to follow:
# 1. Flag the missing entries of the categorical column with isnull().
# 2. Pick the replacement category with mode() and fill the gaps with it.

import pandas as pd

# Single categorical column; None marks a missing entry
data = dict(Category=['A', 'B', None, 'A', 'C', 'B', None, 'A', 'C', 'B'])
df = pd.DataFrame(data)

# Show the data before any imputation
print("Original Dataset with Missing Categorical Data:", df, sep="\n")

# Task 1: boolean mask that is True wherever 'Category' is missing
missing_values = df['Category'].isnull()
print("\nMissing Values in 'Category' column:", missing_values, sep="\n")

# Task 2: fill the gaps with the most frequent category.
# mode() returns a Series; here 'A' and 'B' both occur three times, and the
# first entry of that Series ('A') is the one used.
mode_value = df['Category'].mode().iloc[0]
df = df.assign(Category=df['Category'].fillna(mode_value))

# Show the data after imputation
print("\nDataset after Imputing Missing Categorical Data with Mode:", df, sep="\n")


Original Dataset with Missing Categorical Data:
  Category
0        A
1        B
2     None
3        A
4        C
5        B
6     None
7        A
8        C
9        B

Missing Values in 'Category' column:
0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
8    False
9    False
Name: Category, dtype: bool

Dataset after Imputing Missing Categorical Data with Mode:
  Category
0        A
1        B
2        A
3        A
4        C
5        B
6        A
7        A
8        C
9        B


In [20]:
# Question: Predictive Modeling for Imputation
# Description: Impute the gaps in one feature by predicting it from the other features.
#
# Steps to follow:
# 1. Partition the data: rows where the target is present (train) vs. rows where it is missing (test).
# 2. Train a model: fit a regression on the rows with a known target.
# 3. Impute: write the model's predictions into the missing cells.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Sample data; 'Feature1' has one missing entry (None)
data = dict(
    Feature1=[1, 2, 3, 4, 5, None, 7, 8],
    Feature2=[2, 3, 4, 5, 6, 7, 8, 9],
    Feature3=[1, 2, 3, 4, 5, 6, 7, 8],
)
df = pd.DataFrame(data)

# Show the data before imputation
print("Original Dataset with Missing Values:", df, sep="\n")

# Step 1: split on whether the target 'Feature1' is missing.
# The mask is computed once and reused for both the split and the final write-back.
feature_cols = ['Feature2', 'Feature3']
missing_mask = df['Feature1'].isnull()
train_data = df[~missing_mask]  # target known
test_data = df[missing_mask]    # target to be predicted

# Predictors and target for fitting
X_train = train_data[feature_cols]
y_train = train_data['Feature1']

# Predictors of the rows whose target must be estimated
X_test = test_data[feature_cols]

# Step 2: fit a linear regression on the complete rows
model = LinearRegression()
model.fit(X_train, y_train)

# Step 3: predict the missing targets and write them into the original frame
predicted_values = model.predict(X_test)
df.loc[missing_mask, 'Feature1'] = predicted_values

# Show the data after imputation
print("\nDataset after Imputing Missing Values using Predictive Modeling:", df, sep="\n")



Original Dataset with Missing Values:
   Feature1  Feature2  Feature3
0       1.0         2         1
1       2.0         3         2
2       3.0         4         3
3       4.0         5         4
4       5.0         6         5
5       NaN         7         6
6       7.0         8         7
7       8.0         9         8

Dataset after Imputing Missing Values using Predictive Modeling:
   Feature1  Feature2  Feature3
0       1.0         2         1
1       2.0         3         2
2       3.0         4         3
3       4.0         5         4
4       5.0         6         5
5       6.0         7         6
6       7.0         8         7
7       8.0         9         8


In [21]:
# Question: Handling Time Series Data with Forward and Backward Fill
# Description: Impute missing values in a time series dataset using forward and backward fill methods.

# Steps to follow:
# 1. Sort the data: Ensure the dataset is sorted by dates.
# 2. Fill the gaps: Apply ffill() and bfill() for forward and backward fill.
#    (fillna(method=...) is deprecated in recent pandas releases and emits a
#    FutureWarning; the dedicated ffill()/bfill() methods give the same result.)

import pandas as pd

# Simulated time series dataset with missing values (None marks a missing entry)
data = {
    'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'],
    'Value': [10, None, 20, None, 30]
}

# Create DataFrame
df = pd.DataFrame(data)

# Convert 'Date' to datetime format so that sorting is chronological
df['Date'] = pd.to_datetime(df['Date'])

# Step 1: Sort the data by 'Date' (although it's already sorted in this example).
# Forward/backward fill propagate along row order, so the rows must be in time order.
df = df.sort_values('Date')

# Display the original dataset with missing values
print("Original Time Series Dataset with Missing Values:")
print(df)

# Step 2: Forward fill - each gap takes the last observed value before it.
# assign() returns a new frame, so the original df keeps its NaNs.
df_ffill = df.assign(Value=df['Value'].ffill())

# Step 3: Backward fill - each gap takes the next observed value after it.
df_bfill = df.assign(Value=df['Value'].bfill())

# Display the filled datasets
print("\nTime Series Dataset after Forward Fill (ffill):")
print(df_ffill)

print("\nTime Series Dataset after Backward Fill (bfill):")
print(df_bfill)


Original Time Series Dataset with Missing Values:
        Date  Value
0 2023-01-01   10.0
1 2023-01-02    NaN
2 2023-01-03   20.0
3 2023-01-04    NaN
4 2023-01-05   30.0

Time Series Dataset after Forward Fill (ffill):
        Date  Value
0 2023-01-01   10.0
1 2023-01-02   10.0
2 2023-01-03   20.0
3 2023-01-04   20.0
4 2023-01-05   30.0

Time Series Dataset after Backward Fill (bfill):
        Date  Value
0 2023-01-01   10.0
1 2023-01-02   20.0
2 2023-01-03   20.0
3 2023-01-04   30.0
4 2023-01-05   30.0


  df_ffill['Value'] = df_ffill['Value'].fillna(method='ffill')
  df_bfill['Value'] = df_bfill['Value'].fillna(method='bfill')
