# Data Cleaning and Preprocessing - Solutions

Handling missing values, type conversions, renaming, replacing values, and removing duplicates.

## Question 1
Check for missing values in the DataFrame using isna() and count the total number of missing values.

In [None]:
import pandas as pd
import numpy as np
# Small frame with exactly one NaN in each of its three columns.
raw_columns = {
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, 7, 8],
    'C': [9, 10, 11, np.nan],
}
df = pd.DataFrame(raw_columns)

# isna() yields a boolean mask; summing it counts the True cells per column.
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column)
# Summing the per-column counts once more gives the grand total.
print(f"\nTotal missing values: {df.isna().sum().sum()}")

## Question 2
Fill missing values in column 'A' with the mean of that column using fillna().

In [None]:
# Compute the mean of 'A' first, then use it to fill that column's gaps.
mean_of_a = df['A'].mean()
df['A'] = df['A'].fillna(mean_of_a)
print(df)

## Question 3
Drop all rows that contain any missing values using dropna().

In [None]:
# Keep only the rows that are complete; axis=0 / how='any' spell out the
# defaults dropna() already uses (drop a row if any cell in it is missing).
df_clean = df.dropna(axis=0, how='any')
print(df_clean)

## Question 4
Convert the data type of column 'B' from float to integer using astype().

In [None]:
# Fresh frame whose 'B' column holds whole-number floats, so the cast is lossless.
df_clean = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5.0, 6.0, 7.0, 8.0]})
# Cast just 'B' by passing a column -> dtype mapping to astype().
df_clean = df_clean.astype({'B': int})
print(df_clean.dtypes)
print(df_clean)

## Question 5
Rename columns 'A' and 'B' to 'First' and 'Second' respectively using rename().

In [None]:
# Old label -> new label; rename() returns a new frame and leaves df_clean as is.
new_labels = {'A': 'First', 'B': 'Second'}
df_renamed = df_clean.rename(columns=new_labels)
print(df_renamed)

## Question 6
Replace all occurrences of the value 5 with 50 in the entire DataFrame using replace().

In [None]:
# Every column contains the value 5 exactly once.
df_values = pd.DataFrame({'X': [1, 5, 3], 'Y': [5, 2, 7], 'Z': [8, 5, 9]})
# Frame-wide substitution; replace() returns a copy, df_values is untouched.
df_replaced = df_values.replace(to_replace=5, value=50)
print(df_replaced)

## Question 7
Create a DataFrame with duplicate rows and remove them using drop_duplicates().

In [None]:
# Rows at index 1 and 2 are identical (A=2, B=5).
df_duplicates = pd.DataFrame({'A': [1, 2, 2, 3], 'B': [4, 5, 5, 6]})
print("Original DataFrame:")
print(df_duplicates)

# keep='first' is the default: the first occurrence survives, repeats are dropped.
df_no_duplicates = df_duplicates.drop_duplicates(keep='first')
print("\nAfter removing duplicates:")
print(df_no_duplicates)

## Question 8
Fill missing values by forward fill (each gap takes the last valid value above it) using ffill(); the older fillna(method='ffill') spelling is deprecated in recent pandas.

In [None]:
# Single column with two gaps, each preceded by a valid value.
df_ffill = pd.DataFrame({'A': [1, np.nan, 3, np.nan, 5]})
print("Original:")
print(df_ffill)
# Forward fill: every NaN takes the most recent valid value above it.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` keyword of fillna is deprecated since pandas 2.1 (it emits a
# FutureWarning there and is no longer accepted by newer releases).
df_filled = df_ffill.ffill()
print("\nAfter forward fill:")
print(df_filled)

## Question 9
Check data types of all columns using dtypes and convert a string column to numeric using pd.to_numeric().

In [None]:
# Column 'A' holds digits stored as strings; 'B' is already integer.
df_types = pd.DataFrame({'A': ['1', '2', '3'], 'B': [4, 5, 6]})
print("Original dtypes:")
print(df_types.dtypes)

# Parse the strings into numbers and put the result back in place of 'A'.
numeric_a = pd.to_numeric(df_types['A'])
df_types = df_types.assign(A=numeric_a)
print("\nAfter conversion:")
print(df_types.dtypes)
print(df_types)

## Question 10
Replace missing values in different columns with different strategies: column 'A' with 0, column 'B' with the median.

In [None]:
# 'A' has two gaps, 'B' has one.
df_multi_fill = pd.DataFrame({'A': [1, np.nan, 3, np.nan], 'B': [10, 20, np.nan, 40]})
print("Original:")
print(df_multi_fill)

# One fill value per column: a constant 0 for 'A', the column median for 'B'.
fill_values = {'A': 0, 'B': df_multi_fill['B'].median()}
df_multi_fill = df_multi_fill.fillna(value=fill_values)
print("\nAfter filling:")
print(df_multi_fill)