In [6]:
# Question 1: Handling Missing Values with Conditional Filling
# Description: Fill missing values in a specific column based on a condition from another column.

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'Department': ['HR', 'IT', 'HR', 'Finance', 'IT'],
    'Salary': [50000, None, 52000, 58000, None]
})

# Fill missing Salary with the mean salary of the row's Department.
# A department whose salaries are all missing (IT in this sample) has a NaN
# mean, so a per-department fill alone would leave those rows empty. For such
# rows fall back to the overall mean of the salaries that were observed.
overall_mean = df['Salary'].mean()
department_mean = df.groupby('Department')['Salary'].transform('mean')
df['Salary'] = df['Salary'].fillna(department_mean).fillna(overall_mean)


In [2]:
# Question 2: Removing Outliers by Rescaling
# Description: Remove outliers by standardizing a numerical column using z-scores.

from scipy import stats

# Example numeric column
df = pd.DataFrame({'Score': [100, 110, 105, 120, 300, 115, 108]})

# Filter outliers with a robust (median / MAD based) z-score.
# A classic mean/std z-score cannot flag anything in a sample this small:
# with n values the largest possible |z| is (n - 1) / sqrt(n), about 2.27 for
# n = 7, so a `|z| < 3` filter keeps every row, including 300. The median and
# the median absolute deviation are not dragged along by the outlier itself.
ROBUST_Z_CUTOFF = 3.5  # commonly recommended cutoff for modified z-scores
score_median = df['Score'].median()
# scale='normal' rescales the MAD so it is comparable to a standard deviation.
# NOTE: the MAD is 0 when more than half of the values are identical; this
# sample does not hit that case.
score_mad = stats.median_abs_deviation(df['Score'], scale='normal')
robust_z = (df['Score'] - score_median) / score_mad
df = df[robust_z.abs() < ROBUST_Z_CUTOFF]


In [3]:
# Question 3: Applying Data Type Conversion
# Description: Convert the 'Age' column to integers after filling missing values.

# Build the sample frame, then in one chained step replace the missing ages
# with the median of the observed ages and cast the column to integers.
# The cast has to come after the fill: a column holding NaN cannot be int.
df = pd.DataFrame({'Age': [25, None, 30, None, 45]}).assign(
    Age=lambda frame: frame['Age'].fillna(frame['Age'].median()).astype(int)
)


In [4]:
# Question 4: Automating Data Cleaning with Functions
# Description: Create a function that automates the process of filling missing values, removing duplicates, and standardizing column names.

def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. Drop fully duplicated rows (the first occurrence is kept).
      2. Standardize column names: lower-case, spaces replaced by underscores.
      3. Forward-fill missing values down each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated, renamed, forward-filled frame. Missing values at
        the top of a column have no earlier value to copy and stay missing.
    """
    cleaned = df.drop_duplicates()
    # str() lets non-string labels (e.g. integer column names) through instead
    # of raising AttributeError on .lower().
    cleaned.columns = [str(col).lower().replace(' ', '_') for col in cleaned.columns]
    # DataFrame.ffill() replaces fillna(method='ffill'), which has been
    # deprecated since pandas 2.1.
    return cleaned.ffill()

# Example usage:
# df = clean_data(df)


In [5]:
# Question 5: Complex Data Normalization
# Description: Normalize a numeric column to a range using min-max scaling.

# Build the sample frame and append the min-max scaled column in one chain:
# (value - min) / (max - min), which maps the smallest mark to 0 and the
# largest to 1.
df = pd.DataFrame({'Marks': [50, 60, 70, 80, 90]}).assign(
    Marks_normalized=lambda frame: frame['Marks']
    .sub(frame['Marks'].min())
    .div(frame['Marks'].max() - frame['Marks'].min())
)
