### 1. Import Required Libraries
We start by importing pandas, which is used for data manipulation and analysis.

In [None]:
import pandas as pd

### 2. Load the Data
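Read the CSV file into a pandas DataFrame. We pass `low_memory=False` so pandas parses the file in a single pass instead of in chunks, which prevents columns from ending up with mixed inferred types (and the `DtypeWarning` that comes with them).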

In [None]:
# Parse the CSV in a single pass (low_memory=False) rather than in chunks, so
# pandas infers one dtype per column from the complete file.
house_data = pd.read_csv("dataset/melb_data.csv", low_memory=False)

# Report the size of the raw table, then preview its first rows.
print("Step 1: Loaded data", f"Shape: {house_data.shape}", sep="\n")
print(house_data.head(), end=" \n\n")

### 3. Check and Remove Duplicate Rows
Find out how many duplicate rows exist and remove them.

In [None]:
# duplicated() flags each row that exactly repeats an earlier row; summing the
# boolean flags gives the number of rows that are about to be dropped.
num_duplicates = house_data.duplicated().sum()
print(f"Step 2: Number of duplicate rows: {num_duplicates}")

# Keep the first occurrence of every row and discard the later repeats.
house_data = house_data.drop_duplicates(keep="first")

# Show the table size and a preview after de-duplication.
print("Step 2: After removing duplicates", f"Shape: {house_data.shape}", sep="\n")
print(house_data.head(), end=" \n\n")

### 4. Check and Remove Non-Positive Prices
If the 'Price' column exists, count and remove rows with negative or zero prices.

In [None]:
# Only validate prices when the dataset actually has a 'Price' column.
if 'Price' in house_data.columns:
    # A valid price is strictly positive, so zero and negative values are
    # flagged. Missing prices compare False against `<= 0`, which means they
    # are neither counted nor dropped here; the missing-data step handles them.
    non_positive_price = house_data['Price'] <= 0
    num_negative_prices = non_positive_price.sum()
    print(f"Step 3: Number of rows with zero or negative prices: {num_negative_prices}")

    # Drop exactly the rows that were counted above. Filtering with
    # `Price > 0` instead would also silently discard rows whose price is
    # missing, making the reported count disagree with the rows removed.
    house_data = house_data[~non_positive_price]

    print("Step 3: After removing zero or negative prices")
    print("Shape:", house_data.shape)
    print(house_data.head(), "\n")

### 5. Check and Handle Missing Data
Count the missing values in each column, then fill them with the column's median (numeric columns) or mode (categorical columns).

In [None]:
# Tally missing values per column and report only the columns that have any.
missing_counts = house_data.isnull().sum()
print("Step 4: Number of missing values in each column before filling:")
print(missing_counts[missing_counts > 0])

# Impute only the columns that actually contain missing values; filling a
# complete column is a no-op, so skipping it changes nothing in the result.
for column in missing_counts[missing_counts > 0].index:
    if pd.api.types.is_numeric_dtype(house_data[column]):
        # Numeric column: fill with the median of the observed values.
        median_value = house_data[column].median()
        house_data[column] = house_data[column].fillna(median_value)
    else:
        # Any non-numeric column (object, string, category, ...) is filled
        # with its most frequent value. Checking for "numeric" rather than
        # for the object dtype keeps non-object text/categorical columns away
        # from median(), which is not defined for them.
        modes = house_data[column].mode()
        if modes.empty:
            # Every value is missing, so there is no mode to fill with;
            # leave the column untouched instead of raising on modes[0].
            continue
        mode_value = modes.iloc[0]
        house_data[column] = house_data[column].fillna(mode_value)

print("Step 4: After handling missing data")
print("Shape:", house_data.shape)
print(house_data.head(), "\n")

### 6. Check and Handle Categorical Data
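Find the categorical (object-dtype) columns and one-hot encode them, dropping the first level of each column (`drop_first=True`) so the resulting indicator columns are not redundant.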

In [None]:
# Columns stored with the generic object dtype are the ones treated as
# categorical in this step.
categorical_columns = house_data.select_dtypes(include=['object']).columns
categorical_count = len(categorical_columns)
print(f"Step 5: Number of categorical columns: {categorical_count}")
print(f"Categorical columns: {list(categorical_columns)}")

# Replace each categorical column with indicator columns; drop_first=True
# omits the indicator for the first level of every encoded column.
house_data = pd.get_dummies(
    house_data,
    columns=categorical_columns,
    drop_first=True,
)

# Show the table size and a preview after encoding.
print("Step 5: After encoding categorical data", f"Shape: {house_data.shape}", sep="\n")
print(house_data.head(), end=" \n\n")