## 1: imports & MLflow setup 

In [None]:
import pandas as pd, mlflow, matplotlib.pyplot as plt, seaborn as sns, os, json, pathlib, numpy as np

# Point MLflow at the local tracking server (assumes the launcher is already
# running) and select the experiment used by this notebook.
TRACKING_URI = "http://localhost:5000"
EXPERIMENT_NAME = "Baghdad‑Housing‑EDA"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

## 2: load data

In [None]:
# Read the raw listings CSV (path is relative to the notebook's working
# directory) and preview the first rows.
DATA_PATH = "../data/raw/baghdad_mansour_houses.csv"
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head()

### Quick summary

In [None]:
# Size and completeness overview of the raw frame.
row_count, col_count = df.shape
# Mean of the per-column NaN fractions, expressed as a percentage.
missing_pct = df.isna().mean().mean() * 100

# Descriptive statistics for every column (numeric and non-numeric),
# transposed so that each original column becomes one row.
summary = df.describe(include="all").transpose()

display(summary)
# One f-string only: mixing f-string interpolation with %-formatting breaks as
# soon as an interpolated value contains a literal "%".
print(f"Rows: {row_count} | Cols: {col_count} | Missing {missing_pct:.2f}%")

## 3: Plotting to get more insights
### property types, category, condition, furnished bar plot

In [None]:
# 2x2 grid of bar charts showing how often each label occurs in four
# categorical columns.
#
# The seaborn theme is applied BEFORE the figure is created: style settings
# only affect axes created after the call, so setting it afterwards left this
# figure unstyled.
sns.set(style="whitegrid")
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (column, panel title, palette) for each panel, in row-major order.
bar_panels = [
    ("type", "Property Type", "Set2"),
    ("category", "Listing Category", "Set3"),
    ("condition", "Condition of Property", "Pastel1"),
    ("furnished", "Furnished Status", "coolwarm"),
]

for ax, (column, title, palette) in zip(axes.flat, bar_panels):
    # Bars are ordered from most to least frequent label.
    sns.countplot(
        data=df,
        x=column,
        order=df[column].value_counts().index,
        ax=ax,
        palette=palette,
    )
    ax.set_title(title)
    ax.set_xlabel("")
    ax.tick_params(axis="x", rotation=45)

plt.tight_layout()
plt.show()


### Histograms for bedrooms, bathrooms and rent prices

In [None]:
# 2x2 grid of histograms (with KDE overlay) for four numeric-looking columns.
#
# Theme first, figure second: the style only applies to axes created after it.
sns.set(style="whitegrid")
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (column, bins, colour, panel title, x-axis label), in row-major order.
hist_panels = [
    ("Bedrooms", 10, "skyblue", "Distribution of Bedrooms", "Number of Bedrooms"),
    ("bathrooms", 10, "lightgreen", "Distribution of Bathrooms", "Number of Bathrooms"),
    ("rent_price_usd", 15, "coral", "Distribution of Rent Price (USD)", "USD Price"),
    ("rent_price_iqdr", 15, "plum", "Distribution of Rent Price (IQD)", "IQD Price"),
]

# The raw columns may still hold text at this point (the clean-up cells further
# down handle "studio" bedrooms and comma-grouped IQD amounts), so plot a
# numeric copy: thousands separators are stripped and anything unparseable
# becomes NaN. `df` itself is left untouched.
hist_df = df[[panel[0] for panel in hist_panels]].copy()
for column in hist_df.columns:
    hist_df[column] = pd.to_numeric(
        hist_df[column].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )

for ax, (column, bins, color, title, xlabel) in zip(axes.flat, hist_panels):
    sns.histplot(data=hist_df, x=column, bins=bins, kde=True, ax=ax, color=color)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Frequency")

# Layout
plt.tight_layout()
plt.show()

### Line plot 
#### Spot trends over time

In [None]:

# Weekly average rent (USD) per property type.
#
# Built from `df`: the previously referenced `filtered_df` is not defined
# anywhere in this notebook and raised a NameError.
# Work on an explicit copy so the column assignments below do not write into a
# view of `df`.
type_df = df[["type", "rent_price_usd", "listing_date"]].copy()

# Coerce instead of raising: unparseable prices/dates become NaN/NaT and are
# dropped together with rows that have no property type.
type_df["rent_price_usd"] = pd.to_numeric(type_df["rent_price_usd"], errors="coerce")
type_df["listing_date"] = pd.to_datetime(type_df["listing_date"], errors="coerce")
type_df = type_df.dropna(subset=["type", "rent_price_usd", "listing_date"])

# resample() needs a datetime index.
type_df = type_df.set_index("listing_date")

weekly_by_type = (
    type_df.groupby("type")["rent_price_usd"]
    .resample("W")
    .mean()
    .reset_index()
)

plt.figure(figsize=(14, 6))
sns.lineplot(data=weekly_by_type, x="listing_date", y="rent_price_usd", hue="type", marker="o")
plt.title("Weekly Average Rent Price by Property Type")
plt.xlabel("Week")
plt.ylabel("Average Rent Price (USD)")
plt.legend(title="Property Type")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


### Heatmap
#### Check for multicollinearity and strong correlations

In [None]:
# Pairwise correlations between the numeric-looking columns.
numeric_columns = ["Bedrooms", "bathrooms", "rent_price_usd", "rent_price_iqdr", "sqmt_street", "year_built"]

# New frame in which the selected columns are coerced to numbers
# (unparseable entries become NaN); `df` is not modified.
numeric_df = df.assign(
    **{name: pd.to_numeric(df[name], errors="coerce") for name in numeric_columns}
)

corr_matrix = numeric_df.loc[:, numeric_columns].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap="coolwarm",
    linewidths=0.5,
    fmt=".2f",
)
plt.title("Correlation Heatmap of Numeric Features")
plt.tight_layout()
plt.show()


### Pie Chart 
#### Top areas with most listings

In [None]:
# Share of listings among the five most frequent counties / neighbourhoods.
# Rows missing either location field are excluded from both charts.
location_df = df.loc[:, ["county", "neighbourhood"]].dropna()

top_counties = location_df["county"].value_counts().iloc[:5]
top_neighbourhoods = location_df["neighbourhood"].value_counts().iloc[:5]

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (axis, counts, palette name, title) for the left and right pie.
pie_panels = (
    (axes[0], top_counties, "pastel", "Top 5 Counties by Listing Count"),
    (axes[1], top_neighbourhoods, "Set3", "Top 5 Neighbourhoods by Listing Count"),
)
for pie_ax, counts, palette_name, pie_title in pie_panels:
    pie_ax.pie(
        counts,
        labels=counts.index,
        autopct="%1.1f%%",
        startangle=140,
        colors=sns.color_palette(palette_name),
    )
    pie_ax.set_title(pie_title)

plt.tight_layout()
plt.show()


### Scatter Plot
#### Understand relationships

In [None]:
# Rent price (USD) against bedroom count and against `sqmt_street`.
scatter_columns = ["Bedrooms", "rent_price_usd", "sqmt_street"]

# Coerce all three columns to numbers, then keep only fully populated rows.
scatter_df = (
    df[scatter_columns]
    .apply(pd.to_numeric, errors="coerce")
    .dropna(subset=scatter_columns)
)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (axis, x column, colour, title, x label) for each panel.
scatter_panels = (
    (axes[0], "Bedrooms", "slateblue", "Bedrooms vs. Rent Price (USD)", "Number of Bedrooms"),
    (axes[1], "sqmt_street", "tomato", "Street Size (sqm) vs. Rent Price (USD)", "Street Size (sqm)"),
)
for panel_ax, x_column, point_color, panel_title, x_label in scatter_panels:
    sns.scatterplot(data=scatter_df, x=x_column, y="rent_price_usd", ax=panel_ax, color=point_color, alpha=0.6)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("Rent Price (USD)")

plt.tight_layout()
plt.show()


### Count Plot
#### Explore presence/absence of features

In [None]:
# One count plot per feature column, arranged on a 2x2 grid.
cols_to_plot = ["balcony", "master bedroom", "area_type", "view"]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
sns.set(style="whitegrid")

# axes.ravel() walks the grid row by row, matching the order of cols_to_plot.
for panel, feature in zip(axes.ravel(), cols_to_plot):
    levels_by_frequency = df[feature].value_counts().index
    sns.countplot(data=df, x=feature, order=levels_by_frequency, ax=panel, palette="Set2")
    panel.set_title(f"Count Plot: {feature}")
    panel.set_xlabel("")
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


### Violin Plot
#### Visualize distributions per condition in more detail

In [None]:
# Distribution of rent price (USD) for each property condition.
# Prices are coerced to numbers and rows missing either field are dropped.
violin_df = (
    df[["rent_price_usd", "condition"]]
    .assign(rent_price_usd=lambda frame: pd.to_numeric(frame["rent_price_usd"], errors="coerce"))
    .dropna()
)

plt.figure(figsize=(10, 6))
sns.violinplot(
    data=violin_df,
    x="condition",
    y="rent_price_usd",
    palette="Set3",
    inner="box",
)
plt.title("Distribution of Rent Price (USD) by Condition")
plt.xlabel("Condition")
plt.ylabel("Rent Price (USD)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## 4: Clean up
#### 1. columns clean up 
##### 1.a property cleanup

In [None]:
# Normalise property-type labels. Whitespace is trimmed as well as lower-casing
# (as the view/neighbourhood clean-ups do) so padded labels such as "building "
# still match the mapping below.
df["type"] = df["type"].str.strip().str.lower()

# Label counts before merging. A bare expression in the middle of a cell is
# never rendered, so show it explicitly.
type_counts = df["type"].value_counts(dropna=False)
display(type_counts)

# Fold the building-level labels into a single "other" bucket.
df["type"] = df["type"].replace({
    "building": "other",
    "3 floors apartment building": "other"
})

cleaned_type_counts = df["type"].value_counts(dropna=False)
cleaned_type_counts

##### 1.b category cleanup

In [None]:
# Normalise listing categories. Whitespace is trimmed as well as lower-casing
# (as the view/neighbourhood clean-ups do) so padded labels still match the
# alias map.
df["category"] = df["category"].str.strip().str.lower()

# "sell" and "buy" are folded into the single label "sale".
df["category"] = df["category"].replace({
    "sell": "sale",
    "buy": "sale"
})

cleaned_category_counts = df["category"].value_counts(dropna=False)
cleaned_category_counts


##### 1.c condition cleanup

In [None]:
# Normalise condition labels. Whitespace is trimmed as well as lower-casing
# (as the view/neighbourhood clean-ups do) so padded labels still match the
# alias map.
df["condition"] = df["condition"].str.strip().str.lower()

# Collapse the raw labels into three buckets: "new", "good" and "old".
# "good" needs no entry of its own: values that are not keys are left as-is.
df["condition"] = df["condition"].replace({
    "excellent": "new",
    "lickable": "new",
    "under construction": "new",
    "average": "good",
    "medium": "good",
    "bad": "old"
})

cleaned_condition_counts = df["condition"].value_counts(dropna=False)
cleaned_condition_counts


##### 1.d furnished cleanup

In [None]:
# Normalise furnished labels. Whitespace is trimmed as well as lower-casing
# (as the view/neighbourhood clean-ups do) so padded labels still match the
# mapping below.
df["furnished"] = df["furnished"].str.strip().str.lower()

# "yes" is treated as fully furnished; the stray value "2" is turned into a
# missing value.
df["furnished"] = df["furnished"].replace({
    "yes": "fully",
    "2": pd.NA
})

cleaned_furnished_counts = df["furnished"].value_counts(dropna=False)
cleaned_furnished_counts


##### 1.e balcony cleanup

In [None]:
# Convert balcony to a numeric dtype; entries that cannot be parsed become NaN.
balcony_numeric = pd.to_numeric(df["balcony"], errors="coerce")
df["balcony"] = balcony_numeric

# Show the resulting dtype as the cell output.
df["balcony"].dtype


##### 1.f view cleanup

In [None]:
# Normalise the free-text `view` column (trim + lower-case), then map every
# known spelling onto its canonical label.
df["view"] = df["view"].str.strip().str.lower()

# canonical label -> spellings found in the data.
# Compound directions are folded into "north" as well.
view_aliases = {
    "north": [
        "shamaliye",
        "shamalya",
        "shamaliya",
        "شمالية",
        "shamaliye sharqiye",
        "shamalya gharbiya",
        "shamlaiye",
        "shamaliya gharbiya",
    ],
    "west": ["gharbia", "gharbiye", "غربية"],
    "east": ["sharqiye", "sharqiya"],
    "neighbourhood": ["naighbourhood", "neighnourhood", "internal neighbourhood"],
}

# Flatten into the {spelling: canonical} form expected by Series.replace.
view_replacements = {
    spelling: canonical
    for canonical, spellings in view_aliases.items()
    for spelling in spellings
}

df["view"] = df["view"].replace(view_replacements)
cleaned_view_counts = df["view"].value_counts(dropna=False)
cleaned_view_counts


##### 1.g ID cleanup

In [None]:
# Replace the original ID column with a fresh 1..N sequence placed first.
del df["ID"]

new_ids = range(1, len(df) + 1)
df.insert(loc=0, column="ID", value=new_ids)

df[["ID"]].head()


##### 1.h listing_date cleanup

In [None]:
# Parse listing_date into datetimes; values that cannot be parsed become NaT.
parsed_listing_dates = pd.to_datetime(df["listing_date"], errors="coerce")
df["listing_date"] = parsed_listing_dates

# Show the resulting dtype as the cell output.
df["listing_date"].dtype


##### 1.i bedrooms cleanup

In [None]:
# A "studio" counts as zero bedrooms; everything else is coerced to a number
# (unparseable entries become NaN).
bedrooms_raw = df["Bedrooms"].replace({"studio": 0})
df["Bedrooms"] = pd.to_numeric(bedrooms_raw, errors="coerce")

# Frequency of each bedroom count, NaN included, ordered by count value.
df["Bedrooms"].value_counts(dropna=False).sort_index()


##### 1.j master_bedroom drop

In [None]:
# Remove the "master bedroom" column from df in place.
df.drop("master bedroom", axis=1, inplace=True)

# Cell output: True once the column is gone.
"master bedroom" not in df.columns


##### 1.k floor_apartment cleanup

In [None]:
# The open-ended label "10 +" is capped at 10; the column is then coerced to
# numbers (unparseable entries become NaN).
floors_raw = df["floor_apartment"].replace({"10 +": 10})
df["floor_apartment"] = pd.to_numeric(floors_raw, errors="coerce")

# Frequency of each floor value, NaN included, ordered by floor.
df["floor_apartment"].value_counts(dropna=False).sort_index()


##### 1.l neighbourhood cleanup

In [None]:
# Normalise neighbourhood names (trim + lower-case), then merge known spelling
# variants into one canonical name each.
df["neighbourhood"] = df["neighbourhood"].str.strip().str.lower()

# canonical name -> variants found in the data.
neighbourhood_variants = {
    "beiji": ["beijeia", "beijia"],
    "mohandeseen": ["muhandeseen"],
    "dawoodi": ["dawoodi 611", "dawoodi. 611"],
    "mansour city": ["masour city in beija", "mansour city complex"],
}

# Flatten into the {variant: canonical} form expected by Series.replace.
neighbourhood_replacements = {
    variant: canonical
    for canonical, variants in neighbourhood_variants.items()
    for variant in variants
}

df["neighbourhood"] = df["neighbourhood"].replace(neighbourhood_replacements)

df["neighbourhood"].value_counts(dropna=False)


##### 1.m county cleanup

In [None]:
# Overwrite the county column with a single constant value for every row.
COUNTY_NAME = "mansour"
df["county"] = COUNTY_NAME

df["county"].value_counts(dropna=False)


##### 1.n street drop

In [None]:
# Remove the "street" column from df in place.
del df["street"]

# Cell output: True once the column is gone.
"street" not in df.columns


##### 1.o frontage cleanup (rename `wajiha` to `frontage`)

In [None]:
# Treat the "-" placeholder as missing, coerce the column to numbers
# (unparseable entries become NaN) and rename it to its English name.
df["wajiha"] = pd.to_numeric(
    df["wajiha"].replace("-", pd.NA),
    errors="coerce",
)
df.rename(columns={"wajiha": "frontage"}, inplace=True)

# Frequency of each frontage value, NaN included, ordered by value.
df["frontage"].value_counts(dropna=False).sort_index()


##### 1.p sqmt_street drop

In [None]:
# Remove the "sqmt_street" column from df in place.
df.drop("sqmt_street", axis=1, inplace=True)

# Cell output: True once the column is gone.
"sqmt_street" not in df.columns


##### 1.q year_built cleanup

In [None]:
from datetime import datetime  # imported by this cell; not used below

# Reference year from which the free-text ages are subtracted.
current_year = 2024

# Free-text age label -> age in years used for it.
age_by_label = {
    "1 - 5": 3,
    "10 - 19": 15,
    "20 + years": 20,
    "20 plus": 20,
    "20+": 20,
    "3 ago": 3,
    "3 y o": 3,
    "4 y o": 4,
    "5 y ago": 5,
    "6 - 9": 7,
    "6-9 y/.ago": 7,
}

# Label -> construction year.
year_replacements = {
    label: current_year - age for label, age in age_by_label.items()
}

# Normalise to trimmed lower-case text, swap known labels for years, then
# coerce everything to numbers (anything else unparseable becomes NaN).
year_text = df["year_built"].astype(str).str.strip().str.lower()
df["year_built"] = pd.to_numeric(year_text.replace(year_replacements), errors="coerce")

df["year_built"].value_counts(dropna=False).sort_index()


##### 1.r selling_price_per_meter_iqdr cleanup

In [None]:
# Strip thousands separators, coerce to numbers, then scale down values above
# 100,000,000 by a factor of 100.
iqd_col = "selling_price_per_meter_iqdr"

iqd_text = df[iqd_col].astype(str).str.replace(",", "", regex=False)
df[iqd_col] = pd.to_numeric(iqd_text, errors="coerce")

over_threshold = df[iqd_col] > 100_000_000
df.loc[over_threshold, iqd_col] = df.loc[over_threshold, iqd_col] / 100

# Cell output: up to ten distinct non-missing values.
df[iqd_col].dropna().unique()[:10]


##### 1.s rent_price_iqdr cleanup

In [None]:
# Strip thousands separators from the IQD rent and coerce it to numbers
# (unparseable entries become NaN).
rent_iqd_text = df["rent_price_iqdr"].astype(str).str.replace(",", "", regex=False)
df["rent_price_iqdr"] = pd.to_numeric(rent_iqd_text, errors="coerce")

# Cell output: up to ten distinct non-missing values.
df["rent_price_iqdr"].dropna().unique()[:10]


##### 1.t selling_price_per_meter_usd cleanup

In [None]:
# Coerce the USD price per metre to numbers, then scale down values above
# 30000 by a factor of 100.
usd_col = "selling_price_per_meter_usd"
df[usd_col] = pd.to_numeric(df[usd_col], errors="coerce")

over_threshold = df[usd_col] > 30000
df.loc[over_threshold, usd_col] = df.loc[over_threshold, usd_col] / 100

# Cell output: up to ten distinct non-missing values.
df[usd_col].dropna().unique()[:10]


##### 1.u area_type cleanup

In [None]:

# Label missing area types explicitly instead of leaving them as NaN.
AREA_TYPE_PLACEHOLDER = "unknown"
df["area_type"] = df["area_type"].fillna(value=AREA_TYPE_PLACEHOLDER)

df["area_type"].value_counts()


##### 1.v drop columns

In [None]:
# Drop whichever of these columns are still present; absent ones are skipped.
columns_to_drop = ["wajiha", "street", "sqmt_street"]
df.drop(columns=columns_to_drop, errors="ignore", inplace=True)

# Cell output: any listed column that survived (expected to be an empty list).
[name for name in columns_to_drop if name in df.columns]


### saving the cleaned dataset 
#### saving it in interim

In [None]:
from pathlib import Path

# Write the cleaned frame to the interim data folder.
# The folder is addressed relative to the notebook, like DATA_PATH and the
# later rent_v3/rent_v4 cells, instead of through one user's absolute home
# path, so the notebook runs on any checkout.
INTERIM_DIR = Path("../data/interim")
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

cleaned_file_path = str(INTERIM_DIR / "cleaned_baghdad_mansour_houses.csv")
df.to_csv(cleaned_file_path, index=False)

# Cell output: where the file was written.
cleaned_file_path

#### splitting it for rent and sale 

In [None]:
from pathlib import Path

# Split the cleaned dataset into rent and sale listings and save each part.
# Paths are relative to the notebook (like DATA_PATH) rather than tied to one
# user's home directory.
INTERIM_DIR = Path("../data/interim")
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(INTERIM_DIR / "cleaned_baghdad_mansour_houses.csv")

# .copy() so later column assignments do not write into a view of df.
df_rent_v1 = df[df["category"] == "rent"].copy()
df_sale_v1 = df[df["category"] == "sale"].copy()

rent_v1_path = str(INTERIM_DIR / "rent_v1.csv")
sale_v1_path = str(INTERIM_DIR / "sale_v1.csv")
df_rent_v1.to_csv(rent_v1_path, index=False)
df_sale_v1.to_csv(sale_v1_path, index=False)

# Cell output: the two files that were written.
rent_v1_path, sale_v1_path


#### converting rent to usd 

In [4]:
import pandas as pd
from pathlib import Path

# Build one USD rent column for rent listings and save it as rent_v2.csv.
# Paths are relative to the notebook (like DATA_PATH) rather than tied to one
# user's home directory.
INTERIM_DIR = Path("../data/interim")
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

# Fixed IQD-per-USD rate used to convert IQD-only prices.
IQD_PER_USD = 1500

df = pd.read_csv(INTERIM_DIR / "cleaned_baghdad_mansour_houses.csv")

df_rent_v1 = df[df["category"] == "rent"].copy()
df_sale_v1 = df[df["category"] == "sale"].copy()

df_rent_v1.to_csv(INTERIM_DIR / "rent_v1.csv", index=False)
df_sale_v1.to_csv(INTERIM_DIR / "sale_v1.csv", index=False)

# rent_price_usd is not converted to a numeric dtype by the clean-up section,
# so coerce both price columns before doing arithmetic on them (unparseable
# entries become NaN).
rent_usd = pd.to_numeric(df_rent_v1["rent_price_usd"], errors="coerce")
rent_iqd = pd.to_numeric(df_rent_v1["rent_price_iqdr"], errors="coerce")

# Prefer the quoted USD price; fall back to the converted IQD price.
df_rent_v1["final_rent_price_usd"] = rent_usd.fillna(rent_iqd / IQD_PER_USD)

df_rent_v1.to_csv(INTERIM_DIR / "rent_v2.csv", index=False)


In [None]:
import pandas as pd
from pathlib import Path

# Load the rent_v2 dataset (path relative to the notebook, like DATA_PATH,
# rather than one user's absolute home path)
df_rent_v2 = pd.read_csv(Path("../data/interim") / "rent_v2.csv")
final_price = df_rent_v2["final_rent_price_usd"]

# 1. Display sample of the column
print("\n🎯 Sample Values:")
print(final_price.dropna().head(10))

# 2. Count missing values
print("\n❗ Missing Values:", final_price.isna().sum())

# 3. Show distribution summary
print("\n📊 Price Summary:")
print(final_price.describe())


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Load the rent_v2 dataset (path relative to the notebook, like DATA_PATH,
# rather than one user's absolute home path)
df_rent_v2 = pd.read_csv(Path("../data/interim") / "rent_v2.csv")

# Set up the plot: histogram on the left, boxplot on the right
plt.figure(figsize=(12, 5))

# 1. Distribution plot
plt.subplot(1, 2, 1)
sns.histplot(df_rent_v2["final_rent_price_usd"].dropna(), bins=30, kde=True, color="skyblue")
plt.title("Distribution of Final Rent Price (USD)")
plt.xlabel("Price (USD)")
plt.ylabel("Frequency")

# 2. Boxplot for outliers
plt.subplot(1, 2, 2)
sns.boxplot(x=df_rent_v2["final_rent_price_usd"], color="salmon")
plt.title("Outlier Detection (Boxplot)")
plt.xlabel("Price (USD)")

plt.tight_layout()
plt.show()


In [None]:
from pathlib import Path

# Drop the sale-related price columns from rent_v2 (skipped if already absent)
df_rent_v2 = df_rent_v2.drop(columns=["selling_price_per_meter_usd", "selling_price_per_meter_iqdr"], errors="ignore")

# Save back to the same file (path relative to the notebook, like DATA_PATH,
# rather than one user's absolute home path)
RENT_V2_PATH = str(Path("../data/interim") / "rent_v2.csv")
df_rent_v2.to_csv(RENT_V2_PATH, index=False)

# Cell output: where the file was written.
RENT_V2_PATH


In [None]:
from pathlib import Path

# Drop the two source rent-price columns from rent_v2; final_rent_price_usd is
# not in the list and is therefore kept.
# The column name previously carried a trailing space ("rent_price_usd "), so
# with errors="ignore" the USD column was silently left in place.
df_rent_v2 = df_rent_v2.drop(columns=["rent_price_iqdr", "rent_price_usd"], errors="ignore")

# Save back to the same file (path relative to the notebook, like DATA_PATH,
# rather than one user's absolute home path)
RENT_V2_PATH = str(Path("../data/interim") / "rent_v2.csv")
df_rent_v2.to_csv(RENT_V2_PATH, index=False)

# Cell output: where the file was written.
RENT_V2_PATH


In [12]:
from pathlib import Path

# Paths are relative to the notebook, matching the next cell, which reads
# "../data/interim/rent_v3.csv".
INTERIM_DIR = Path("../data/interim")

# Load the rent_v2 file
df_rent_v2 = pd.read_csv(INTERIM_DIR / "rent_v2.csv")

# Drop the specified columns; errors="ignore" already skips names that are not
# present, so no separate membership filter is needed.
columns_to_drop = ["rent_price_usd", "rent_price_iqdr", "balcony", "neighbourhood", "frontage"]
df_rent_v3 = df_rent_v2.drop(columns=columns_to_drop, errors="ignore")

# Save the new version as rent_v3.csv
df_rent_v3.to_csv(INTERIM_DIR / "rent_v3.csv", index=False)


Simple Data Cleaning for rent_v4.csv
This cell:
• Drops useless columns
• Fills missing numerics with mean
• Fills missing categoricals with mode
• One-hot encodes categoricals
• Saves cleaned dataset as rent_v4.csv

In [None]:
# 📦 Imports
import pandas as pd
import numpy as np

# 📂 Load raw dataset
df = pd.read_csv("../data/interim/rent_v3.csv")

# 🚮 Drop useless columns
cols_to_drop = ["ID", "county", "area_type"]
df = df.drop(columns=cols_to_drop, errors="ignore")

# 🎯 Rows without a price are removed rather than mean-imputed: filling the
# price column with its own mean would fabricate values for exactly the
# quantity being analysed.
# NOTE(review): assumes final_rent_price_usd is the modelling target — confirm.
TARGET_COL = "final_rent_price_usd"
if TARGET_COL in df.columns:
    df = df.dropna(subset=[TARGET_COL])

# 🔧 Fill missing numeric feature columns with mean (price column excluded)
numeric_cols = df.select_dtypes(include="number").columns.drop(TARGET_COL, errors="ignore")
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# 🔧 Fill missing categorical columns with mode
categorical_cols = df.select_dtypes(include="object").columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# 🏷️ One-hot encode categoricals (every object column, first level dropped).
# NOTE(review): if listing_date is still text here it is encoded too, one
# column per distinct date — confirm that is intended.
df = pd.get_dummies(df, drop_first=True)

# 💾 Save cleaned dataset
CLEANED_PATH = "../data/interim/rent_v4.csv"
df.to_csv(CLEANED_PATH, index=False)

print(f"✅ rent_v4.csv saved at {CLEANED_PATH} — {df.shape[0]} rows, {df.shape[1]} features.")


## 5: Outlier Removal – Clean rent_v4.csv to rent_v5.csv

- Removed top 1% and bottom 1% extreme values in `final_rent_price_usd`
- Removed extreme large properties in `sqmt_living`
- Saved the result as `rent_v5.csv`


In [None]:
# Trim extreme rows from rent_v4 and store the result as rent_v5.
df = pd.read_csv("../data/interim/rent_v4.csv")

# Keep prices inside the inclusive 1st-99th percentile band.
price = df["final_rent_price_usd"]
lower_price, upper_price = price.quantile([0.01, 0.99])
df = df[price.between(lower_price, upper_price)]

# Then cut living areas above the 99th percentile of the remaining rows.
living_area = df["sqmt_living"]
upper_sqmt = living_area.quantile(0.99)
df = df[living_area <= upper_sqmt]

CLEANED_PATH = "../data/interim/rent_v5.csv"
df.to_csv(CLEANED_PATH, index=False)

print(f"rent_v5.csv saved — {df.shape[0]} rows, {df.shape[1]} features after outlier removal.")



- Dropped columns:
  - `floor_apartment`
  - `listing_date`
  - `condition`
  - `view`
- Goal: Reduce noise and instability before final modeling
- Saved cleaned dataset as `rent_v6.csv`


In [None]:
# Build rent_v6 from rent_v5 by removing floor_apartment, listing_date,
# condition and view, as described in the notes for this step.
# Previously only the listing_date_* columns were removed and nothing was saved.
df = pd.read_csv("../data/interim/rent_v5.csv")

# The categorical columns were one-hot encoded when rent_v4 was built, so they
# appear here as "<name>_<level>" columns and are matched by prefix.
encoded_prefixes = ("listing_date_", "condition_", "view_")
encoded_cols = [col for col in df.columns if col.startswith(encoded_prefixes)]
date_cols = [col for col in encoded_cols if col.startswith("listing_date_")]

# floor_apartment is a plain column; errors="ignore" tolerates its absence.
df = df.drop(columns=encoded_cols + ["floor_apartment"], errors="ignore")

print(f"Dropped {len(date_cols)} listing_date columns")

CLEANED_PATH = "../data/interim/rent_v6.csv"
df.to_csv(CLEANED_PATH, index=False)

print(f"rent_v6.csv saved at {CLEANED_PATH} — {df.shape[0]} rows, {df.shape[1]} features now.")


## 6: Extra Data Cleaning – rent_v7.csv

- Started from `rent_v5.csv` (after outlier removal)
- Dropped all `listing_date_*` one-hot columns
- Goal: Simplify input features for cleaner modeling and easier deployment
- Saved cleaned dataset as `rent_v7.csv`


In [None]:
# Build rent_v7 from rent_v5 by removing every listing_date_* column.
df = pd.read_csv("../data/interim/rent_v5.csv")

is_date_column = df.columns.str.startswith("listing_date_")
date_cols = df.columns[is_date_column].tolist()
df = df.drop(columns=date_cols)

print(f"Dropped {len(date_cols)} listing_date columns.")

CLEANED_PATH = "../data/interim/rent_v7.csv"
df.to_csv(CLEANED_PATH, index=False)

n_rows, n_features = df.shape
print(f"rent_v7.csv saved at {CLEANED_PATH} — {n_rows} rows, {n_features} features now.")
