In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
# Read the raw train / eval splits (paths are relative to the notebook's working directory).
train_df, eval_df = (
    pd.read_csv(csv_path)
    for csv_path in (r"../data/raw/train.csv", r"../data/raw/eval.csv")
)

In [None]:
# Display the first row of the training frame to eyeball column names and value formats.
train_df.head(1)

In [None]:
# Display the first row of the evaluation frame for a side-by-side check against train.
eval_df.head(1)

In [None]:
# Print the training frame's summary: per-column dtype and non-null count, plus memory usage.
train_df.info()

In [None]:
# Count rows that repeat an earlier row in every column
# (duplicated() marks all occurrences except the first by default).
train_df.duplicated().sum()

In [None]:
# Number of training rows per distinct `city_full` value.
train_df["city_full"].value_counts()


In [None]:
# Size of the training frame before any cleaning.
print(train_df.shape)

# Exact duplicates: rows matching an earlier row in every column.
duplicated_rows = int(train_df.duplicated().sum())
print("duplicated_rows:", duplicated_rows)

# Near duplicates: rows matching an earlier row in everything except 'date' / 'year'.
non_date_cols = train_df.columns.difference(['date', 'year'])
duplicated_rows = int(train_df.duplicated(subset=non_date_cols).sum())
print("duplicated_rows excluding date column:", duplicated_rows)

In [None]:
# Delete duplicates
# Rows that agree on every column except 'date' / 'year' are treated as duplicates.
# keep='first' retains one representative row per duplicated group; keep=False
# would discard every member of such a group (including the first occurrence),
# removing more rows than duplicated() counts and losing that data entirely.
non_date_cols = train_df.columns.difference(['date', 'year'])
train_df = train_df.drop_duplicates(subset=non_date_cols, keep='first')

print(train_df.shape)

# Sanity check: both duplicate counts should now be 0.
duplicated_rows = int(train_df.duplicated().sum())
print("duplicated_rows:", duplicated_rows)

duplicated_rows = int(train_df.duplicated(subset=non_date_cols).sum())
print("duplicated_rows excluding date column:", duplicated_rows)

In [None]:
# Size of the evaluation frame before any cleaning.
print(eval_df.shape)

# Exact duplicates: rows matching an earlier row in every column.
duplicated_rows = int(eval_df.duplicated().sum())
print("duplicated_rows:", duplicated_rows)

# Near duplicates: rows matching an earlier row in everything except 'date' / 'year'.
non_date_cols = eval_df.columns.difference(['date', 'year'])
duplicated_rows = int(eval_df.duplicated(subset=non_date_cols).sum())
print("duplicated_rows excluding date column:", duplicated_rows)

In [None]:
# Delete duplicates
# Rows that agree on every column except 'date' / 'year' are treated as duplicates.
# keep='first' retains one representative row per duplicated group; keep=False
# would discard every member of such a group (including the first occurrence),
# removing more rows than duplicated() counts and losing that data entirely.
non_date_cols = eval_df.columns.difference(['date', 'year'])
eval_df = eval_df.drop_duplicates(subset=non_date_cols, keep='first')

print(eval_df.shape)

# Sanity check: both duplicate counts should now be 0.
duplicated_rows = int(eval_df.duplicated().sum())
print("duplicated_rows:", duplicated_rows)

duplicated_rows = int(eval_df.duplicated(subset=non_date_cols).sum())
print("duplicated_rows excluding date column:", duplicated_rows)

In [None]:
# Remove the `city_full` column from both splits.
# The drop is in place, so re-running this cell raises KeyError once the column is gone.
train_df.drop(columns='city_full', inplace=True)
eval_df.drop(columns='city_full', inplace=True)

In [None]:
# Shapes of the train and eval frames, one per line.
print(train_df.shape, eval_df.shape, sep="\n")

In [None]:
def find_outliers_per_city(df, col, group_col='city', iqr_multiplier=1.5):
    """Return index labels of rows whose ``col`` value is an IQR outlier within its group.

    Rows are grouped by ``group_col`` and, for each group separately, the
    fences ``Q1 - k * IQR`` and ``Q3 + k * IQR`` are computed from that group's
    own quartiles. A row is an outlier when its value lies strictly outside
    its group's fences, so values are only compared against rows of the same
    group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``col`` and ``group_col``.
    col : str
        Name of the numeric column to test.
    group_col : str, default 'city'
        Column whose values define the groups.
    iqr_multiplier : float, default 1.5
        The ``k`` factor applied to the inter-quartile range.

    Returns
    -------
    list
        Unique index labels of the outlier rows, in order of first appearance
        (groups in the order ``groupby`` yields them, rows in frame order
        within each group). Empty when nothing is flagged.

    Notes
    -----
    Rows whose ``group_col`` is NaN are skipped because ``groupby`` drops NaN
    keys by default. NaN values in ``col`` are never flagged: comparisons
    against the fences evaluate to False.
    """
    outlier_indices = []

    # Group so each value is judged against its own group's distribution.
    for _, group in df.groupby(group_col):
        values = group[col]
        q1, q3 = values.quantile([0.25, 0.75])
        spread = iqr_multiplier * (q3 - q1)

        is_outlier = (values < q1 - spread) | (values > q3 + spread)
        outlier_indices.extend(values[is_outlier].index)

    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(outlier_indices))

# Flag sale-price outliers city by city and report how many rows were caught.
bad_indices = find_outliers_per_city(df=train_df, col='median_sale_price')
print(f"Found {len(bad_indices)} rows that are outliers relative to their specific city.")

In [None]:
# Distribution of house prices across the dataset
df = train_df  # alias kept so any later cell that reads `df` still works
sns.set_theme(style="ticks")
fig, ax = plt.subplots(figsize=(8, 6))

# Histogram (60 bins) with a KDE overlay; missing prices are excluded.
prices = df["price"].dropna()
sns.histplot(prices, bins=60, kde=True, color=sns.color_palette("rocket_r", 1)[0], ax=ax)

# Mark the median with a dashed line and label it so the figure reads on its own.
median_price = prices.median()
ax.axvline(median_price, ls="--", lw=1.2, color="black", label=f"Median: ${median_price:,.0f}")

ax.set_title("Distribution of house prices (training set)")
ax.set_xlabel("Price ($)")
ax.set_ylabel("Count")
ax.legend(frameon=False)
sns.despine(trim=True)
ax.grid(False)
# Plain tick labels instead of scientific notation on the price axis.
ax.ticklabel_format(axis="x", style="plain")
plt.show()

In [None]:
# Persist the cleaned splits for the next stage; the row index is not written.
train_df.to_csv(path_or_buf=r"../data/interim/train_1.csv", index=False)
eval_df.to_csv(path_or_buf=r"../data/interim/eval_1.csv", index=False)