# 🏠 Mini-Project: Preprocess & Engineer Features on Ames Housing Dataset

> **Goal: Work with the [Ames Housing dataset](https://www.kaggle.com/datasets/prevek18/ames-housing-dataset?select=AmesHousing.csv) to perform data preprocessing and create meaningful new features. You will:**
> - Handle **missing values**, **duplicates**, and **outliers**  
> - Detect and fix **skewness** in numerical features  
> - Encode categorical variables into numeric formats  
> - Create **non-linear features** (e.g., polynomial, log, interaction terms) from existing variables  
> - Save the cleaned and enriched dataset into a new CSV file  

<p align="center">📢⚠️📂</p>

<p align="center"> Please name your file using the format: <code>assignmentName_nickname.py/.ipynb</code> (e.g., <code>project2_rezashokrzad.py</code>) and push it to GitHub with a clear commit message.</p>

<p align="center"> 🚨📝🧠</p>


## 🔹 Step 1: Load the Dataset


In [None]:
# TODO: Load the Ames Housing dataset into a DataFrame.
# Hint: The dataset is available on Kaggle ("Ames Housing").
# After loading, display the first and last 5 rows to check if it worked.
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV file inside the Kaggle dataset
file_path = "AmesHousing.csv"

# Kaggle handle of the dataset; the latest published version is loaded
dataset_handle = "shashanknecrothapa/ames-housing-dataset"

# Additional arguments such as sql_query or pandas_kwargs may be passed too.
# See the documentation for more information:
# https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, dataset_handle, file_path)


df.head(5)

## 🔹 Step 2: Exploratory Data Review (EDR)

In [None]:
# TODO: Perform initial exploration of the dataset.
# - Check shape, column names, samples
# - Get summary info, data types
# - Descriptive statistics

# (rows, columns) of the loaded frame
df.shape


In [None]:
# Column labels of the loaded frame
df.columns

In [None]:
# Per-column dtype and non-null count
df.info()

In [None]:
# Five randomly chosen rows
df.sample(5)

In [None]:
# Descriptive statistics of the numeric columns
df.describe()

In [None]:
# Number of distinct values per column
df.nunique()

# ⭕ Step 2': Remove Duplicates



In [None]:
# TODO: Check and remove duplicate rows, if there are any.

# Count fully identical rows first so the number can still be inspected below.
n_duplicates = int(df.duplicated().sum())
if n_duplicates:
    # Keep the first occurrence of every duplicated row.
    df = df.drop_duplicates()
n_duplicates

## 🔹 Step 3: Missing Value Check & Handling

In [None]:
# TODO: Check missing values.
# Decide on a strategy (if needed):
# - Drop if too many are missing
# - Fill with mean/median/mode/domain-specific value

# Lift pandas' display limits so every column/row of the report is shown.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

# Missing-value count per column, listing only the affected columns (worst first).
missing_per_column = df.isnull().sum()
missing_per_column[missing_per_column > 0].sort_values(ascending=False)

In [None]:
# Keep only the rows that carry at least 70 non-missing values.
# NOTE(review): many missing entries in this dataset presumably mean "feature
# absent" (no garage, no basement, ...) rather than "unknown" — confirm that
# this threshold does not discard exactly those houses.
df = df.dropna(thresh=70)
df.shape

In [None]:
# Impute the remaining gaps column by column.
# The result is assigned back to the frame: `df[col].fillna(..., inplace=True)`
# works on an intermediate Series (chained assignment), which pandas warns about
# and which leaves `df` unchanged once Copy-on-Write is active.

# Numeric columns: fill with the column median.
for col in df.select_dtypes(include=["float64", "int64"]).columns:
    df[col] = df[col].fillna(df[col].median())

# Object (categorical) columns: fill with the most frequent value.
for col in df.select_dtypes(include=["object"]).columns:
    modes = df[col].mode()
    if not modes.empty:  # an entirely missing column has no mode to fill with
        df[col] = df[col].fillna(modes.iloc[0])




In [None]:
# Total number of missing cells left in the frame
df.isnull().sum().sum()

In [None]:
# Re-inspect dtypes and non-null counts after imputation
df.info()

In [None]:

# Column check helper
def is_numeric_but_object(series):
    """Tell whether an object-typed Series holds only numeric-looking values.

    Returns False for any Series whose dtype is not ``object``. Otherwise every
    value is parsed with ``pd.to_numeric(errors='coerce')`` and the result is
    True only when 100% of the entries parse (missing values count as failures).
    """
    # A column that is not object-typed is never reported.
    if series.dtype != 'object':
        return False

    parsed = pd.to_numeric(series, errors='coerce')
    numeric_share = parsed.notna().sum() / len(series)
    return numeric_share == 1  # 100% numeric

# Columns stored as object although every value parses as a number
numeric_like_objects = [col for col in df.columns if is_numeric_but_object(df[col])]

print("Columns that are object but fully numeric:", numeric_like_objects)

## 🔹 Step 4: Correlation Check & Feature Decision

In [None]:
# TODO: Check correlations between numerical features and target variable (SalePrice).
# Use correlation heatmap or pairplot.
# Decide which features to keep/remove based on correlation.

# Pairwise correlation matrix restricted to the numeric columns
df.corr(numeric_only=True)

In [None]:
# Only the correlation of each numeric column with the target, strongest first
cor_target = df.corr(numeric_only=True)['SalePrice'].sort_values(ascending=False)

# Tall, narrow canvas: the heatmap has a single column
fig, ax = plt.subplots(figsize=(6, 12))
# heatmap needs 2-D input, so the Series is turned into a one-column DataFrame
sns.heatmap(cor_target.to_frame(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation of Features with SalePrice")
plt.show()

In [None]:
# Only the last expression of a cell is displayed, so the shape is evaluated
# but not shown; the cell output is the last five rows.
df.shape
df.tail()

## 🔹 Step 5: Encode Categorical Variables

In [None]:
# Column-name snapshots taken before any encoding: everything that is not
# object-typed vs. the object-typed columns. Later cells index the frame with
# num_cols (scaling, IQR and z-score checks).
num_cols = df.select_dtypes(exclude=['object']).columns
cat_cols = df.select_dtypes(include=['object']).columns


In [None]:
# TODO: Identify categorical variables.
# Use methods like:
# - One-hot encoding
# - Ordinal encoding
# Decide what makes sense for each feature.

# Categorical columns whose levels have a natural order (encoded as ranks).
ordinal_features = [
    "Lot Shape", "Land Slope",
    "Exter Qual", "Exter Cond",
    "Bsmt Qual", "Bsmt Cond", "Bsmt Exposure", "BsmtFin Type 1", "BsmtFin Type 2",
    "Heating QC", "Kitchen Qual", "Functional", "Fireplace Qu",
    "Garage Finish", "Garage Qual", "Garage Cond",
    "Paved Drive", "Pool QC", "Fence",
]

# Categorical columns without an order (one-hot encoded).
nominal_features = [
    "MS Zoning", "Street", "Alley", "Land Contour", "Utilities", "Lot Config",
    "Neighborhood", "Condition 1", "Condition 2", "Bldg Type", "House Style",
    "Roof Style", "Roof Matl", "Exterior 1st", "Exterior 2nd", "Mas Vnr Type",
    "Foundation", "Heating", "Central Air", "Electrical", "Garage Type",
    "Misc Feature", "Sale Type", "Sale Condition",
]



In [None]:
# OrdinalEncoder orders categories alphabetically, which does not match the
# real quality order of these levels, so every ordinal column is mapped by hand
# (0 = worst level, increasing with quality).

# Shared scales, defined once so that columns using the same levels cannot drift apart.
quality_scale = {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
# Basement finish: unfinished < low quality < below average < rec room
# < average living quarters < good living quarters.
bsmt_finish_scale = {'Unf': 0, 'LwQ': 1, 'BLQ': 2, 'Rec': 3, 'ALQ': 4, 'GLQ': 5}

ordinal_mapping = {
    "Lot Shape": {'IR3': 0, 'IR2': 1, 'IR1': 2, 'Reg': 3},
    "Land Slope": {'Sev': 0, 'Mod': 1, 'Gtl': 2},
    "Exter Qual": quality_scale,
    "Exter Cond": quality_scale,
    "Bsmt Qual": quality_scale,
    "Bsmt Cond": quality_scale,
    "Bsmt Exposure": {'No': 0, 'Mn': 1, 'Av': 2, 'Gd': 3},
    "BsmtFin Type 1": bsmt_finish_scale,
    # Same levels as "BsmtFin Type 1"; GLQ (good) must rank above ALQ (average).
    "BsmtFin Type 2": bsmt_finish_scale,
    "Heating QC": quality_scale,
    "Kitchen Qual": quality_scale,
    # Severity of deductions: moderate (Mod) is worse than minor (Min2, Min1).
    "Functional": {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7},
    "Fireplace Qu": quality_scale,
    "Garage Finish": {'Unf': 0, 'RFn': 1, 'Fin': 2},
    "Garage Qual": quality_scale,
    "Garage Cond": quality_scale,
    "Paved Drive": {'N': 0, 'P': 1, 'Y': 2},
    "Pool QC": {'Fa': 0, 'TA': 1, 'Gd': 2, 'Ex': 3},
    "Fence": {'MnWw': 0, 'MnPrv': 1, 'GdWo': 2, 'GdPrv': 3}
}


# Apply the mappings in place on df.
for col, mapping in ordinal_mapping.items():
    # Series.map turns every value missing from the mapping into NaN without
    # any message (this also happens when the cell is run a second time), so
    # report such values instead of losing them silently.
    unmapped = set(df[col].dropna().unique()) - set(mapping)
    if unmapped:
        print(f"WARNING: {col!r} has values without a rank (they become NaN): {sorted(map(str, unmapped))}")
    df[col] = df[col].map(mapping)

df.tail()

In [None]:
# One-hot encode the nominal columns; sparse_output=False yields a dense array.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded = ohe.fit_transform(df[nominal_features])

# Reusing df's index is essential: without it the concat below aligns on a
# fresh RangeIndex and produces NaN rows (visible with .tail()).
dummy_names = ohe.get_feature_names_out(nominal_features)
encoded_df = pd.DataFrame(encoded, index=df.index, columns=dummy_names)

# Swap the original nominal columns for their dummy columns.
without_nominal = df.drop(columns=nominal_features)
df = pd.concat([without_nominal, encoded_df], axis=1)

df.tail()

In [None]:
# Dtypes and non-null counts after encoding
df.info()

In [None]:
# Frame size after one-hot encoding
df.shape


## 🔹 Step 6:  Feature Scaling

In [None]:
# TODO: Try different scaling techniques:
# - StandardScaler
# - MinMaxScaler
# - RobustScaler
# Decide based on the distribution of features.

# RobustScaler centres on the median and scales by the IQR.
rs = RobustScaler()

# The target stays in its original units: scaling SalePrice together with the
# predictors makes the saved prices uninterpretable and ties the target to a
# scaler that is meant for features.
scale_cols = [col for col in num_cols if col != 'SalePrice']
# NOTE(review): Step 7 derives new features (ages, ratios, log) from these
# columns after this cell has run; consider scaling after feature creation.
df[scale_cols] = rs.fit_transform(df[scale_cols])


## 🔹 Step 7: Feature Selection & Feature Creation 💡

In [None]:
# TODO: Select the most useful features.
# Try:
# - Correlation thresholding and Removing highly collinear features
# - decide yourself for dropping useless ones



In [None]:
# Filtering features (selecting)
target = 'SalePrice'

# Numeric columns only
numeric_df = df.select_dtypes(include=['int64', 'float64'])
numeric_cols = numeric_df.columns.tolist()

# Absolute correlation of every numeric column with the target
cor_target = numeric_df.corr()[target].abs()

# Drop columns with |corr| < 0.1 (the target itself has corr 1 and is kept)
relevant_features = cor_target[cor_target >= 0.1].index.tolist()
df_filtered = df[relevant_features]

# Multicollinearity check among the predictors only. The target is left out of
# this matrix: otherwise a predictor that correlates > 0.9 with SalePrice (or
# SalePrice itself, depending on column order) would be flagged for removal.
predictor_cols = [col for col in relevant_features if col != target]
corr_matrix = df_filtered[predictor_cols].corr().abs()
# Keep the strict upper triangle so that every pair is looked at once.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
to_drop = [column for column in upper.columns if (upper[column] > 0.9).any()]
df_cleaned = df_filtered.drop(columns=to_drop)

# Numeric columns that were removed because of low correlation with the target
numeric_removed = list(set(numeric_cols) - set(relevant_features))

print("Numeric columns removed due to low correlation with target:", numeric_removed)
print("Columns removed due to multicollinearity:", to_drop)
print("Remaining columns:", df_cleaned.columns.tolist())


In [None]:
# Only the correlation of each retained column with the target, strongest first
cor_target = df_cleaned.corr(numeric_only=True)['SalePrice'].sort_values(ascending=False)

# Tall, narrow canvas: the heatmap has a single column
fig, ax = plt.subplots(figsize=(6, 12))
# heatmap needs 2-D input, so the Series is turned into a one-column DataFrame
sns.heatmap(cor_target.to_frame(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation of Features with SalePrice")
plt.show()


In [None]:
# Full list of column names currently in df
print(df.columns.tolist())


In [None]:
# TODO: Create at least 2 NEW features.
# Examples:
# - Age of house: df["HouseAge"] = df["YrSold"] - df["YearBuilt"]
# - Interaction: df["Quality_x_Size"] = df["OverallQual"] * df["GrLivArea"]
# - Non-linear: df["Log_LotArea"] = np.log1p(df["LotArea"])
import numpy as np

# The numeric columns were replaced by their RobustScaler output in Step 6, so
# expressions such as "Yr Sold" - "Year Built", log1p("Lot Area") or
# "Garage Cars" > 0 would run on centred/scaled numbers (log1p even returns NaN
# for values below -1). Recover the original values from the fitted scaler and
# derive every feature from those.
scaled_cols = list(rs.feature_names_in_)
raw = df.copy()
# Rounding removes the float noise of the scale -> unscale round trip, so
# equality tests (Year Built vs. Year Remod/Add) behave as on the raw data.
raw[scaled_cols] = np.round(rs.inverse_transform(df[scaled_cols]), 6)


def _raw_col(name):
    """Return raw column *name*, or an all-zero Series when it does not exist."""
    if name in raw.columns:
        return raw[name]
    return pd.Series(0, index=raw.index)


def _ratio_or_zero(numerator, denominator):
    """Return numerator / denominator where the denominator is > 0, else 0."""
    positive = denominator > 0  # NaN compares False, so missing values give 0 too
    return np.where(positive, numerator / denominator.where(positive), 0)


# 1. Features suggested in the assignment
df["HouseAge"] = raw["Yr Sold"] - raw["Year Built"]
df["Quality_x_Size"] = raw["Overall Qual"] * raw["Gr Liv Area"]
df["Log_LotArea"] = np.where(raw["Lot Area"].notna(), np.log1p(raw["Lot Area"]), 0)

# 2. Combined totals
df["TotalSF"] = raw["Total Bsmt SF"] + raw["1st Flr SF"] + raw["2nd Flr SF"]
df["TotalPorchSF"] = (raw["Open Porch SF"] + raw["Enclosed Porch"] +
                      raw["3Ssn Porch"] + raw["Screen Porch"])
df["TotalBathrooms"] = (raw["Full Bath"] + 0.5 * raw["Half Bath"] +
                        raw["Bsmt Full Bath"] + 0.5 * raw["Bsmt Half Bath"])
df["TotalRooms"] = raw["TotRms AbvGrd"] + _raw_col("BsmtRooms")  # BsmtRooms may not exist
df["TotalOutdoorSF"] = _raw_col("Wood Deck SF") + df["TotalPorchSF"] + _raw_col("Pool Area")

# 3. Ratios (0 where the denominator is zero or missing)
df["GrLivArea_per_Room"] = _ratio_or_zero(raw["Gr Liv Area"], raw["TotRms AbvGrd"])
df["GarageArea_per_Car"] = _ratio_or_zero(raw["Garage Area"], raw["Garage Cars"])
df["Bath_per_Bedroom"] = _ratio_or_zero(df["TotalBathrooms"], raw["Bedroom AbvGr"])

df["LotArea_per_GrLivArea"] = raw["Lot Area"] / raw["Gr Liv Area"]
df["Year_since_remod"] = raw["Yr Sold"] - raw["Year Remod/Add"]

# 4. Interactions
df["OverallQual_x_TotalSF"] = raw["Overall Qual"] * df["TotalSF"]
# Same values as Quality_x_Size; kept because later cells refer to this name.
df["OverallQual_x_GrLivArea"] = raw["Overall Qual"] * raw["Gr Liv Area"]

# 5. Time-based / boolean flags
# Same values as Year_since_remod; kept so the column set stays unchanged.
df["Since_Remodel"] = raw["Yr Sold"] - raw["Year Remod/Add"]
df["Is_Remodeled"] = (raw["Year Built"] != raw["Year Remod/Add"]).astype(int)
df["Has_Pool"] = (_raw_col("Pool Area") > 0).astype(int)
df["Has_Garage"] = ((_raw_col("Garage Area") > 0) | (_raw_col("Garage Cars") > 0)).astype(int)
df["Has_Basement"] = (_raw_col("Total Bsmt SF") > 0).astype(int)
df["Has_Fireplace"] = (_raw_col("Fireplaces") > 0).astype(int)

# 6. Combined kitchen score (only when both columns exist)
if {"Kitchen Qual", "Kitchen Cond"}.issubset(df.columns):
    df["Kitchen_Score"] = (df["Kitchen Qual"] + df["Kitchen Cond"]) / 2

# 7. Turn infinities from divisions into NaN so they can be imputed
df.replace([np.inf, -np.inf], np.nan, inplace=True)

df.isna().sum()



# 🫕 Feature creation introduced NaN values — impute them now

In [None]:
# NaN counts observed after feature creation:
#   Log_LotArea            341
#   LotArea_per_GrLivArea    3
# Median imputation. The result is assigned back to the frame because
# `df[col].fillna(..., inplace=True)` acts on an intermediate Series (chained
# assignment) and does not update `df` once Copy-on-Write is active.
for col in ("Log_LotArea", "LotArea_per_GrLivArea"):
    df[col] = df[col].fillna(df[col].median())

df.isna().sum().sum()

## 🔹 Step 8: Outlier Handling

In [None]:
# TODO: Detect and handle outliers.
# Methods:
# - IQR rule
# - Z-score
# - Visualization (boxplots, scatterplots)




In [None]:
# IQR rule on the columns listed in num_cols
numeric_view = df[num_cols]
Q1 = numeric_view.quantile(0.25)
Q3 = numeric_view.quantile(0.75)
IQR = Q3 - Q1

# 1.5 is the conventional multiplier; change it to tune the sensitivity.
below_fence = numeric_view < (Q1 - 1.5 * IQR)
above_fence = numeric_view > (Q3 + 1.5 * IQR)
# A row is flagged as soon as one of its columns lies outside the fences.
outliers = (below_fence | above_fence).any(axis=1)

print(f'Number of outliers detected by boxplot method: {outliers.sum()}')
print(f'Percentage of outliers: {outliers.sum()/len(df)*100:.2f}%')

df[outliers].shape

In [None]:
# Absolute z-score of every value in the num_cols columns
z_score = np.abs((df[num_cols] - df[num_cols].mean()) / df[num_cols].std())
# A row is flagged when ANY of its columns exceeds the threshold (`.any`, not
# `.all`). The conventional cut-off of 3 standard deviations is used, the same
# value as in the later z-score check; 1.5 is the IQR multiplier, not a z-score
# threshold, and flags nearly every row.
outliers = (z_score > 3).any(axis=1)
df[outliers].shape
# df[~outliers].shape  # rows that are not outliers


In [None]:
# Boxplots of every numeric column, drawn in figures of `batch_size` rows
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

batch_size = 5
for i in range(0, len(numeric_cols), batch_size):
    cols_batch = numeric_cols[i:i+batch_size]

    # One row per column; squeeze=False keeps `axes` 2-D even for a single row.
    fig, axes = plt.subplots(len(cols_batch), 1,
                             figsize=(12, len(cols_batch) * 1.5), squeeze=False)
    for ax, col in zip(axes[:, 0], cols_batch):
        sns.boxplot(x=df[col], color="skyblue", ax=ax)
        ax.set_title(f"Boxplot - {col}")
    fig.tight_layout()
    plt.show()


In [None]:
# Scatterplots of every numeric feature against the target

# -- main settings --
target_col = 'SalePrice'   # name of the target column
batch_size = 5             # number of plots per figure

# Numeric columns except the target, sorted with names that start with a
# letter first and names that start with a digit last.
numeric_cols = sorted(
    (name for name in df.select_dtypes(include=np.number).columns if name != target_col),
    key=lambda name: (str(name)[0].isdigit(), str(name).lower()),
)

# Draw the scatterplots batch by batch
for start in range(0, len(numeric_cols), batch_size):
    cols_batch = numeric_cols[start:start + batch_size]

    fig, axes = plt.subplots(len(cols_batch), 1,
                             figsize=(12, len(cols_batch) * 3), squeeze=False)
    for ax, col in zip(axes[:, 0], cols_batch):
        sns.scatterplot(x=df[col], y=df[target_col], alpha=0.6, color='teal', edgecolor=None, ax=ax)
        ax.set_title(f"Scatterplot: {col} vs {target_col}")
        ax.set_xlabel(col)
        ax.set_ylabel(target_col)
    fig.tight_layout()
    plt.show()

## 🔹 Step 9: Skewness Handling

In [None]:
# TODO: Check skewness of numerical features.
# Apply log, sqrt, Box-Cox, or Yeo-Johnson depending on distribution.

from scipy.stats import boxcox, yeojohnson

# 1. Numeric columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# 2. Skewness before the transformation
skew_values = df[numeric_cols].skew()
print("Skewness before transformation:\n", skew_values)

# 3. Columns with |skew| above this value are transformed
skew_threshold = 0.75

# 4. Work on a copy so that df keeps the untransformed values
df_transformed = df.copy()

# Fitted lambda per transformed column; without it a transform can neither be
# inverted nor applied to new data.
power_lambdas = {}

# 5. Transform each skewed column according to its value range.
# This cell relies on the frame being free of NaN (imputed earlier).
for col in numeric_cols:
    if abs(skew_values[col]) <= skew_threshold:
        continue

    values = df_transformed[col]
    # Two-valued columns (one-hot dummies, flags) stay as they are: a power
    # transform only relabels the two levels and cannot reduce their skew.
    if values.nunique() <= 2:
        continue

    if (values > 0).all():
        # Strictly positive -> Box-Cox
        df_transformed[col], power_lambdas[col] = boxcox(values)
        print(f"Applied Box-Cox on {col}")
    else:
        # Contains zero or negative values -> Yeo-Johnson
        df_transformed[col], power_lambdas[col] = yeojohnson(values)
        print(f"Applied Yeo-Johnson on {col}")

# 6. Skewness after the transformation
new_skew = df_transformed[numeric_cols].skew()
print("Skewness after transformation:\n", new_skew)

# 7. df_transformed is the frame used by the following cells

# 7️⃣ df_transformed آماده است برای scaling و مدل‌سازی

In [None]:
# Features that should be dropped
# NOTE(review): several names here ('TotRmsAbvGrd', 'GarageCars', 'LotFrontage',
# 'Age_House') are spelled differently from the columns used elsewhere in this
# notebook ('TotRms AbvGrd', 'Garage Cars', 'HouseAge'), so the existence
# filter below presumably skips them — confirm which columns are meant.
drop_cols = [
    'Has_Pool', 'Has_Shed', 'Garage_Carport',
    'OverallQual_x_GrLivArea', 'TotalBsmtSF_per_Room', 'Age_House',
    'TotRmsAbvGrd', 'GarageCars', 'LotFrontage'
]

# Restrict the list to columns that really exist so that drop() cannot fail
present_columns = set(df_transformed.columns)
drop_cols_existing = list(filter(present_columns.__contains__, drop_cols))

# Remove them from the transformed frame
df_transformed = df_transformed.drop(columns=drop_cols_existing)

# Confirm the column count after the drop
print(f"New shape of df: {df_transformed.shape}")

In [None]:
# ---------- Step 1: Calculate skewness ----------
skewness = df_transformed.skew()

# ---------- Step 2: Calculate outlier percentage ----------
Q1 = df_transformed.quantile(0.25)
Q3 = df_transformed.quantile(0.75)
IQR = Q3 - Q1

outlier_mask = ((df_transformed < (Q1 - 1.5 * IQR)) | (df_transformed > (Q3 + 1.5 * IQR)))
# Percentage relative to the frame the mask was computed on.
outlier_percent = outlier_mask.sum() / len(df_transformed) * 100

# ---------- Step 3: Classify features ----------
drop_features = []
clip_features = []

for col in skewness.index:  # only columns for which a skewness exists
    if abs(skewness[col]) > 1 and outlier_percent[col] > 20:
        drop_features.append(col)
    elif outlier_percent[col] > 10 and IQR[col] > 0:
        # IQR == 0 means both fences equal the quartile value, so clipping
        # would overwrite the whole column with one constant (typical for
        # one-hot dummies and concentrated ordinal codes); such columns are
        # not clipped.
        clip_features.append(col)

print("Features to DROP (too skewed + high outliers):", drop_features)
print("Features to CLIP (moderate skew, outlier handling):", clip_features)

# ---------- Step 4: Apply clipping ----------
df_clean = df_transformed.copy()
for col in clip_features:
    lower = Q1[col] - 1.5 * IQR[col]
    upper = Q3[col] + 1.5 * IQR[col]
    df_clean[col] = df_clean[col].clip(lower=lower, upper=upper)

In [None]:
# Drop the features judged not useful. This cell starts again from
# df_transformed, so df_clean is rebuilt from scratch here.
# errors='ignore' keeps the cell runnable when a column is already absent.
df_clean = df_transformed.drop(columns=['MS Zoning_RL', 'Roof Style_Gable'], errors='ignore')

# Features to clip with the IQR rule
clip_features = [
    'Exter Cond', 'BsmtFin Type 2', 'BsmtFin SF 2', 'Fireplace Qu',
    'Enclosed Porch', 'MS Zoning_RM', 'Land Contour_Lvl', 'Lot Config_Corner',
    'Neighborhood_NAmes', 'Condition 1_Norm', 'Bldg Type_1Fam',
    'House Style_1.5Fin', 'Roof Style_Hip', 'Exterior 1st_HdBoard',
    'Exterior 1st_MetalSd', 'Exterior 1st_Wd Sdng', 'Exterior 2nd_HdBoard',
    'Exterior 2nd_MetalSd', 'Exterior 2nd_Wd Sdng', 'Foundation_BrkTil',
    'Sale Type_WD ', 'Sale Condition_Normal', 'GarageArea_per_Car',
    'Bath_per_Bedroom', 'LotArea_per_GrLivArea', 'OverallQual_x_TotalSF'
]

# Only columns that exist in the frame can be clipped.
present_clip_features = [col for col in clip_features if col in df_clean.columns]

Q1 = df_clean[present_clip_features].quantile(0.25)
Q3 = df_clean[present_clip_features].quantile(0.75)
IQR = Q3 - Q1

for col in present_clip_features:
    # With IQR == 0 both fences equal the quartile value, and clipping would
    # replace the whole column by that one constant. This hits one-hot dummies
    # and concentrated ordinal codes, so those columns are left untouched.
    if IQR[col] == 0:
        continue
    lower = Q1[col] - 1.5 * IQR[col]
    upper = Q3[col] + 1.5 * IQR[col]
    df_clean[col] = df_clean[col].clip(lower=lower, upper=upper)

# Final check
print("Final shape:", df_clean.shape)

In [None]:
# IQR rule on the num_cols columns of the cleaned frame
numeric_view = df_clean[num_cols]
Q1 = numeric_view.quantile(0.25)
Q3 = numeric_view.quantile(0.75)
IQR = Q3 - Q1

# 1.5 is the conventional multiplier; change it to tune the sensitivity.
below_fence = numeric_view < (Q1 - 1.5 * IQR)
above_fence = numeric_view > (Q3 + 1.5 * IQR)
# A row is flagged as soon as one of its columns lies outside the fences.
outliers = (below_fence | above_fence).any(axis=1)

print(f'Number of outliers detected by boxplot method: {outliers.sum()}')
print(f'Percentage of outliers: {outliers.sum()/len(df)*100:.2f}%')

df_clean[outliers].shape

In [None]:
from scipy.stats import yeojohnson, boxcox
from scipy.special import boxcox1p

# Re-check the skewness, largest first
skewness = df_clean.skew().sort_values(ascending=False)

# Keep only the features whose |skew| is still above 0.75
skewed_features = skewness[skewness.abs() > 0.75].index

print("Number of skewed features before re-transform:", len(skewed_features))

# Apply a transform that fits the value range of each column
for col in skewed_features:
    column = df_clean[col]
    if column.le(0).any():
        # Contains zero or negative values -> Yeo-Johnson (fitted lambda discarded)
        df_clean[col], _ = yeojohnson(column)
    else:
        # Strictly positive -> Box-Cox of (1 + x) with the fixed lambda 0.15
        df_clean[col] = boxcox1p(column, 0.15)

# Skewness of the same features after the re-transform
print("Skewness after re-transform:")
print(df_clean[skewed_features].skew().sort_values(ascending=False))

In [None]:
import pandas as pd
import numpy as np

# --- feature lists ---
features_to_drop = ['MS Zoning_RL', 'Roof Style_Gable']  # very skewed + many outliers
features_to_clip = [
    'Exter Cond', 'BsmtFin Type 2', 'BsmtFin SF 2', 'Fireplace Qu', 'Enclosed Porch',
    'MS Zoning_RM', 'Land Contour_Lvl', 'Lot Config_Corner', 'Neighborhood_NAmes',
    'Condition 1_Norm', 'Bldg Type_1Fam', 'House Style_1.5Fin', 'Roof Style_Hip',
    'Exterior 1st_HdBoard', 'Exterior 1st_MetalSd', 'Exterior 1st_Wd Sdng',
    'Exterior 2nd_HdBoard', 'Exterior 2nd_MetalSd', 'Exterior 2nd_Wd Sdng',
    'Foundation_BrkTil', 'Sale Type_WD ', 'Sale Condition_Normal', 'GarageArea_per_Car',
    'Bath_per_Bedroom', 'LotArea_per_GrLivArea', 'OverallQual_x_TotalSF'
]

# --- drop (columns that are already gone are ignored) ---
df_clean_dropped = df_clean.drop(columns=features_to_drop, errors='ignore')

# --- clip every listed column that exists to its 1st..99th percentile range ---
for col in (name for name in features_to_clip if name in df_clean_dropped.columns):
    lower, upper = df_clean_dropped[col].quantile([0.01, 0.99])
    df_clean_dropped[col] = df_clean_dropped[col].clip(lower, upper)

# --- re-check the outliers ---
def outlier_stats(df):
    """Print an IQR-rule outlier summary for the numeric columns of *df*.

    A value counts as an outlier when it lies below Q1 - 1.5*IQR or above
    Q3 + 1.5*IQR of its own column. Three lines are printed: the total number
    of outlying cells, their share of all numeric cells in percent, and the
    shape of *df*. Nothing is returned.

    A frame without rows or without numeric columns reports 0 outliers and
    0.00% instead of failing with a division by zero.
    """
    numeric = df.select_dtypes(include=np.number)
    n_cells = numeric.shape[0] * numeric.shape[1]

    if n_cells:
        q1 = numeric.quantile(0.25)
        q3 = numeric.quantile(0.75)
        iqr = q3 - q1
        # Column-wise fences, compared against every cell at once.
        flagged = (numeric < (q1 - 1.5 * iqr)) | (numeric > (q3 + 1.5 * iqr))
        total_outliers = int(flagged.to_numpy().sum())
        perc_outliers = total_outliers / n_cells * 100
    else:
        total_outliers = 0
        perc_outliers = 0.0

    print(f"Number of outliers detected by boxplot method: {total_outliers}")
    print(f"Percentage of outliers: {perc_outliers:.2f}%")
    print(df.shape)

# Print the IQR-rule outlier summary for the dropped-and-clipped frame
outlier_stats(df_clean_dropped)

In [None]:
# Absolute z-score of every value in the num_cols columns
numeric_view = df_clean_dropped[num_cols]
z_score = ((numeric_view - numeric_view.mean()) / numeric_view.std()).abs()
# `.any` flags a row as soon as ONE of its columns exceeds 3 standard deviations
outliers = z_score.gt(3).any(axis=1)
df_clean_dropped[outliers].shape
# df[~outliers].shape  # rows that are not outliers

In [None]:
# Fill the gaps left in numeric columns with the column median. The result is
# assigned back: `frame[col].fillna(..., inplace=True)` acts on an intermediate
# Series (chained assignment) and does not update the frame once Copy-on-Write
# is active.
for col in df_clean_dropped.select_dtypes(include=["float64", "int64"]).columns:
    df_clean_dropped[col] = df_clean_dropped[col].fillna(df_clean_dropped[col].median())

In [None]:
# Total number of missing cells left in the final frame
df_clean_dropped.isna().sum().sum()

## 🔹 Step 10: Remove Duplicates

In [None]:
# Number of duplicated rows in the final frame
num_duplicates = df_clean_dropped.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")

# Drop the duplicates from df_clean_dropped itself. Reading from df_clean here
# would replace the frame and discard the percentile clipping and the NaN
# imputation done in the previous cells before the data is saved.
df_clean_dropped = df_clean_dropped.drop_duplicates()
print(f"Shape after dropping duplicates: {df_clean_dropped.shape}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Boxplots of every numeric column of the final frame, `batch_size` per figure
numeric_cols = df_clean_dropped.select_dtypes(include=[np.number]).columns.tolist()

batch_size = 5
for i in range(0, len(numeric_cols), batch_size):
    cols_batch = numeric_cols[i:i+batch_size]

    # One row per column; squeeze=False keeps `axes` 2-D even for a single row.
    fig, axes = plt.subplots(len(cols_batch), 1,
                             figsize=(12, len(cols_batch) * 1.5), squeeze=False)
    for ax, col in zip(axes[:, 0], cols_batch):
        sns.boxplot(x=df_clean_dropped[col], color="skyblue", ax=ax)
        ax.set_title(f"Boxplot - {col}")
    fig.tight_layout()
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Main settings
target_col = 'SalePrice'   # name of the target column
batch_size = 5             # number of plots per figure

# Numeric columns except the target, sorted with names that start with a
# letter first and names that start with a digit last.
numeric_cols = sorted(
    (name for name in df_clean_dropped.select_dtypes(include=np.number).columns if name != target_col),
    key=lambda name: (str(name)[0].isdigit(), str(name).lower()),
)

# Draw the scatterplots batch by batch
for start in range(0, len(numeric_cols), batch_size):
    cols_batch = numeric_cols[start:start + batch_size]

    fig, axes = plt.subplots(len(cols_batch), 1,
                             figsize=(12, len(cols_batch) * 3), squeeze=False)
    for ax, col in zip(axes[:, 0], cols_batch):
        sns.scatterplot(x=df_clean_dropped[col], y=df_clean_dropped[target_col], alpha=0.6, color='teal', edgecolor=None, ax=ax)
        ax.set_title(f"Scatterplot: {col} vs {target_col}")
        ax.set_xlabel(col)
        ax.set_ylabel(target_col)
    fig.tight_layout()
    plt.show()

In [None]:
import metrics_helper

# `import metrics_helper` binds only the module name, so the bare call
# `metrics_plot(...)` raises NameError; call it through the module.
# NOTE(review): presumably metrics_plot is defined in metrics_helper — confirm.
metrics_helper.metrics_plot(df_clean_dropped)

## 💾 Step 11: Save Cleaned Dataset

In [None]:
# Write the final cleaned and engineered frame to disk (row index omitted).
output_path = "AmesHousing_clean_by_Arianshs.csv"
df_clean_dropped.to_csv(output_path, index=False)
print("✅ Cleaned dataset saved successfully!")
