<a href="https://colab.research.google.com/github/belkaaloulmehdi/projects/blob/main/Copie_de_Demo_Day_Notebook_dse_ft_95_Groupe_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Real Estate & House Price Trends in Bangladesh



# **Data Exploration**

## Libraries import

In [1]:
# Import
import pandas as pd
import numpy as np
import scipy as stats
import seaborn as sns
import matplotlib.pyplot as plt

# Import libraries for Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error

## Data cleaning

In [2]:
# Load the raw listings from the CSV file and preview the first rows
DATASET_PATH = "/content/house_price_bd.csv"
df = pd.read_csv(DATASET_PATH)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/house_price_bd.csv'

In [None]:
# Dataset dimensions, printed as (n_rows, n_columns)
dataset_shape = df.shape
print(dataset_shape)

In [None]:
# Main descriptive statistics (count, mean, std, quartiles, ...) of the numeric columns
summary_stats = df.describe()
summary_stats

**==> We noticed that 75% of the properties have >= 3 bedrooms, >= 3 bathrooms, and a floor_area < 2000 sqft.**

In [None]:
# Number of listings per bedroom count (listings with a missing value are not counted)
df_count = (
    df["Bedrooms"]
    .value_counts()
    .sort_index()
    .rename_axis("Bedrooms")
    .reset_index(name="count")
)
ax = sns.barplot(data=df_count, x="Bedrooms", y="count")
ax.set_xlim(-1, 7)  # only show the first bar positions
plt.show()

In [None]:
# Number of listings per bathroom count (listings with a missing value are not counted)
df_count = (
    df["Bathrooms"]
    .value_counts()
    .sort_index()
    .rename_axis("Bathrooms")
    .reset_index(name="count")
)
ax = sns.barplot(data=df_count, x="Bathrooms", y="count")
ax.set_xlim(-1, 7)  # only show the first bar positions
plt.show()

In [None]:
# Distribution of floor areas: number of properties per 100-wide bin, up to 3000.
# The upper edge is 3001 so that the last bin is (2900, 3000] and areas up to
# 3000 are included.
bins = np.arange(0, 3001, 100)

# Bin the listings themselves (not the distinct floor-area values) so that each
# bar counts properties. Values outside the bins fall out of the count.
# sort=False keeps the bins in ascending order instead of ordering by frequency.
df_count = (
    pd.cut(df["Floor_area"], bins)
    .value_counts(sort=False)
    .rename_axis("Floor_area")
    .reset_index(name="count")
)

plt.figure(figsize=(10, 6))  # 10 inches wide, 6 inches tall
sns.barplot(x="Floor_area", y="count", data=df_count, color="red")

# Label every second bin only, and pass the labels explicitly so that each
# remaining tick keeps the label of its own bin.
tick_positions = np.arange(0, len(df_count), step=2)
tick_labels = df_count["Floor_area"].astype(str).iloc[tick_positions].tolist()
plt.xticks(ticks=tick_positions, labels=tick_labels, rotation=45, ha="right")

plt.xlabel("Floor area (sqft)")
plt.ylabel("Number of properties")
plt.tight_layout()  # make room for the rotated labels
plt.show()

In [None]:
# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly: wrapping it in print() would add a stray "None" line to the output.
df.info()

**==> We noticed that Floor_no and Price_in_taka are stored with the `object` dtype.**

In [None]:
# Frequency table of the values taken by each column of interest
columns_to_inspect = [
    "Bedrooms",
    "Bathrooms",
    "Floor_no",
    "Occupancy_status",
    "Floor_area",
    "City",
    "Price_in_taka",
]
variables_list = df[columns_to_inspect]
for column_name in variables_list.columns:
    print(f'Possible value an doccurence for {column_name} variable are :')
    print(variables_list[[column_name]].value_counts())

**==> We need to clean Floor_no and Price_in_taka.**

In [None]:
# Number of missing values in each column
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Number of rows that are exact copies of an earlier row
duplicate_count = df.duplicated().sum()
duplicate_count

In [None]:
# Cleaning pipeline: de-duplicate, drop unused columns, normalise Floor_no,
# drop incomplete rows, cast dtypes, filter outliers and parse the price.

# Remove duplicated listings (keep the last occurrence) and drop the
# "Title" and "Location" columns
df = df.drop_duplicates(keep="last").drop(columns=["Title", "Location"])

# Delete rows whose Floor_no is one of these values, which correspond to a
# full building rather than a single floor
floor_to_remove = ["Merin City - Purbach", "4th to 8th Backside", "G+7", "0+7", "A1,A2,A3,A4,A5,A6,A7"]
df = df[~df["Floor_no"].isin(floor_to_remove)]

# Rows without Bedrooms/Bathrooms are considered residential plots and/or
# commercial units and are removed. Every column that is cast or parsed below
# must be present, otherwise the integer casts and the price parsing would
# raise on the missing values. The copy makes the later column assignments
# operate on an independent frame instead of a slice of the previous one.
required_columns = ["Bedrooms", "Bathrooms", "Floor_no", "Floor_area", "Price_in_taka"]
df = df.dropna(subset=required_columns).copy()

# Floor_no is kept as text; the ordinal spellings 8th, 1st, 1F and 5th are
# rewritten to the plain floor number
floor_aliases = {"8th": "8", "1st": "1", "1F": "1", "5th": "5"}
df["Floor_no"] = df["Floor_no"].astype(str).replace(floor_aliases)

# Room counts and floor area become integers
df = df.astype({"Bedrooms": int, "Bathrooms": int, "Floor_area": int})

# Keep 200 < Floor_area <= 3000, Bedrooms < 4 and Bathrooms < 5; everything
# else is treated as an outlier.
# NOTE(review): the previous comment mentioned a limit of 6 for bedrooms and
# bathrooms; the thresholds of the code are kept here - confirm which is intended.
is_regular_listing = (
    df["Floor_area"].gt(200)
    & df["Floor_area"].le(3000)
    & df["Bedrooms"].lt(4)
    & df["Bathrooms"].lt(5)
)
df = df[is_regular_listing].copy()

# Strip the ৳ sign and the thousands separators, then convert
# "Price_in_taka" to float64
df["Price_in_taka"] = (
    df["Price_in_taka"]
    .astype(str)
    .str.replace("৳", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(np.float64)
)

# Check cleaned dataset
df.head()

In [None]:
# Count the duplicated rows of the current frame
remaining_duplicates = df.duplicated().sum()
remaining_duplicates

In [None]:
# Frequency table of the values of each feature column
columns_to_check = [
    "Bedrooms",
    "Bathrooms",
    "Floor_no",
    "Floor_area",
    "City",
    "Occupancy_status",
]
variables_list = df[columns_to_check]
for column_name in variables_list.columns:
    print(f'Possible value an doccurence for {column_name} variable are :')
    print(variables_list[[column_name]].value_counts())

In [None]:
# Number of missing values in each column
remaining_missing = df.isna().sum()
remaining_missing

## EDA

In [None]:
# Column names, non-null counts and dtypes of the frame used for the EDA
df.info()

In [None]:
# Bar chart of the price per city (seaborn's bar kind aggregates with the mean).
# The bar order is given explicitly, from the highest to the lowest mean price,
# so it does not depend on the order in which cities appear in the rows.
city_order = (
    df.groupby("City")["Price_in_taka"]
    .mean()
    .sort_values(ascending=False)
    .index
    .tolist()
)

df_sorted = df.sort_values(by='Price_in_taka', ascending=False)
g = sns.catplot(x="City", y="Price_in_taka", data=df_sorted, kind="bar", order=city_order)
g.set_xticklabels(rotation=45, ha='right')  # rotate the city labels
plt.tight_layout()  # adjust the layout automatically
plt.show()

In [None]:
# Price against floor area: scatter points plus a fitted trend line
ax = sns.regplot(
    data=df,
    x="Floor_area",
    y="Price_in_taka",
    scatter=True,
    color="#42A5F5",
    line_kws={'color': 'blue'},
)

# Axis labels and title
ax.set_xlabel('Floor Area')
ax.set_ylabel('Price')
ax.set_title('Scatterplot avec une courbe de tendance')

# Render the figure
plt.show()

# **Machine Learning**

## Preprocessing 🍳

In [None]:
# Split the frame into the feature matrix X and the target vector y
print("Separating target variable from features...")

# Columns used as model inputs
features_list = [
    "Bedrooms",
    "Bathrooms",
    "Floor_no",
    "Floor_area",
    "City",
    "Occupancy_status",
]
# Column the model has to predict
target_name = "Price_in_taka"

X = df[features_list]
y = df[target_name]

print("...Done.")

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle
print("Splitting dataset into train set and test set...")

split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

print("...Done.")

In [None]:
### Training pipeline ###
# Show the first training rows as they are before encoding and scaling
print("--- Training pipeline ---", end="\n\n")

print("#### X_train BEFORE preprocessing ####")
print(X_train.head(), end="\n\n")

In [None]:
# Names of the feature columns stored with the object dtype
object_features = X.select_dtypes(include=object)
categorical_column = object_features.columns
print(categorical_column)

In [None]:
print("Encoding categorical features and standardizing numerical features...")

# Column groups: numeric dtypes on one side, object dtype on the other
numerical_column = X.select_dtypes(include=np.number).columns
categorical_column = X.select_dtypes(include=object).columns

# StandardScaler scales the numeric columns (z-score);
# OneHotEncoder encodes the categorical columns, with handle_unknown='ignore'
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [None]:
# ColumnTransformer applies the one-hot encoder to the categorical columns and
# the scaler to the numerical columns, and concatenates the results in that order
feature_encoder = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_column),
        ('num', numeric_transformer, numerical_column)
        ]
    )

# Fit the preprocessing on the training set only, then replace X_train with
# its encoded version.
# NOTE: this cell overwrites X_train; re-run the train/test split cell before
# running it a second time.
X_train = feature_encoder.fit_transform(X_train)
print("...Done.")

# Show the first encoded rows so that the header below is followed by data
print("#### X_train AFTER preprocessing ####")
print(X_train[:5])
print()


In [None]:
### Testing pipeline ###
# Apply the already-fitted feature_encoder to the test set (transform only, no fit)
print("--- Testing pipeline ---")
print("Standardizing numerical features...")

X_test = feature_encoder.transform(X_test)

print("...Done.")

## Build the model 🏋️‍♂️


In [None]:
# Fit a linear regression on the preprocessed training data
print("Train model...")
regressor = LinearRegression()
regressor = regressor.fit(X_train, y_train)  # fit() returns the estimator itself
print("...Done.")

In [None]:
# Predicted prices for the rows the model was trained on
print("Predictions on training set...")
y_train_pred = regressor.predict(X=X_train)
print("...Done.")

In [None]:
# Number of missing values in each column of the cleaned frame
df.isna().sum()

In [None]:
# Predicted prices for the held-out test rows
print("Predictions on test set...")
y_test_pred = regressor.predict(X=X_test)
print("...Done.")

## Evaluate the model 🌡️

In [None]:
# R^2 of the model on the training set and on the test set
print("--- Assessing the performances of the model ---")

r2_train = regressor.score(X_train, y_train)
r2_test = regressor.score(X_test, y_test)

print("R2 score on training set : ", r2_train)
print("R2 score on test set : ", r2_test)

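Depending on the results, we will be able to tell whether the model performs well and whether it is overfitting or underfitting.

* $R^2$ close to 1 means good performance
* $R^2_{train}$ clearly higher than $R^2_{test}$ means overfitting
* A low $R^2$ on both the training set and the test set means underfitting; $R^2_{train}$ slightly below $R^2_{test}$ is usually just an effect of the random split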

## Feature importance 🥕

In [None]:
# Fitted parameters of the linear model: one coefficient per encoded feature, plus the intercept
model_coefficients = regressor.coef_
model_intercept = regressor.intercept_
print("coefficients are: ", model_coefficients)
print("Constant is: ", model_intercept)

Now that we have the coefficients, we need to know which columns are associated with each one. If you look at `X_train` (or `X_test`), here is what you have:

In [None]:
# Names of the columns produced by the fitted ColumnTransformer, in output order
encoded_feature_names = feature_encoder.get_feature_names_out()
encoded_feature_names

In [None]:
# Show the first encoded training row.
# The earlier bare `X_train[:5]` expression was removed: it was not the last
# statement of the cell, so its value was computed and silently discarded.
print(X_train[0])

But how can we show it in a DataFrame? Well first, we need to use the [`.categories_`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html?highlight=one%20hot%20encoder#sklearn.preprocessing.OneHotEncoder) attribute from [`OneHotEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html?highlight=one%20hot%20encoder#sklearn.preprocessing.OneHotEncoder).

Since we use `ColumnTransformer`, we need to access `OneHotEncoder` using `.transformers_`

In [None]:
# List every fitted transformer held by feature_encoder
fitted_transformers = feature_encoder.transformers_
print("All transformers are: ", fitted_transformers)

# The first entry is the ('cat', ...) step declared in the ColumnTransformer;
# the second item of each entry is the transformer object
first_step = fitted_transformers[0]
print("One Hot Encoder transformer is: ", first_step[1])

Now we can simply check the categories

In [None]:
# Flatten the per-column category arrays of the fitted one-hot encoder into one array
fitted_one_hot_encoder = feature_encoder.transformers_[0][1]
categorical_column_names = np.concatenate(fitted_one_hot_encoder.categories_)
print("Categorical columns are: ", categorical_column_names)

Now we can concatenate them with the numerical column names. We will use the `numerical_column` variable to determine the names of our columns.

In [None]:
# Select the numerical columns of X and read their names from the .columns attribute
numerical_features = X[numerical_column]
numerical_column_names = numerical_features.columns
print("numerical columns are: ", numerical_column_names)

Finally, we need to concatenate our `numerical_column_names` and our `categorical_column_names`. The easiest way to do it is by using [np.append](https://numpy.org/doc/stable/reference/generated/numpy.append.html)

In [None]:
# Categorical names first, numerical names second: same order as the
# transformers declared in feature_encoder
column_name_parts = (categorical_column_names, numerical_column_names)
all_column_names = np.append(*column_name_parts)
all_column_names

Now we can finally rank all columns by importance using coefficients 🥰

In [None]:
# Feature importance
feature_importance = pd.DataFrame({
    "feature_names": feature_encoder.get_feature_names_out(),
    "coefficients":regressor.coef_
})

feature_importance

In [None]:
# Set coefficients to their absolute values so that features are ranked by the
# magnitude of their effect, whatever its sign (the previous statement assigned
# the column to itself and changed nothing)
feature_importance["coefficients"] = feature_importance["coefficients"].abs()

In [None]:
# Bar chart of the features, ordered by decreasing value of the "coefficients" column
ranked_features = feature_importance.sort_values(by="coefficients", ascending=False)
sns.catplot(
    data=ranked_features,
    x="feature_names",
    y="coefficients",
    kind="bar",
    aspect=10.0,  # very wide figure: width is 10x the height
)

In [None]:
# Mean absolute error on the test set, reported in taka and converted to euro
y_pred = regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)

# Conversion rate: 1 € = 132 ৳
euro_to_taka = 132
mae_in_euro = mae / euro_to_taka

print('Mean Absolute Error:', round(mae, 2), '৳')
print('Mean Absolute Error:', round(mae_in_euro, 2), '€')