In [8]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
matplotlib.rcParams["figure.figsize"] = (20,10)

In [6]:
##load dataset 

In [9]:
df1 = pd.read_csv("bengaluru_house_prices.csv")
df1.head()

FileNotFoundError: [Errno 2] No such file or directory: 'bengaluru_house_prices.csv'

In [None]:
df1.shape

In [None]:
df1.dtypes

In [None]:
## Drop features that are not required to build our model

In [None]:
df2 = df1.drop(['area_type','society','balcony','availability'],axis='columns')
df2.info()

In [None]:
## Begin data Cleaning
df2.isnull().sum()

In [None]:
# handle missing values: drop all missing values
df3 = df2.dropna()

In [None]:
df2.isna().sum()

In [None]:
df3.isna().sum()

In [None]:
#function to split raw values: create a new column bedrooms
# size

In [None]:
# Derive the bedroom count from the leading integer of `size`, i.e. the first
# space-separated token of each entry.
# `assign` returns a new frame instead of writing a column into the result of
# `df2.dropna()`, which avoids pandas' SettingWithCopyWarning and keeps this cell
# safe to re-run.
df3 = df3.assign(bedrooms=df3['size'].apply(lambda size: int(size.split(' ')[0])))

In [None]:
df3

In [None]:
#next we examine the total_sqft feature: make its values in a single unit
df3.total_sqft.unique()

In [None]:
#check total variations in the column
def is_float(x):
    """Try to convert ``x`` with ``float``.

    Returns the converted float when ``float(x)`` succeeds. When it raises
    ``TypeError`` or ``ValueError`` the input is handed back unchanged.
    Despite the name this is not a boolean predicate: callers inspect the
    type of the returned value.
    """
    try:
        as_number = float(x)
    except (ValueError, TypeError):
        as_number = x
    return as_number

In [None]:
# is_object(1384-232)

In [None]:
#  return sqft values that are not float 
df_non_float = df3[df3['total_sqft'].apply(lambda x: isinstance(is_float(x), str))]
df_non_float['total_sqft'].head(10)

In [None]:
def convert_sqft_to_num(x):
    """Convert a raw ``total_sqft`` entry to a single float.

    Accepted inputs:

    * a plain number, given as text or as a numeric value (``"2100"``, ``2100``):
      returned as a float;
    * a range written as ``"low - high"``: the midpoint of the two bounds.

    Returns ``None`` for anything that cannot be interpreted this way
    (e.g. ``"34.46Sq. Meter"``, ``"abc - def"`` or ``None``), so callers can
    filter such rows out with ``notnull()``.
    """
    # Plain numbers first; this also accepts values that are already numeric.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # Only text can still describe a range.
    if not isinstance(x, str):
        return None

    bounds = x.split('-')
    if len(bounds) != 2:
        return None
    try:
        return (float(bounds[0]) + float(bounds[1])) / 2
    except ValueError:
        # Two parts, but at least one of them is not a number.
        return None

In [None]:
df4 = df3.copy()
df4.total_sqft = df4.total_sqft.apply(convert_sqft_to_num)
df4 = df4[df4.total_sqft.notnull()]
df4.head(2)


In [None]:
df4

## Feature Engineering
Add a new feature, price per square foot, which is an important variable in real estate.

In [None]:
df5 = df4.copy()
df5['price_per_sqft'] = df5['price']*100000/df5['total_sqft']
df5.head()

In [None]:
len(df5.location.unique())

In [None]:
location_stats = df5.groupby('location')['location'].agg('count').sort_values(ascending=False)
location_stats

## Examine variables.
We need to apply a dimensionality reduction technique here to reduce the number of distinct locations.

In [None]:
location_stats = df5['location'].value_counts(ascending=False)
location_stats

In [None]:
len(location_stats[location_stats<10])

Any location having fewer than 10 data points should be tagged as an "other" location. This way the number of categories can be reduced by a huge amount. Later on, when we do one-hot encoding, it will help us by producing fewer dummy columns.

In [None]:

# "Rare" means strictly fewer than 10 listings, matching the rule stated in the
# notebook text, this variable's name, and the count computed with
# `location_stats<10`.
location_stats_less_than_10 = location_stats[location_stats<10]
location_stats_less_than_10

In [None]:
df5['location'] = df5['location'].apply(lambda x: "other" if x in location_stats_less_than_10 else x )

In [None]:
df5.location.nunique()

## Outlier Detection and removal : using mean and standard deviation
Data scientists typically have conversations with business managers (who have expertise in a given field). In real estate, they might tell the analyst the typical square footage per bedroom, say 300 sqft (i.e. a 2 BHK apartment is at least 600 sqft). If the dataset has a record of a 400 sqft apartment with 2 bedrooms, it can be removed as an outlier. In the following we will use 300 sqft per bedroom as our minimum.
<!-- df5.price_per_sqft.describe() -->

In [None]:
df5[df5.total_sqft/df5.bedrooms<300].head()

In [None]:
df6 = df5[~(df5.total_sqft/df5.bedrooms<300)]
df6

In our next analysis, we find that the minimum price per sqft is 267 rs/sqft whereas the maximum is 12000000 rs/sqft, which shows a wide variation in property prices. Thus, we remove outliers per location, keeping only the values within one standard deviation of the location's mean.

In [None]:
def remove_pps_outliers(df):
    """Drop per-location ``price_per_sqft`` outliers.

    For every ``location`` group, only the rows whose ``price_per_sqft`` lies in
    the half-open band ``(mean - std, mean + std]`` of that group are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of all groups, concatenated in group order with a
        fresh ``0..n-1`` index. Empty (same columns as ``df``) when ``df`` has
        no groups.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf.price_per_sqft
        m = np.mean(pps)
        st = np.std(pps)
        kept.append(subdf[(pps > (m - st)) & (pps <= (m + st))])

    if not kept:
        # pd.concat raises on an empty list, so handle "no groups" explicitly.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(kept, ignore_index=True)
df7 = remove_pps_outliers(df6)
df7.shape

For a given location, how do the prices of 2-bedroom and 3-bedroom properties compare?


In [None]:
def plot_scatter_chart(df,location):
    """Scatter price against total area for the 2- and 3-bedroom listings of one location.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings with ``location``, ``bedrooms``, ``total_sqft`` and ``price`` columns.
    location : str
        Exact ``location`` value to plot; also used as the chart title.
    """
    # Filter the frame passed by the caller rather than a notebook-level global.
    in_location = df[df.location == location]
    bhk2 = in_location[in_location.bedrooms == 2]
    bhk3 = in_location[in_location.bedrooms == 3]
    # This sets matplotlib's global default figure size, so it also applies to
    # figures created after this call.
    matplotlib.rcParams['figure.figsize'] = (15,10)
    plt.scatter(bhk2.total_sqft,bhk2.price,color='blue',label='2 BHK', s=50)
    plt.scatter(bhk3.total_sqft,bhk3.price,marker='+', color='green',label='3 BHK', s=50)
    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price (Lakh Indian Rupees)")
    plt.title(location)
    plt.legend()
    
plot_scatter_chart(df7,"Rajaji Nagar")

In [None]:
# location b: Hebbal
plot_scatter_chart(df7,"Hebbal")

In [None]:
## check distribution using histograms 
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)
plt.hist(df7.price_per_sqft,rwidth=0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")

In [None]:
# check bathroom distribution
df7.bath.describe()

In [None]:
plt.hist(df7.bath,rwidth=0.8)
plt.xlabel("Number of bathrooms")
plt.ylabel("Count")

In [None]:
## drop non-contributing columns: 

In [None]:
df7.head()

In [None]:
df8 = df7.drop(['size','price_per_sqft'], axis =1)
df8.dtypes

In [None]:

# df9.head()
# One-hot encode `location` directly as 0/1 integers.
# (`columns=` only selects columns when a DataFrame is passed to get_dummies, so
# it has no meaning for this single Series and is not used.)
dummies = pd.get_dummies(df8.location, dtype=int)
converted_dummies = dummies.astype(int)

In [None]:
df9 = pd.concat([df8,converted_dummies],axis='columns')
df9.head()

In [None]:
#drop the location column 
df10 = df9.drop('location',axis=1)
df10.head()

## Data Preparation and Model Building

We will compare five regression algorithms (linear regression, gradient boosting, XGBoost, decision tree and random forest) and select the best using GridSearchCV.

In [None]:
# Split features and target.
# `y` is taken as a 1-D Series rather than a one-column DataFrame: scikit-learn
# estimators expect a 1-D target and warn when given a column vector.
X = df10.drop(columns='price')
y = df10['price']
# print(f"X {X} and Y {y}")
# y

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
best_params = {
    "n_estimators": 200,
    "max_depth": 3,
    "learning_rate": 0.2,
    "loss": "squared_error"
}

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

1. Gradient Boosting model (scikit-learn `GradientBoostingRegressor`)

In [None]:
XGmodel = GradientBoostingRegressor(**best_params)
XGmodel.fit(X_train, y_train)
print(f"First five House predictions: {XGmodel.predict(X_train.head())} and the Score: {XGmodel.score(X_test, y_test)}")
mse = mean_squared_error(y_test, XGmodel.predict(X_test))
print("\n The mean squared error (MSE) on test set: {:.4f}".format(mse))

2. Linear Regression model

In [None]:
from sklearn.linear_model import LinearRegression


lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
mse = mean_squared_error(y_test, lr_clf.predict(X_test))
print(f"First five House predictions: {lr_clf.predict(X_train.head())} and the Model Score is : {lr_clf.score(X_test, y_test)}")
print("\nThe mean squared error (MSE) on test set: {:.4f}".format(mse))

Using shuffle-split cross-validation to measure the scores of the regressor models

In [None]:
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=3, test_size=0.1, random_state=0)

print (f"Linear Regressor Cross-Val Score: {cross_val_score(lr_clf, X, y, cv=cv)}")
print (f"Extreme Gradient Booster Regressor Cross-Val Score: {cross_val_score(XGmodel, X, y, cv=cv)}")

In [None]:
## From the observations above, none of the models could maintain a score of 80 percent across the 3 cross-validation splits

## Using GridSearch for Parameter tuning and Validation: 
We will also try RandomForest, DecisionTree and XGBoost regressors.

In [None]:
# pip install xgboost

In [None]:

def find_best_model_using_gridsearchcv(X, y, random_state=42):
    """Grid-search several regressors and report the best configuration of each.

    Parameters
    ----------
    X : DataFrame or array-like of shape (n_samples, n_features)
        Feature matrix.
    y : Series, DataFrame or array-like
        Target values. A 2-D target with a single column is flattened to 1-D
        before fitting.
    random_state : int, default 42
        Seed passed to the estimators that accept one, so repeated runs give
        the same scores.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with the columns ``model``, ``best_score`` (the
        ``best_score_`` reported by ``GridSearchCV``, using each estimator's
        default scorer) and ``best_params``.
    """
    # scikit-learn expects a 1-D target; a column vector triggers a
    # DataConversionWarning on every fit of the tree-based estimators.
    target = np.asarray(y)
    if target.ndim == 2 and target.shape[1] == 1:
        y = target.ravel()

    algos = {
        'LinearRegression': {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False],
                'copy_X': [True, False]
            }
        },
        'GradientBoosting': {
            'model': GradientBoostingRegressor(),
            'params': {
                'n_estimators': [100, 200],
                'learning_rate': [0.01, 0.1],
                'max_depth': [3, 5],
                # Seeded through the grid so it keeps appearing in best_params.
                'random_state': [random_state]
            }
        },
        'XGBoost': {
            'model': XGBRegressor(random_state=random_state),
            'params': {
                'n_estimators': [100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5]
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(random_state=random_state),
            'params': {
                'splitter': ['best', 'random']
            }
        },
        'RandomForest': {
            'model': RandomForestRegressor(random_state=random_state),
            'params': {
                'n_estimators': [100, 500],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 10],
                'min_samples_leaf': [1, 4]
            }
        }
    }

    scores = []
    # The same three random 80/20 splits are used for every candidate.
    cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0)

    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False, n_jobs=-1)
        gs.fit(X, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
 

In [None]:
   # Call the function
results = find_best_model_using_gridsearchcv(X, y)
    
    # Print results
results.sort_values(by='best_score', ascending =False)


In [None]:
results.sort_values(by='best_score', ascending =False)

In [None]:
def predict_price(location, sqft, bath, bedroom, model=None, feature_columns=None):
    """Predict the price of one property with a fitted regressor.

    The single input row uses the layout the notebook trains on: the numeric
    columns ``total_sqft``, ``bath`` and ``bedrooms`` plus one 0/1 indicator
    column per location.

    Parameters
    ----------
    location : str
        Location name. When it has no indicator column of its own, the
        ``other`` column is used instead.
    sqft : float
        Total area in square feet.
    bath : int
        Number of bathrooms.
    bedroom : int
        Number of bedrooms.
    model : fitted estimator with a ``predict`` method, optional
        Defaults to the notebook-level ``XGmodel``.
    feature_columns : sequence of str, optional
        Feature names in training order. Defaults to ``X.columns``.

    Returns
    -------
    The first element of ``model.predict`` for the constructed row, in the
    same units as the training target.

    Raises
    ------
    KeyError
        If ``feature_columns`` lacks one of the numeric feature names.
    ValueError
        If ``location`` has no indicator column and there is no ``other``
        column to fall back to.
    """
    if model is None:
        model = XGmodel
    if feature_columns is None:
        feature_columns = X.columns
    columns = list(feature_columns)

    numeric_inputs = {'total_sqft': sqft, 'bath': bath, 'bedrooms': bedroom}
    missing = [name for name in numeric_inputs if name not in columns]
    if missing:
        raise KeyError(f"feature columns are missing {missing}")

    if location in columns and location not in numeric_inputs:
        location_column = location
    elif 'other' in columns:
        location_column = 'other'
    else:
        raise ValueError(f"unknown location {location!r} and no 'other' column to fall back to")

    # Start from an all-zero row so every other location indicator stays 0.
    row = dict.fromkeys(columns, 0)
    row.update(numeric_inputs)
    row[location_column] = 1

    # Passing `columns` keeps the features in training order.
    features = pd.DataFrame([row], columns=columns)
    return model.predict(features)[0]

In [None]:
# location = 'Richards Town'
# sqft = 1500
# bath = 2
# bedrooms = 3
# predicted_price = predict_price(location, sqft, bath, bedroom)
# print(f"Predicted price for the property: ${predicted_price:.2f}")


In [None]:
df1.tail()