# <center> CLOUD CLAUSE PROJECT </center>
# <center> Predicting House Prices In Bengaluru </center>

## Load libraries

In [3]:
import re
import pickle
import numpy as np
import pandas as pd

import sklearn
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

import xgboost
import lightgbm

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [13]:
# Locations of the input files.
# NOTE(review): the three PATH_TO_* values are absolute paths on one Windows
# machine, and PATH is not referenced by any of the visible cells — confirm
# before running elsewhere.
PATH = 'Dataset/'
PATH_TO_train_data = 'C:/Users/vigne/OneDrive/Documents/train.csv'
PATH_TO_test_data ='C:/Users/vigne/OneDrive/Documents/test.csv'
PATH_TO_sample_submission ='C:/Users/vigne/OneDrive/sample_submission.xlsx'

In [14]:
# Square feet per one unit of each non-sqft area unit that can follow the number.
_SQFT_PER_UNIT = {
    'Sq. Meter': 10.7639,
    'Sq. Yards': 9.0,
    'Perch': 272.25,
    'Acres': 43560.0,
    'Cents': 435.61545,
    'Guntha': 1089.0,
    'Grounds': 2400.0,
}


def preprocess_total_sqft(my_list):
    """Convert one ``total_sqft`` entry, already split on '-', to square feet.

    Parameters
    ----------
    my_list : list of str, None or NaN
        The pieces obtained by splitting the raw value on '-'.
        A single piece is either a plain number (``['1056']``) or a number
        followed by a unit name (``['34.46Sq. Meter']``); two pieces are the
        bounds of a range (``['1133 ', ' 1384']``).

    Returns
    -------
    float
        The area in square feet. A range yields the mean of its two bounds.
        A missing entry (None or NaN) yields NaN.

    Raises
    ------
    ValueError
        If a single piece contains no number or names an unknown unit, or if
        a range bound is not numeric.
    """
    # Series.str.split leaves missing cells as None/NaN instead of a list.
    if my_list is None or (isinstance(my_list, float) and my_list != my_list):
        return float('nan')

    if len(my_list) != 1:
        # Range such as '1133 - 1384': use the midpoint of the two bounds.
        return (float(my_list[0]) + float(my_list[1])) / 2.0

    text = my_list[0]
    try:
        return float(text)
    except (TypeError, ValueError):
        pass  # not a plain number; fall through to the "<number><unit>" form

    # Everything up to the last digit is the number, the remainder is the unit.
    match = re.fullmatch(r'(.*\d)(\D*)', text)
    if match is None:
        raise ValueError(f"cannot parse total_sqft value: {text!r}")

    area = float(match.group(1))
    unit = match.group(2).strip()
    try:
        factor = _SQFT_PER_UNIT[unit]
    except KeyError:
        raise ValueError(
            f"unknown area unit {unit!r} in total_sqft value {text!r}"
        ) from None
    return float(area * factor)

In [15]:
train_data = pd.read_csv(PATH_TO_train_data)

In [16]:
test_data = pd.read_csv(PATH_TO_test_data)

In [17]:
train_data.shape

(13320, 9)

In [18]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [19]:
train_data.area_type.value_counts()

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64

In [20]:
# Integer code for each of the four area types. The keys contain a double
# space before 'Area', matching the labels listed by value_counts() above.
replace_area_type = {
    'Super built-up  Area': 0,
    'Built-up  Area': 1,
    'Plot  Area': 2,
    'Carpet  Area': 3,
}
train_data['area_type'] = train_data['area_type'].map(replace_area_type)

In [21]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,2,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,1,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,0,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,0,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [22]:
def replace_availabilty(my_string):
    """Map an availability label to an integer code.

    'Ready To Move' -> 0, 'Immediate Possession' -> 1, anything else -> 2.
    """
    known_labels = ('Ready To Move', 'Immediate Possession')
    for code, label in enumerate(known_labels):
        if my_string == label:
            return code
    # Every other value shares one code.
    return 2

In [23]:
# Replace each availability label by the 0/1/2 code from replace_availabilty;
# any value other than the two fixed labels (e.g. '19-Dec') becomes 2.
train_data['availability'] = train_data['availability'].apply(replace_availabilty)

In [24]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,2,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,2,0,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,1,0,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,0,0,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,0,0,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [25]:
# Show the rows that have no location.
train_data[train_data['location'].isna()]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
568,0,0,,3 BHK,Grare S,1600,3.0,2.0,86.0


In [27]:
# Give rows without a location an explicit placeholder label.
train_data['location'] = train_data.location.fillna(value='Location not provided')

In [29]:
# Label-encode the 'size' column (values such as '2 BHK'). Values are cast to
# str first, so a missing size becomes the class 'nan'. The encoder is fitted
# on the train and test values together so both frames share one code table.
train_sizes = train_data['size'].astype('str')
test_sizes = test_data['size'].astype('str')
size_data = pd.concat([train_sizes, test_sizes], ignore_index=True)

size_encoder = LabelEncoder()
size_encoder.fit(size_data)

# Replace the labels by their integer codes in both frames.
train_data['size'] = size_encoder.transform(train_sizes)
test_data['size'] = size_encoder.transform(test_sizes)



In [30]:
size_encoder.classes_

array(['1 BHK', '1 Bedroom', '1 RK', '10 BHK', '10 Bedroom', '11 BHK',
       '11 Bedroom', '12 Bedroom', '13 BHK', '14 BHK', '16 BHK',
       '16 Bedroom', '18 Bedroom', '19 BHK', '2 BHK', '2 Bedroom',
       '27 BHK', '3 BHK', '3 Bedroom', '4 BHK', '4 Bedroom', '43 Bedroom',
       '5 BHK', '5 Bedroom', '6 BHK', '6 Bedroom', '7 BHK', '7 Bedroom',
       '8 BHK', '8 Bedroom', '9 BHK', '9 Bedroom', 'nan'], dtype=object)

In [31]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,2,Electronic City Phase II,14,Coomee,1056,2.0,1.0,39.07
1,2,0,Chikka Tirupathi,20,Theanmp,2600,5.0,3.0,120.0
2,1,0,Uttarahalli,17,,1440,2.0,3.0,62.0
3,0,0,Lingadheeranahalli,17,Soiewre,1521,3.0,1.0,95.0
4,0,0,Kothanur,14,,1200,2.0,1.0,51.0


In [42]:
# Rows without a society name get the placeholder 'Other'.
train_data['society'] = train_data.society.fillna(value='Other')

In [44]:
# The 'society' column is not used as a feature: drop it from both frames.
train_data = train_data.drop(columns=['society'])
test_data = test_data.drop(columns=['society'])

In [45]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,0,2,Electronic City Phase II,14,1056,2.0,1.0,39.07
1,2,0,Chikka Tirupathi,20,2600,5.0,3.0,120.0
2,1,0,Uttarahalli,17,1440,2.0,3.0,62.0
3,0,0,Lingadheeranahalli,17,1521,3.0,1.0,95.0
4,0,0,Kothanur,14,1200,2.0,1.0,51.0


In [46]:
# Split each raw value on '-' (ranges are written 'low - high') and let
# preprocess_total_sqft turn the pieces into a single float.
train_data['total_sqft'] = (
    train_data['total_sqft'].str.split('-').apply(preprocess_total_sqft)
)

In [47]:
train_data['bath'].isna().sum()

73

In [48]:
# Fill each missing 'bath' value with the mean 'bath' of the rows that share
# its location.
column_bath = (
    train_data
    .groupby('location')['bath']
    .transform(lambda group: group.fillna(group.mean()))
)

In [49]:
column_bath[~column_bath.notnull()]

1775   NaN
Name: bath, dtype: float64

In [50]:
# The per-location fill can leave values missing (one row is still NaN in the
# output above) — presumably a location whose 'bath' values are all missing,
# so its group mean is NaN. Fall back to the overall mean for those.
column_bath = column_bath.fillna(column_bath.mean())
column_bath.isna().sum()

0

In [51]:
# Store the imputed values on the frame. Without this write-back
# train_data['bath'] keeps its missing values, because column_bath is a
# separate Series that nothing else assigns to the DataFrame.
train_data['bath'] = column_bath
train_data['bath'].isna().sum()

0

In [52]:
train_data.balcony.isna().sum()

609

In [53]:
train_data.balcony.value_counts()

balcony
2.0    5113
1.0    4897
3.0    1672
0.0    1029
Name: count, dtype: int64

In [54]:
# Impute 'balcony' in two stages: the mean of the rows sharing the location
# first, then the overall mean of the result for anything still missing.
column_balcony = (
    train_data
    .groupby('location')['balcony']
    .transform(lambda group: group.fillna(group.mean()))
)
column_balcony = column_balcony.fillna(column_balcony.mean())

In [55]:
column_balcony.isna().sum()

0

In [56]:
train_data['balcony'] = column_balcony

In [57]:
train_data.head()


Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,0,2,Electronic City Phase II,14,1056.0,2.0,1.0,39.07
1,2,0,Chikka Tirupathi,20,2600.0,5.0,3.0,120.0
2,1,0,Uttarahalli,17,1440.0,2.0,3.0,62.0
3,0,0,Lingadheeranahalli,17,1521.0,3.0,1.0,95.0
4,0,0,Kothanur,14,1200.0,2.0,1.0,51.0


In [59]:
# Label-encode the 'location' column. The encoder is fitted on the train and
# test values together so both frames share one code table.
location_data = pd.concat(
    [train_data['location'], test_data['location']],
    ignore_index=True,
)

location_encoder = LabelEncoder()
location_encoder.fit(location_data)

# Replace the location names by their integer codes in both frames.
for frame in (train_data, test_data):
    frame['location'] = location_encoder.transform(frame['location'])



In [60]:
location_encoder.classes_

array([' Anekal', ' Banaswadi', ' Basavangudi', ..., 'whitefiled',
       'yelahanka, north', 'yettagodi Road'], dtype=object)

In [61]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,0,2,430,14,1056.0,2.0,1.0,39.07
1,2,0,325,20,2600.0,5.0,3.0,120.0
2,1,0,1220,17,1440.0,2.0,3.0,62.0
3,0,0,778,17,1521.0,3.0,1.0,95.0
4,0,0,736,14,1200.0,2.0,1.0,51.0


In [62]:
# The last column is the target; every other column is a feature.
columns = train_data.columns
y_train = train_data[columns[-1]]
X_train = train_data.drop(columns=[columns[-1]])

In [63]:
# NOTE(review): this re-reads the raw test CSV, so the 'size' / 'location'
# encoding and the 'society' drop applied to test_data in earlier cells are
# discarded here — confirm that starting over from the raw file is intended.
test_data = pd.read_csv(PATH_TO_test_data)

In [64]:
test_data.isna().sum()

area_type          0
availability       0
location           0
size               2
society          626
total_sqft         0
bath               7
balcony           69
price           1480
dtype: int64

In [65]:
test_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,Ready To Move,Brookefield,2 BHK,Roeekbl,1225,2.0,2.0,
1,Plot Area,Ready To Move,Akshaya Nagar,9 Bedroom,,2400,9.0,2.0,
2,Plot Area,18-Apr,Hennur Road,4 Bedroom,Saandtt,1650,5.0,2.0,
3,Super built-up Area,Ready To Move,Kodichikkanahalli,3 BHK,Winerri,1322,3.0,1.0,
4,Super built-up Area,Ready To Move,Konanakunte,2 BHK,AmageSa,1161,2.0,1.0,


In [66]:
# Rows whose society is the literal string 'nan'.
# NOTE(review): values that are actually missing are NaN, not the string
# 'nan', so they are not selected by this comparison (the result is empty);
# confirm whether `.isna()` was intended.
test_data[test_data['society'] == 'nan']

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price


In [72]:
# X_test is a second name for the same DataFrame object as test_data (no copy
# is made), so a change made through either name is visible through both.
X_test = test_data
X_test.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,,2,284,14,Roeekbl,1225,2.0,2.0,
1,,2,103,31,,2400,9.0,2.0,
2,,2,534,20,Saandtt,1650,5.0,2.0,
3,,2,721,17,Winerri,1322,3.0,1.0,
4,,2,727,14,AmageSa,1161,2.0,1.0,


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 3 * 3 * 5 * 5 * 3 * 3 * 3 = 6075 combinations, each of
# which is cross-validated on 5 folds by the search below.
params = {
    'min_child_weight': [4, 5, 6],
    'gamma': [0.3, 0.4, 0.5],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'max_depth': [2, 3, 4],
    'n_estimators': [1000, 1500, 2000],
    'learning_rate': [0.01, 0.05, 0.1],
}

# Base estimator; the name `xgb` refers to this regressor, not to the module.
xgb = xgboost.XGBRegressor(nthread=-1)

# Exhaustive search scored by negative mean squared error, with progress output.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Winning estimator and the parameter combination that produced it.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
