In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#load data
data = pd.read_csv(r'C:\Users\WhiteDevil\Desktop\datascience_libraries\Project1\csv_file\Bengaluru_House_Data.csv')
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
#Copy original data into df
df = data.copy()

# DATA CLEANING

In [4]:
#Check number of null values in dataset
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [5]:
#sociey attribute has 42.323 % null values, so drop society column
df = df.drop(['society'], axis = 1)

In [6]:
#drop rows containing null values
df = df.dropna(axis = 0, subset=['location','balcony'])
#Chech number of null values
df.isnull().sum()

area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [7]:
#extracting info from size feature
#method 1
df['bhk']=[int(x.split(" ")[0]) for x in df['size']]
df.drop(['size'], axis = 1, inplace = True)
df.head()

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,2600,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,1440,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,1200,2.0,1.0,51.0,2


In [8]:
#extracting info from size feature
#method 2
'''
a = df['size']
b = []
for x in a:
    b.append(int(x.split()[0]))
z = pd.DataFrame(b)
z
'''

"\na = df['size']\nb = []\nfor x in a:\n    b.append(int(x.split()[0]))\nz = pd.DataFrame(b)\nz\n"

In [9]:
#extracting info from size feature
#method 3
# df['size'].apply(lambda x : int(x.split(" ")[0]))

In [10]:
df['bhk'].unique()

array([ 2,  4,  3,  1,  6,  8,  7,  5, 11,  9, 27, 43, 14, 12, 10, 13],
      dtype=int64)

In [11]:
#function of converting into float
def float_con(x):
    try:
        float(x)
    except:
        return False
    return True

In [12]:
df[~df['total_sqft'].apply( float_con)].head(10)

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,bhk
30,Super built-up Area,19-Dec,Yelahanka,2100 - 2850,4.0,0.0,186.0,4
122,Super built-up Area,18-Mar,Hebbal,3067 - 8156,4.0,0.0,477.0,4
137,Super built-up Area,19-Mar,8th Phase JP Nagar,1042 - 1105,2.0,0.0,54.005,2
165,Super built-up Area,18-Dec,Sarjapur,1145 - 1340,2.0,0.0,43.49,2
188,Super built-up Area,Ready To Move,KR Puram,1015 - 1540,2.0,0.0,56.8,2
410,Super built-up Area,Ready To Move,Kengeri,34.46Sq. Meter,1.0,0.0,18.5,1
549,Super built-up Area,18-Sep,Hennur Road,1195 - 1440,2.0,0.0,63.77,2
661,Super built-up Area,Ready To Move,Yelahanka,1120 - 1145,2.0,0.0,48.13,2
672,Built-up Area,18-Mar,Bettahalsoor,3090 - 5002,4.0,0.0,445.0,4
772,Super built-up Area,20-Dec,Banashankari Stage VI,1160 - 1195,2.0,0.0,59.935,2


In [13]:
#function of converting range value and other data type value into float
def con_sqft_float(x):
    tokens = x.split("-")
    if len(tokens)==2:
        a = (float(tokens[0]) + float(tokens[1]))/2
        return (a)
    try:
        return float(x)
    except:
        return None

In [14]:
#apply con_sqft_float() function
df['total_sqft'] = df['total_sqft'].apply(con_sqft_float)


In [15]:
df.head()

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,1440.0,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,1200.0,2.0,1.0,51.0,2


In [16]:
#outlier removal from location column
df.location = df.location.apply(lambda x: x.strip())
a = df.groupby('location')['location'].agg('count').sort_values(ascending = False)
less_than_10 = a[a<=10]


In [17]:
df['location'] = df['location'].apply(lambda x: 'other' if x in less_than_10 else x)
df.location.value_counts()

other                        2739
Whitefield                    515
Sarjapur  Road                372
Electronic City               302
Kanakpura Road                261
                             ... 
Pattandur Agrahara             11
Marsur                         11
2nd Phase Judicial Layout      11
Prithvi Layout                 11
Nagasandra                     11
Name: location, Length: 238, dtype: int64

In [18]:
#find price per square fit for outlier removal
df['price_sqr_fit'] = df['price']*100000/df['total_sqft']
df.tail(20)

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,bhk,price_sqr_fit
13297,Super built-up Area,Ready To Move,Electronic City,1060.0,2.0,1.0,52.0,2,4905.660377
13298,Super built-up Area,Ready To Move,Kenchenahalli,1015.0,2.0,2.0,60.0,2,5911.330049
13299,Super built-up Area,18-Dec,Whitefield,2856.0,5.0,0.0,154.5,4,5409.663866
13300,Plot Area,Ready To Move,Hosakerehalli,1500.0,6.0,2.0,145.0,5,9666.666667
13301,Super built-up Area,Ready To Move,Kothanur,1454.0,3.0,3.0,71.5,3,4917.469051
13302,Super built-up Area,Ready To Move,other,1075.0,2.0,2.0,48.0,2,4465.116279
13303,Plot Area,Ready To Move,Vidyaranyapura,774.0,5.0,3.0,70.0,5,9043.927649
13304,Super built-up Area,Ready To Move,Raja Rajeshwari Nagar,1187.0,2.0,2.0,40.14,2,3381.634372
13305,Carpet Area,Ready To Move,Hulimavu,500.0,1.0,3.0,220.0,1,44000.0
13307,Built-up Area,Ready To Move,Billekahalli,1805.0,3.0,3.0,134.0,3,7423.822715


In [19]:
#drop availability
df.drop(['availability'], axis = 1, inplace = True)

In [20]:
#remove outlier rows with one bhk size less than 300
df1 = df[~(df['total_sqft']/df['bhk'] < 300)]
df1.shape

(12055, 8)

In [21]:
df1.tail(20)

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk,price_sqr_fit
13295,Super built-up Area,Haralur Road,1810.0,3.0,2.0,112.0,3,6187.845304
13296,Super built-up Area,Cox Town,1200.0,2.0,2.0,140.0,2,11666.666667
13297,Super built-up Area,Electronic City,1060.0,2.0,1.0,52.0,2,4905.660377
13298,Super built-up Area,Kenchenahalli,1015.0,2.0,2.0,60.0,2,5911.330049
13299,Super built-up Area,Whitefield,2856.0,5.0,0.0,154.5,4,5409.663866
13300,Plot Area,Hosakerehalli,1500.0,6.0,2.0,145.0,5,9666.666667
13301,Super built-up Area,Kothanur,1454.0,3.0,3.0,71.5,3,4917.469051
13302,Super built-up Area,other,1075.0,2.0,2.0,48.0,2,4465.116279
13304,Super built-up Area,Raja Rajeshwari Nagar,1187.0,2.0,2.0,40.14,2,3381.634372
13305,Carpet Area,Hulimavu,500.0,1.0,3.0,220.0,1,44000.0


In [22]:
df1.price_sqr_fit.describe()

count     12013.000000
mean       6206.082347
std        3985.518807
min         267.829813
25%        4199.363057
50%        5252.525253
75%        6823.529412
max      176470.588235
Name: price_sqr_fit, dtype: float64

In [23]:
#remove outlier from price_sqr_fit by selecting one standard deviation data
def rem_outlier(df1):
    df_out = pd.DataFrame()
    for key,sub_df1 in df1.groupby('location'):
        m_value = np.mean(sub_df1.price_sqr_fit)
        std_dev_val = np.std(sub_df1.price_sqr_fit)
        new_df1 = sub_df1[(sub_df1.price_sqr_fit > (m_value - std_dev_val)) & (sub_df1.price_sqr_fit <= (m_value + std_dev_val))]
        df_out = pd.concat([df_out,new_df1],ignore_index=True)
    return df_out

In [24]:
#apply rem_outlier function on dataset
df2 = rem_outlier(df1)

In [25]:
df2.shape

(9848, 8)

In [26]:
df2.head(40)

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk,price_sqr_fit
0,Super built-up Area,1st Block Jayanagar,2850.0,4.0,1.0,428.0,4,15017.54386
1,Super built-up Area,1st Block Jayanagar,1630.0,3.0,2.0,194.0,3,11901.840491
2,Super built-up Area,1st Block Jayanagar,1875.0,2.0,3.0,235.0,3,12533.333333
3,Built-up Area,1st Block Jayanagar,1200.0,2.0,0.0,130.0,3,10833.333333
4,Super built-up Area,1st Block Jayanagar,1235.0,2.0,2.0,148.0,2,11983.805668
5,Super built-up Area,1st Block Jayanagar,2750.0,4.0,0.0,413.0,4,15018.181818
6,Super built-up Area,1st Block Jayanagar,2450.0,4.0,2.0,368.0,4,15020.408163
7,Super built-up Area,1st Phase JP Nagar,2825.0,4.0,3.0,250.0,4,8849.557522
8,Super built-up Area,1st Phase JP Nagar,1875.0,3.0,1.0,167.0,3,8906.666667
9,Super built-up Area,1st Phase JP Nagar,2065.0,4.0,1.0,210.0,3,10169.491525


In [27]:
#remove records having no. of bathroom > no. of bhk
df3 = df2[~(df2['bath']>df2['bhk'])]

In [28]:
df3.shape

(9343, 8)

In [29]:
a = df.groupby('area_type')

In [30]:
#encode categorial feature using one hot encoding
area_encode = pd.get_dummies(df3['area_type'],prefix = 'area', drop_first= True)
area_encode

Unnamed: 0,area_Carpet Area,area_Plot Area,area_Super built-up Area
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,0
4,0,0,1
...,...,...,...
9843,0,0,0
9844,0,0,1
9845,0,1,0
9846,0,0,1


In [31]:
#encode categorial feature using one hot encoding
loc_encode = pd.get_dummies(df3['location'],prefix = 'loc', drop_first= True)
loc_encode

Unnamed: 0,loc_1st Phase JP Nagar,loc_2nd Phase Judicial Layout,loc_2nd Stage Nagarbhavi,loc_5th Phase JP Nagar,loc_6th Phase JP Nagar,loc_7th Phase JP Nagar,loc_8th Phase JP Nagar,loc_9th Phase JP Nagar,loc_AECS Layout,loc_Abbigere,...,loc_Vishveshwarya Layout,loc_Vishwapriya Layout,loc_Vittasandra,loc_Whitefield,loc_Yelachenahalli,loc_Yelahanka,loc_Yelahanka New Town,loc_Yelenahalli,loc_Yeshwanthpur,loc_other
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9843,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9844,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9845,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9846,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [32]:
#concat original dataframe and encoded sub_dataframe
df3 = pd.concat([area_encode,loc_encode,df3], axis = 1)

In [33]:
df3.head()

Unnamed: 0,area_Carpet Area,area_Plot Area,area_Super built-up Area,loc_1st Phase JP Nagar,loc_2nd Phase Judicial Layout,loc_2nd Stage Nagarbhavi,loc_5th Phase JP Nagar,loc_6th Phase JP Nagar,loc_7th Phase JP Nagar,loc_8th Phase JP Nagar,...,loc_Yeshwanthpur,loc_other,area_type,location,total_sqft,bath,balcony,price,bhk,price_sqr_fit
0,0,0,1,0,0,0,0,0,0,0,...,0,0,Super built-up Area,1st Block Jayanagar,2850.0,4.0,1.0,428.0,4,15017.54386
1,0,0,1,0,0,0,0,0,0,0,...,0,0,Super built-up Area,1st Block Jayanagar,1630.0,3.0,2.0,194.0,3,11901.840491
2,0,0,1,0,0,0,0,0,0,0,...,0,0,Super built-up Area,1st Block Jayanagar,1875.0,2.0,3.0,235.0,3,12533.333333
3,0,0,0,0,0,0,0,0,0,0,...,0,0,Built-up Area,1st Block Jayanagar,1200.0,2.0,0.0,130.0,3,10833.333333
4,0,0,1,0,0,0,0,0,0,0,...,0,0,Super built-up Area,1st Block Jayanagar,1235.0,2.0,2.0,148.0,2,11983.805668


In [34]:
#drop categorial data after encoding
df3.drop(['price_sqr_fit','area_type','location'], axis = 1, inplace = True)

In [35]:
df3.shape

(9343, 245)

# Feature Scaling

In [36]:
#import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

In [37]:
scaling = MinMaxScaler()

In [38]:
#scalinng total_sqft within range 0-1
df3['Scaled_total_sqft'] = scaling.fit_transform(df3[['total_sqft']])

In [39]:
df3.head()

Unnamed: 0,area_Carpet Area,area_Plot Area,area_Super built-up Area,loc_1st Phase JP Nagar,loc_2nd Phase Judicial Layout,loc_2nd Stage Nagarbhavi,loc_5th Phase JP Nagar,loc_6th Phase JP Nagar,loc_7th Phase JP Nagar,loc_8th Phase JP Nagar,...,loc_Yelahanka New Town,loc_Yelenahalli,loc_Yeshwanthpur,loc_other,total_sqft,bath,balcony,price,bhk,Scaled_total_sqft
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,2850.0,4.0,1.0,428.0,4,0.084718
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1630.0,3.0,2.0,194.0,3,0.044186
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1875.0,2.0,3.0,235.0,3,0.052326
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1200.0,2.0,0.0,130.0,3,0.0299
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1235.0,2.0,2.0,148.0,2,0.031063


In [40]:
#drop original after scaling
df3.drop(['total_sqft'], axis=1)

Unnamed: 0,area_Carpet Area,area_Plot Area,area_Super built-up Area,loc_1st Phase JP Nagar,loc_2nd Phase Judicial Layout,loc_2nd Stage Nagarbhavi,loc_5th Phase JP Nagar,loc_6th Phase JP Nagar,loc_7th Phase JP Nagar,loc_8th Phase JP Nagar,...,loc_Yelahanka,loc_Yelahanka New Town,loc_Yelenahalli,loc_Yeshwanthpur,loc_other,bath,balcony,price,bhk,Scaled_total_sqft
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,4.0,1.0,428.00,4,0.084718
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,3.0,2.0,194.00,3,0.044186
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,2.0,3.0,235.00,3,0.052326
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2.0,0.0,130.00,3,0.029900
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,2.0,2.0,148.00,2,0.031063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9843,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,2.0,1.0,65.00,2,0.031761
9844,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,2.0,2.0,110.00,2,0.034983
9845,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1.0,0.0,26.00,1,0.017010
9846,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,2.0,2.0,63.93,3,0.037874


In [41]:
Y = df3['price']
Y.shape

(9343,)

In [42]:
X = df3.drop(['price'],axis = 1)

In [43]:
X.shape

(9343, 245)

In [44]:
'''
from sklearn.decomposition import PCA
pca = PCA(n_components=150)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data = principalComponents)
'''

'\nfrom sklearn.decomposition import PCA\npca = PCA(n_components=150)\nprincipalComponents = pca.fit_transform(X)\nprincipalDf = pd.DataFrame(data = principalComponents)\n'

# Model Creation

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

In [46]:
#split data into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2)

In [47]:
#instance of Model
regressor = LinearRegression()
#fit model on training data
regressor.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [48]:
#make prediction using trained model
y_prid = regressor.predict(X_test)
y_prid = pd.Series(y_prid)

In [49]:
#Chack accuracy of model
regressor.score(X_test,Y_test)

0.7186841178440313

In [50]:
#make prediction using cross validation
from sklearn.model_selection import ShuffleSplit, KFold
from sklearn.model_selection import cross_val_score

In [51]:
cv = ShuffleSplit(n_splits= 7, test_size = 0.2)

In [52]:
a = cross_val_score(LinearRegression(),X,Y, cv = cv)

In [53]:
a

array([0.78623038, 0.80750021, 0.7885705 , 0.84773067, 0.80265278,
       0.6983883 , 0.79896107])