In [20]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [21]:
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv').drop(columns=['store room','floor_category','balcony'])

In [22]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,flat,sector 36,0.82,3.0,2.0,New Property,850.0,0.0,0.0,Low
1,flat,sector 89,0.95,2.0,2.0,New Property,1226.0,1.0,0.0,Low
2,flat,sohna road,0.32,2.0,2.0,New Property,1000.0,0.0,0.0,Low
3,flat,sector 92,1.6,3.0,4.0,Relatively New,1615.0,1.0,1.0,High
4,flat,sector 102,0.48,2.0,2.0,Relatively New,582.0,0.0,0.0,High


In [23]:
df['agePossession'] = df['agePossession'].replace(
    {
        'Relatively New':'new',
        'Moderately Old':'old',
        'New Property' : 'new',
        'Old Property' : 'old',
        'Under Construction' : 'under construction'
    }
)

In [25]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,flat,sector 36,0.82,3.0,2.0,new,850.0,0.0,0.0,Low
1,flat,sector 89,0.95,2.0,2.0,new,1226.0,1.0,0.0,Low
2,flat,sohna road,0.32,2.0,2.0,new,1000.0,0.0,0.0,Low
3,flat,sector 92,1.6,3.0,4.0,new,1615.0,1.0,1.0,High
4,flat,sector 102,0.48,2.0,2.0,new,582.0,0.0,0.0,High


In [26]:
df['property_type'] = df['property_type'].replace({'flat':0,'house':1})

In [27]:
df['luxury_category'] = df['luxury_category'].replace({'Low' : 0 , 'Medium' :1 , 'High':2})

In [28]:
new_df = pd.get_dummies(df , columns= [ 'sector' , 'agePossession'] , drop_first=True)

In [29]:
x = new_df.drop(columns=['price'])
y = new_df['price']

In [30]:
y_log = np.log1p(y)

In [31]:
y_log

0       0.598837
1       0.667829
2       0.277632
3       0.955511
4       0.392042
          ...   
3549    0.314811
3550    1.945910
3551    0.470004
3552    2.803360
3553    1.022451
Name: price, Length: 3554, dtype: float64

In [32]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [33]:
kfold = KFold(n_splits=10 , shuffle=True , random_state=42)
scores= cross_val_score(LinearRegression() , x_scaled , y_log , cv = kfold , scoring='r2')

In [34]:
scores.mean()

0.8512613057405426

In [35]:
scores.std()

0.016992929105286242

In [36]:
lr = LinearRegression()
ridge = Ridge(alpha=0.0001)

In [37]:
lr.fit(x_scaled , y_log)

In [38]:
ridge.fit(x_scaled , y_log)

In [40]:
coef_df = pd.DataFrame(ridge.coef_.reshape(1 ,112) , columns=x.columns).stack().reset_index().drop(columns= ['level_0']).rename(columns = {'level1':'feature' , 0:'coef'})

In [41]:
coef_df

Unnamed: 0,level_1,coef
0,property_type,0.120165
1,bedRoom,0.054002
2,bathroom,0.065119
3,built_up_area,0.210638
4,servant room,0.050946
...,...,...
107,sector_sector 95,-0.025222
108,sector_sector 99,-0.010312
109,sector_sohna road,-0.029515
110,agePossession_old,-0.007900


In [42]:
import statsmodels.api as sm

x_with_const = sm.add_constant(x_scaled)
model = sm.OLS(y_log , x_with_const).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.865
Model:                            OLS   Adj. R-squared:                  0.860
Method:                 Least Squares   F-statistic:                     196.7
Date:                Sat, 14 Oct 2023   Prob (F-statistic):               0.00
Time:                        00:38:25   Log-Likelihood:                 588.22
No. Observations:                3554   AIC:                            -950.4
Df Residuals:                    3441   BIC:                            -252.6
Df Model:                         112                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0464      0.003    299.336      0.0