In [106]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [107]:
# Load the post-feature-selection dataset, then discard the columns that
# this baseline model does not use.
unused_columns = ['store room', 'floor_category', 'balcony', 'bathroom', 'servant room']
df = pd.read_csv('../Data_Cleaning/Data/gurgaon_properties_post_feature_selection_v3.csv')
df = df.drop(columns=unused_columns)

In [108]:
# Preview the first rows right after loading and dropping the unused columns.
df.head(n=5)

Unnamed: 0,property_type,sector,price,bedRoom,agePossession,built_up_area,furnishing_type,luxury_category
0,flat,sector 36,0.82,3.0,New Property,850.0,0.0,Low
1,flat,sector 89,0.95,2.0,New Property,1226.0,0.0,Low
2,flat,sohna road,0.32,2.0,New Property,1000.0,0.0,Low
3,flat,sector 92,1.6,3.0,Relatively New,1615.0,1.0,High
4,flat,sector 102,0.48,2.0,Relatively New,582.0,0.0,High


In [109]:
# Collapse the possession-age labels into coarser buckets
# ('new', 'old', 'under construction'). Labels not listed in the mapping
# are left unchanged by `replace`.
age_buckets = {
    'New Property': 'new',
    'Relatively New': 'new',
    'Moderately Old': 'old',
    'Old Property': 'old',
    'Under Construction': 'under construction',
}
df['agePossession'] = df['agePossession'].replace(age_buckets)

In [110]:
# Preview the frame after the agePossession labels were bucketed.
df.head(n=5)

Unnamed: 0,property_type,sector,price,bedRoom,agePossession,built_up_area,furnishing_type,luxury_category
0,flat,sector 36,0.82,3.0,new,850.0,0.0,Low
1,flat,sector 89,0.95,2.0,new,1226.0,0.0,Low
2,flat,sohna road,0.32,2.0,new,1000.0,0.0,Low
3,flat,sector 92,1.6,3.0,new,1615.0,1.0,High
4,flat,sector 102,0.48,2.0,new,582.0,0.0,High


In [111]:
# Encode the property type as a binary flag: flat -> 0, house -> 1.
property_type_codes = {'flat': 0, 'house': 1}
df['property_type'] = df['property_type'].replace(property_type_codes)

In [112]:
# Preview the frame after property_type was converted to 0/1.
df.head(n=5)

Unnamed: 0,property_type,sector,price,bedRoom,agePossession,built_up_area,furnishing_type,luxury_category
0,0,sector 36,0.82,3.0,new,850.0,0.0,Low
1,0,sector 89,0.95,2.0,new,1226.0,0.0,Low
2,0,sohna road,0.32,2.0,new,1000.0,0.0,Low
3,0,sector 92,1.6,3.0,new,1615.0,1.0,High
4,0,sector 102,0.48,2.0,new,582.0,0.0,High


In [113]:
# Ordinal-encode the luxury tier so that Low < Medium < High maps to 0 < 1 < 2.
luxury_codes = {'Low': 0, 'Medium': 1, 'High': 2}
df['luxury_category'] = df['luxury_category'].replace(luxury_codes)

In [114]:
# Preview the frame after luxury_category was converted to 0/1/2.
df.head(n=5)

Unnamed: 0,property_type,sector,price,bedRoom,agePossession,built_up_area,furnishing_type,luxury_category
0,0,sector 36,0.82,3.0,new,850.0,0.0,0
1,0,sector 89,0.95,2.0,new,1226.0,0.0,0
2,0,sohna road,0.32,2.0,new,1000.0,0.0,0
3,0,sector 92,1.6,3.0,new,1615.0,1.0,2
4,0,sector 102,0.48,2.0,new,582.0,0.0,2


In [115]:
# One-hot encode the remaining string-valued columns. drop_first=True omits
# the first level of each encoded column, so that level acts as the baseline.
dummy_columns = ['sector', 'agePossession']
new_df = pd.get_dummies(df, columns=dummy_columns, drop_first=True)

In [116]:
# Separate the target (price) from the feature matrix.
y = new_df['price']
X = new_df.drop(columns='price')

In [117]:
# Model the target on a log scale: y_log = log(1 + price).
# Predictions on this scale can be mapped back with np.expm1.
y_log = np.log1p(y)

In [118]:
# Display the log-transformed target as a sanity check.
y_log

0       0.598837
1       0.667829
2       0.277632
3       0.955511
4       0.392042
          ...   
3549    0.314811
3550    1.945910
3551    0.470004
3552    2.803360
3553    1.022451
Name: price, Length: 3554, dtype: float64

In [119]:
# Standardise every feature column (zero mean, unit variance) using
# statistics computed on the full feature matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [120]:
# Wrap the scaled array back into a DataFrame. Restore the column names AND
# the original row index: with the default RangeIndex the frame would stop
# lining up with y_log as soon as rows are filtered upstream, which breaks
# index-aligned operations on the pair.
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [121]:
# 10-fold cross-validated R^2 of a linear model on the log-price target.
# The scaler lives inside the pipeline so it is re-fit on each training fold
# only; cross-validating on a matrix that was standardised with statistics
# from all rows would leak validation-fold information into training.
kfold = KFold(n_splits=10, shuffle=True, random_state=20)
cv_model = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
])
scores = cross_val_score(cv_model, X, y_log, cv=kfold, scoring='r2')

In [122]:
# Summarise the cross-validation folds: (mean R^2, standard deviation of R^2).
mean_r2 = scores.mean()
std_r2 = scores.std()
mean_r2, std_r2

(0.8396764828155492, 0.021080445673704262)

In [123]:
# Plain least-squares linear regression with default settings; it is fitted
# in a later cell and its coefficients are then inspected.
lr = LinearRegression()

In [124]:
# Fit on the full standardised feature matrix; this model is used for
# coefficient inspection rather than for out-of-sample evaluation.
lr.fit(X=X_scaled, y=y_log)

In [125]:
# One coefficient per feature column: check how many were learned.
coef_shape = lr.coef_.shape
coef_shape

(110,)

In [126]:
# Pair each feature name with its fitted coefficient. Building the frame
# directly from X.columns and lr.coef_ avoids hard-coding the number of
# features, so the cell keeps working when the feature set changes.
coef_df = pd.DataFrame({'feature': X.columns, 'coef': lr.coef_})

In [127]:
coef_df

Unnamed: 0,feature,coef
0,property_type,0.121879
1,bedRoom,0.106892
2,built_up_area,0.230820
3,furnishing_type,0.015292
4,luxury_category,0.016764
...,...,...
105,sector_sector 95,-0.031896
106,sector_sector 99,-0.011171
107,sector_sohna road,-0.040403
108,agePossession_old,-0.011127


In [128]:
import statsmodels.api as sm

# Refit the same regression with statsmodels to obtain the inferential
# statistics shown by its summary. statsmodels does not add an intercept on
# its own, so a constant column is prepended explicitly.
X_with_const = sm.add_constant(X_scaled)
ols_spec = sm.OLS(endog=y_log, exog=X_with_const)
model = ols_spec.fit()

In [129]:
# Print the plain-text regression report for the fitted OLS model.
ols_summary = model.summary()
print(ols_summary)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.854
Model:                            OLS   Adj. R-squared:                  0.849
Method:                 Least Squares   F-statistic:                     182.6
Date:                Mon, 04 Mar 2024   Prob (F-statistic):               0.00
Time:                        16:52:22   Log-Likelihood:                 446.38
No. Observations:                3554   AIC:                            -670.8
Df Residuals:                    3443   BIC:                             14.75
Df Model:                         110                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const   

In [130]:
# Sample standard deviation (ddof=1, the pandas default) of the log-price target.
y_log.std(ddof=1)

0.5579613263072782

In [131]:
import pickle

In [132]:
# Optional export (left disabled): uncomment to pickle coef_df to 'coef_df.pkl'.
# with open('coef_df.pkl', 'wb') as file:
#     pickle.dump(coef_df, file)