In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')

In [5]:
data= pd.read_csv('./datasets/train.csv')
col_with_null = ['Mall_Nearest_Distance', 'Mall_Within_2km', 'Mall_Within_1km', 'Hawker_Within_2km', 'Hawker_Within_1km', 'Mall_Within_500m', 'Hawker_Within_500m']
imputer=SimpleImputer(missing_values=np.NaN, strategy='constant', fill_value=0)
for x in col_with_null:
    data[x]=imputer.fit_transform(data[x].values.reshape(-1,1))

In [6]:
data.select_dtypes(include='object').columns

Index(['Tranc_YearMonth', 'town', 'flat_type', 'block', 'street_name',
       'storey_range', 'flat_model', 'full_flat_type', 'address',
       'residential', 'commercial', 'market_hawker', 'multistorey_carpark',
       'precinct_pavilion', 'postal', 'planning_area', 'mrt_name',
       'bus_stop_name', 'pri_sch_name', 'sec_sch_name'],
      dtype='object')

In [7]:
col = ['town', 'flat_type', 'flat_model', 'full_flat_type', 'planning_area', 'pri_sch_name', 'sec_sch_name']

In [8]:
encoded_data = pd.get_dummies(data, columns=col, drop_first=True)

In [9]:
selected_col = ['floor_area_sqm', 'lease_commence_date', 'mid_storey', 'lower', 'upper',
       'mid', 'floor_area_sqft', 'hdb_age', 'max_floor_lvl',
       'year_completed', '3room_sold', '5room_sold', 'exec_sold', 'Latitude',
       'Hawker_Within_2km', 'mrt_latitude', 'bus_stop_latitude',
       'pri_sch_latitude', 'sec_sch_latitude', 'town_BISHAN',
       'town_BUKIT MERAH', 'flat_type_2 ROOM', 'flat_type_3 ROOM',
       'flat_type_5 ROOM', 'flat_type_EXECUTIVE', 'flat_model_Apartment',
       'flat_model_DBSS', 'flat_model_Maisonette', 'flat_model_New Generation',
       'full_flat_type_3 ROOM Improved', 'full_flat_type_3 ROOM Model A',
       'full_flat_type_3 ROOM New Generation', 'full_flat_type_5 ROOM DBSS',
       'full_flat_type_5 ROOM Improved', 'full_flat_type_EXECUTIVE Apartment',
       'full_flat_type_EXECUTIVE Maisonette', 'planning_area_Bishan',
       'planning_area_Bukit Merah', 'pri_sch_name_Cantonment Primary School',
       'sec_sch_name_Outram Secondary School']

In [10]:
encoded_data_new = encoded_data[selected_col]

In [11]:
X = encoded_data_new
y = data[['resale_price']]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)

In [13]:
lr = LinearRegression()
lr.fit(X_train, y_train)
print(lr.intercept_)
print(lr.coef_)
print('Linear Regression score:',lr.score(X_train,y_train))
lrpred=lr.predict(X_test)
print('Mean Squared Error: ',mean_squared_error(y_test,lrpred))
print('Mean Absolute Error: ',mean_absolute_error(y_test,lrpred))
print('Root Mean Squared Error: ',np.sqrt(mean_squared_error(y_test,lrpred)))
print('Test Score: ',lr.score(X_test, y_test))

[-2564413.16863358]
[[-3.42822164e+12  9.89611450e+02  8.53822894e+02 -9.47716223e+03
   1.11848080e+04  8.53822893e+02  3.18489562e+11 -9.89611508e+02
   3.65007174e+03  7.78540771e+02 -1.49822312e+02 -2.07645366e+02
  -3.94599214e+02 -4.81988634e+05  1.06280359e+04  1.35333327e+06
   4.77505450e+05 -1.18836785e+06 -8.56928486e+05  4.05466256e+04
  -1.73546396e+04 -4.55636432e+04  9.68124426e+03  1.81251108e+04
   4.25171040e+04  1.46494901e+04  1.07466954e+05  3.43331854e+04
   9.84627589e+03 -2.33082782e+04 -1.51015742e+04 -1.67627380e+04
   6.29147316e+04 -2.01720030e+03  1.46494901e+04  3.43331854e+04
   4.05466256e+04 -1.73546396e+04  4.07148385e+04  2.78398990e+04]]
Linear Regression score: 0.8015883079229069
Mean Squared Error:  4032242099.4096074
Mean Absolute Error:  48606.993877737514
Root Mean Squared Error:  63499.937790596356
Test Score:  0.8036483850864166


In [14]:
r2_score(y_test,lrpred)

0.8036483850864166

In [15]:
lr.score(X_train, y_train)

0.8015883079229069

In [16]:
lr.score(X_test, y_test)

0.8036483850864166

In [19]:
-cross_val_score(lr, X_train, y_train, cv=20, scoring = "neg_root_mean_squared_error")

array([65236.28050559, 64797.99313689, 64115.50629148, 64312.59497259,
       64322.55218027, 63067.2957844 , 63392.69464449, 64133.2816785 ,
       64351.18435185, 64168.25257826, 63873.06784208, 63225.26996523,
       62877.58862117, 63005.0271779 , 63634.35059737, 63729.79012593,
       64420.40352657, 62854.28940723, 64067.56953162, 63536.71321308])

--------------------

In [20]:
testdata= pd.read_csv('test.csv')
imputer=SimpleImputer(missing_values=np.NaN, strategy='constant', fill_value=0)
for x in col_with_null:
    testdata[x]=imputer.fit_transform(testdata[x].values.reshape(-1,1))

FileNotFoundError: [Errno 2] No such file or directory: 'test.csv'

In [21]:
test_data_export = testdata['id'].to_frame()

In [24]:
encoded_test_data = pd.get_dummies(testdata, columns=col, drop_first=True)

In [25]:
encoded_test_data_new = encoded_test_data[selected_col]

In [26]:
lrpred=lr.predict(encoded_test_data_new)

In [27]:
test_data_export['Predicted'] = lrpred

In [30]:
test_data_export.columns = ['Id', 'Predicted']

In [32]:
test_data_export.to_csv('test_modified_1.csv')