In [15]:
import pandas as pd
from linear_regression_numpy import *

In [16]:
housing=pd.read_csv('housing.csv')
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [17]:
median_of_total_bedrooms=housing['total_bedrooms'].median()
housing['total_bedrooms'] = housing['total_bedrooms'].fillna(median_of_total_bedrooms)


In [18]:
corr_matrix = housing.select_dtypes(include=['number']).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.688075
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049457
population           -0.024650
longitude            -0.045967
latitude             -0.144160
Name: median_house_value, dtype: float64

In [19]:
housing['rooms_per_house']=housing['total_rooms']/housing['households']
housing['bedrooms_per_house']=housing['total_bedrooms']/housing['households']
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["rooms_per_person"] = housing["total_rooms"]/housing["population"]

corr_matrix = housing.select_dtypes(include=['number']).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.688075
rooms_per_person      0.209482
rooms_per_house       0.151948
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049457
population           -0.024650
bedrooms_per_house   -0.045637
longitude            -0.045967
latitude             -0.144160
bedrooms_per_room    -0.233303
Name: median_house_value, dtype: float64

In [20]:
housing = pd.get_dummies(housing, columns=['ocean_proximity'])

housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   longitude                   20640 non-null  float64
 1   latitude                    20640 non-null  float64
 2   housing_median_age          20640 non-null  float64
 3   total_rooms                 20640 non-null  float64
 4   total_bedrooms              20640 non-null  float64
 5   population                  20640 non-null  float64
 6   households                  20640 non-null  float64
 7   median_income               20640 non-null  float64
 8   median_house_value          20640 non-null  float64
 9   rooms_per_house             20640 non-null  float64
 10  bedrooms_per_house          20640 non-null  float64
 11  bedrooms_per_room           20640 non-null  float64
 12  rooms_per_person            20640 non-null  float64
 13  ocean_proximity_<1H OCEAN   206

In [21]:
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(train_set.info())

<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 12432 to 3648
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   longitude                   16512 non-null  float64
 1   latitude                    16512 non-null  float64
 2   housing_median_age          16512 non-null  float64
 3   total_rooms                 16512 non-null  float64
 4   total_bedrooms              16512 non-null  float64
 5   population                  16512 non-null  float64
 6   households                  16512 non-null  float64
 7   median_income               16512 non-null  float64
 8   median_house_value          16512 non-null  float64
 9   rooms_per_house             16512 non-null  float64
 10  bedrooms_per_house          16512 non-null  float64
 11  bedrooms_per_room           16512 non-null  float64
 12  rooms_per_person            16512 non-null  float64
 13  ocean_proximity_<1H OCEAN   16512

In [22]:
train_features = train_set.drop('median_house_value', axis=1)
train_labels = train_set['median_house_value'].copy()

In [23]:
for col in train_features.columns:
    mean = train_features[col].mean()
    std = train_features[col].std()
    train_features[col] = (train_features[col] - mean) / std

In [24]:
train_features.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_house,bedrooms_per_house,bedrooms_per_room,rooms_per_person,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12432,1.717687,-0.982184,-0.923947,-0.09942,0.675396,1.369652,0.481046,-1.166674,-0.767369,0.19117,1.915222,-1.01928,-0.894876,1.471774,-0.015566,-0.353904,-0.382959
10536,0.924225,-1.000979,-1.877763,-0.440193,-0.675851,-0.71967,-0.704407,2.468388,0.759347,0.004936,-0.95878,0.663208,1.117406,-0.679411,-0.015566,-0.353904,-0.382959
9659,-0.797438,2.894264,1.06317,0.129141,0.09288,-0.218198,-0.158681,-0.909927,0.49761,0.405092,-0.250596,0.439089,-0.894876,1.471774,-0.015566,-0.353904,-0.382959
6718,0.69966,-0.700261,0.586262,0.628113,0.262382,-0.080625,0.340045,2.017219,0.380027,-0.136793,-0.798792,0.902325,1.117406,-0.679411,-0.015566,-0.353904,-0.382959
6610,0.729602,-0.676767,1.460594,-0.21577,-0.44189,-0.466715,-0.36496,1.057696,0.241847,-0.232834,-0.78756,0.381639,-0.894876,1.471774,-0.015566,-0.353904,-0.382959


In [25]:
w_init = np.zeros(train_features.shape[1] )
b_init = 0.0

w, b, J_hist, w_hist = gradient_descent(X=train_features,y= train_labels,w_in= w_init, b_in=b_init, cost_function=cost, gradients=gradients, alpha=0.4, num_iters=1000,lambda_=1)


Iteration    0: Cost 148432.97   
Iteration  100: Cost 66684.35   
Iteration  200: Cost 66539.75   
Iteration  300: Cost 66520.35   
Iteration  400: Cost 66517.71   
Iteration  500: Cost 66517.36   
Iteration  600: Cost 66517.31   
Iteration  700: Cost 66517.30   
Iteration  800: Cost 66517.30   
Iteration  900: Cost 66517.30   
Iteration  999: Cost 66517.30   
gradient_descent ran in:3.793572425842285 sec


In [26]:
try_data=train_features.iloc[:10]
try_labels=train_labels.iloc[:10]

print(predict(try_data,w,b))
print(try_labels)

[ 29134.47795198 382200.96087298    593.74333193 407765.44280037
 283788.58809824 155178.90772704 196865.42709322 179428.7651959
 184583.14627356 143907.83918671]
12432    100000.0
10536    397700.0
9659      44000.0
6718     500001.0
6610     300900.0
9082     171900.0
2624     119900.0
17703    177500.0
10139    159500.0
8252     147500.0
Name: median_house_value, dtype: float64


In [27]:
test_labels = test_set['median_house_value'].copy()
test_features = test_set.drop('median_house_value', axis=1)


for col in test_features.columns:
    mean = test_features[col].mean()
    std = test_features[col].std()
    test_features[col] = (test_features[col] - mean) / std


In [46]:
pred_labels=predict(train_features,w,b)

rmse=(((pred_labels-train_labels)**2).sum()/len(pred_labels))**0.5
mae= (((pred_labels-train_labels).abs()).sum()/len(pred_labels))

print(rmse)
print(mae)

66517.29971261448
47771.03000671249


In [44]:
pred_labels=predict(test_features,w,b)

rmse= (((pred_labels-test_labels)**2).sum()/len(pred_labels))**0.5
mae= (((pred_labels-test_labels).abs()).sum()/len(pred_labels))
print(rmse)
print(mae)

66119.617499746
48276.01871513647


In [30]:
print(w)

[-60266.62655304 -62326.28846976  13369.14289714   -437.19000927
   8511.96371559 -22134.59882505  19522.59301558  74601.52974347
  -3417.92191192 -26164.46097322  22119.94699679  38396.06938991
   5690.4066364  -10116.52843141   2902.51141378   1589.17797632
   3981.83359957]


In [43]:
import time
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

lin_reg = LinearRegression()

init_time=time.perf_counter()
lin_reg.fit(train_features, train_labels)
final_time=time.perf_counter()

print('scikit-learn took '+str(final_time-init_time)+' sec')

housing_predictions = lin_reg.predict(train_features)
lin_mse = mean_squared_error(train_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)

lin_mae= mean_absolute_error(train_labels, housing_predictions)

lin_r2 = r2_score(train_labels, housing_predictions)


print(lin_rmse)
print(lin_mae)
print(lin_r2)

scikit-learn took 0.02993138599958911 sec
66517.29970735802
47771.05040115144
0.6691052072790604


In [45]:
test_predictions=lin_reg.predict(test_features)
test_mse=mean_squared_error(test_labels, test_predictions)
test_rmse=np.sqrt(test_mse)
test_mae= mean_absolute_error(test_labels, test_predictions)

test_r2 = r2_score(test_labels, test_predictions)


print(test_rmse)
print(test_mae)
print(test_r2)


66119.63679951767
48276.074156452
0.6659916653567507
