In [187]:
import pandas as pd
import time
from linear_regression_no_numpy import *

In [163]:
housing=pd.read_csv('housing.csv')
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [164]:
median_of_total_bedrooms=housing['total_bedrooms'].median()
housing['total_bedrooms'] = housing['total_bedrooms'].fillna(median_of_total_bedrooms)

In [165]:
corr_matrix = housing.select_dtypes(include=['number']).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.688075
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049457
population           -0.024650
longitude            -0.045967
latitude             -0.144160
Name: median_house_value, dtype: float64

In [166]:
housing['rooms_per_house']=housing['total_rooms']/housing['households']
housing['bedrooms_per_house']=housing['total_bedrooms']/housing['households']
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["rooms_per_person"] = housing["total_rooms"]/housing["population"]

corr_matrix = housing.select_dtypes(include=['number']).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.688075
rooms_per_person      0.209482
rooms_per_house       0.151948
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049457
population           -0.024650
bedrooms_per_house   -0.045637
longitude            -0.045967
latitude             -0.144160
bedrooms_per_room    -0.233303
Name: median_house_value, dtype: float64

In [167]:
housing = pd.get_dummies(housing, columns=['ocean_proximity'])

housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   longitude                   20640 non-null  float64
 1   latitude                    20640 non-null  float64
 2   housing_median_age          20640 non-null  float64
 3   total_rooms                 20640 non-null  float64
 4   total_bedrooms              20640 non-null  float64
 5   population                  20640 non-null  float64
 6   households                  20640 non-null  float64
 7   median_income               20640 non-null  float64
 8   median_house_value          20640 non-null  float64
 9   rooms_per_house             20640 non-null  float64
 10  bedrooms_per_house          20640 non-null  float64
 11  bedrooms_per_room           20640 non-null  float64
 12  rooms_per_person            20640 non-null  float64
 13  ocean_proximity_<1H OCEAN   206

In [168]:
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(train_set.info())

<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 12432 to 3648
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   longitude                   16512 non-null  float64
 1   latitude                    16512 non-null  float64
 2   housing_median_age          16512 non-null  float64
 3   total_rooms                 16512 non-null  float64
 4   total_bedrooms              16512 non-null  float64
 5   population                  16512 non-null  float64
 6   households                  16512 non-null  float64
 7   median_income               16512 non-null  float64
 8   median_house_value          16512 non-null  float64
 9   rooms_per_house             16512 non-null  float64
 10  bedrooms_per_house          16512 non-null  float64
 11  bedrooms_per_room           16512 non-null  float64
 12  rooms_per_person            16512 non-null  float64
 13  ocean_proximity_<1H OCEAN   16512

In [169]:
train_features = train_set.drop('median_house_value', axis=1)
train_labels = train_set['median_house_value'].copy()

In [170]:
for col in train_features.columns:
    mean = train_features[col].mean()
    std = train_features[col].std()
    train_features[col] = (train_features[col] - mean) / std

In [171]:
train_features.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_house,bedrooms_per_house,bedrooms_per_room,rooms_per_person,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12432,1.717687,-0.982184,-0.923947,-0.09942,0.675396,1.369652,0.481046,-1.166674,-0.767369,0.19117,1.915222,-1.01928,-0.894876,1.471774,-0.015566,-0.353904,-0.382959
10536,0.924225,-1.000979,-1.877763,-0.440193,-0.675851,-0.71967,-0.704407,2.468388,0.759347,0.004936,-0.95878,0.663208,1.117406,-0.679411,-0.015566,-0.353904,-0.382959
9659,-0.797438,2.894264,1.06317,0.129141,0.09288,-0.218198,-0.158681,-0.909927,0.49761,0.405092,-0.250596,0.439089,-0.894876,1.471774,-0.015566,-0.353904,-0.382959
6718,0.69966,-0.700261,0.586262,0.628113,0.262382,-0.080625,0.340045,2.017219,0.380027,-0.136793,-0.798792,0.902325,1.117406,-0.679411,-0.015566,-0.353904,-0.382959
6610,0.729602,-0.676767,1.460594,-0.21577,-0.44189,-0.466715,-0.36496,1.057696,0.241847,-0.232834,-0.78756,0.381639,-0.894876,1.471774,-0.015566,-0.353904,-0.382959
9082,0.69467,-0.437132,-1.32137,1.03189,0.634811,0.67469,0.58288,0.151528,0.542489,0.023173,-0.754548,0.229311,-0.894876,1.471774,-0.015566,-0.353904,-0.382959
2624,-2.224671,2.396199,-0.049615,0.402771,0.231347,0.204282,0.253878,-0.035865,0.188365,-0.070247,-0.509974,0.130559,1.117406,-0.679411,-0.015566,-0.353904,-0.382959
17703,-1.126799,0.793934,-0.526523,-0.272796,-0.322522,0.441261,-0.187403,-0.087959,-0.264948,-0.30542,-0.261667,-0.801523,1.117406,-0.679411,-0.015566,-0.353904,-0.382959
10139,0.834399,-0.798934,-1.639309,-0.237845,0.016484,-0.130329,-0.080347,-0.200647,-0.36865,0.115477,0.656888,-0.278482,1.117406,-0.679411,-0.015566,-0.353904,-0.382959
8252,0.69467,-0.864716,1.22214,-0.125633,0.498731,1.558703,0.447101,-1.028176,-0.774087,0.025089,1.560614,-1.083137,-0.894876,-0.679411,-0.015566,-0.353904,2.611086


In [194]:
w_init = [0.0] * train_features.shape[1] 
b_init = 0.0

X = train_features.values.tolist()  # Convert DataFrame to list of lists
y = train_labels.values.tolist()    # Convert Series to list

init_time=time.time()
w, b, J_hist, w_hist = gradient_descent(X=X,y= y,w_in= w_init, b_in=b_init, cost_function=cost, gradients=gradients, alpha=0.4, num_iters=1000,lambda_=1)
final_time=time.time()

print('gradient descent ran in '+str(final_time-init_time)+' sec')

Iteration    0: Cost 148432.97   
Iteration  100: Cost 66684.35   
Iteration  200: Cost 66539.75   
Iteration  300: Cost 66520.35   
Iteration  400: Cost 66517.71   
Iteration  500: Cost 66517.36   
Iteration  600: Cost 66517.31   
Iteration  700: Cost 66517.30   
Iteration  800: Cost 66517.30   
Iteration  900: Cost 66517.30   
Iteration  999: Cost 66517.30   
gradient descent ran in 36.446109771728516 sec


In [173]:
try_data=(train_features.iloc[:10]).values.tolist()
try_labels=(train_labels.iloc[:10]).values.tolist()

def predict(X, w_in, b_in):
    return [dot(w_in, x) + b_in for x in X]

print(predict(try_data,w,b))
print(try_labels)

[34118.486018798925, 383290.6849731575, 36526.37130544873, 411734.38113119226, 276778.23749662575, 150162.37852976623, 205960.604723653, 177453.37134275484, 187068.7197925495, 147424.36117309774]
[100000.0, 397700.0, 44000.0, 500001.0, 300900.0, 171900.0, 119900.0, 177500.0, 159500.0, 147500.0]


In [174]:
test_labels = (test_set['median_house_value'].copy()).values.tolist()
test_features = (test_set.drop('median_house_value', axis=1))


for col in test_features.columns:
    mean = test_features[col].mean()
    std = test_features[col].std()
    test_features[col] = (test_features[col] - mean) / std

test_features = test_features.values.tolist()


In [192]:
pred_labels=predict(X,w,b)

rmse=(sum((pred_labels[i]-y[i])**2 for i in range(len(pred_labels)))/len(pred_labels))**0.5

print(rmse)

66517.29971261448


In [193]:
pred_labels=predict(test_features,w,b)

rmse=(sum((pred_labels[i]-test_labels[i])**2 for i in range(len(pred_labels)))/len(pred_labels))**0.5

print(rmse)

66119.61749974599


In [191]:
print(w)

[-60266.626553042566, -62326.28846975956, 13369.142897138167, -437.1900092659023, 8511.963715591424, -22134.598825054483, 19522.593015577095, 74601.52974347083, -3417.9219119164745, -26164.46097322143, 22119.946996791357, 38396.069389914024, 5690.4066364041855, -10116.528431409459, 2902.51141378032, 1589.1779763214524, 3981.8335995697057]
