In [72]:
import numpy as np
import pandas as pd

In [73]:
def score(y_truth, y_pred):
    """Return the coefficient of determination (R^2) of ``y_pred`` against ``y_truth``.

    The value is ``1 - u / v`` where ``u`` is the sum of squared residuals
    ``(y_truth - y_pred)`` and ``v`` is the sum of squared deviations of
    ``y_truth`` from its own mean. Both inputs must support element-wise
    arithmetic and the ``.sum()`` / ``.mean()`` methods (e.g. NumPy arrays).
    """
    residuals = y_truth - y_pred
    deviations = y_truth - y_truth.mean()

    residual_ss = (residuals ** 2).sum()
    total_ss = (deviations ** 2).sum()

    return 1 - residual_ss / total_ss

In [130]:
def pred(x_test, m, c):
    """Predict targets for every row of ``x_test`` with the linear model ``x . m + c``.

    Parameters
    ----------
    x_test : array-like, shape (M, N) or (M,)
        One row per sample. A 1-D input is treated as M samples with a
        single feature.
    m : array-like
        Coefficients, one per feature.
    c : scalar
        Intercept added to every prediction.

    Returns
    -------
    numpy.ndarray
        Float array of shape (M,) holding one prediction per row.
    """
    features = np.asarray(x_test, dtype=float)
    if features.ndim == 1:
        # Single-feature input: make it a column so each entry is one sample.
        features = features.reshape(-1, 1)
    weights = np.asarray(m, dtype=float)

    # Row-wise weighted sum, computed for all rows at once instead of a
    # Python-level loop over samples.
    return (features * weights).sum(axis=1) + c
    

In [4]:
def cost(x, y, params):
    """Mean squared error of the linear model ``x @ params`` against ``y``.

    Parameters
    ----------
    x : array-like, shape (M, N)
        Feature matrix, one row per sample. There is no separate intercept
        term here, so a bias must be represented as a constant column in ``x``.
    y : array-like, length M
        Target values (flattened to 1-D before use).
    params : array-like, length N
        Model coefficients, one per column of ``x``.

    Returns
    -------
    float
        ``(1/M) * sum_i (y[i] - x[i] . params) ** 2``.

    Raises
    ------
    ValueError
        If ``x`` contains no samples (the mean would be undefined).
    """
    features = np.asarray(x, dtype=float)
    # Flatten so a column-vector target cannot broadcast into an (M, M) matrix.
    targets = np.asarray(y, dtype=float).ravel()
    weights = np.asarray(params, dtype=float)

    M = len(features)
    if M == 0:
        raise ValueError("cost() requires at least one sample")

    residuals = targets - features @ weights
    return (residuals ** 2).sum() / M

In [5]:
def step_gradient(x, y, learning_rate, params):
    """Perform one gradient-descent update of ``params`` for the MSE cost.

    The gradient of ``(1/M) * sum_i (y[i] - x[i] . params) ** 2`` with respect
    to ``params[j]`` is ``(-2/M) * sum_i (y[i] - x[i] . params) * x[i][j]``;
    in matrix form that is ``(-2/M) * x.T @ (y - x @ params)``.

    Parameters
    ----------
    x : array-like, shape (M, N)
        Feature matrix, one row per sample.
    y : array-like, length M
        Target values (flattened to 1-D before use).
    learning_rate : float
        Step size applied to the gradient.
    params : array-like, length N
        Current coefficients.

    Returns
    -------
    numpy.ndarray
        Updated coefficients, ``params - learning_rate * gradient``.

    Raises
    ------
    ValueError
        If ``x`` contains no samples.
    """
    features = np.asarray(x, dtype=float)
    targets = np.asarray(y, dtype=float).ravel()
    weights = np.asarray(params, dtype=float)

    M = len(features)
    if M == 0:
        raise ValueError("step_gradient() requires at least one sample")

    # Residuals are computed once per step; the nested-loop form recomputed
    # the same prediction for every feature of every sample.
    residuals = targets - features @ weights
    params_slope = (-2 / M) * (features.T @ residuals)

    return weights - learning_rate * params_slope


In [6]:
def gd(x, y, learning_rate, num_iterations, verbose=True):
    """Fit linear-model coefficients by batch gradient descent.

    Parameters
    ----------
    x : 2-D array-like, shape (M, N)
        Feature matrix; ``len(x[0])`` is taken as the number of features.
    y : array-like, length M
        Target values.
    learning_rate : float
        Step size passed to ``step_gradient``.
    num_iterations : int
        Number of gradient steps to take.
    verbose : bool, optional
        When true (the default) print the iteration index and current cost
        after every step. Pass ``False`` to avoid flooding the output.

    Returns
    -------
    numpy.ndarray
        Coefficient vector of length N. It is an all-zero array when
        ``num_iterations`` is 0.
    """
    N = len(x[0])  # no. of features (cols)

    # Start from a float array rather than a Python list so the return type
    # is the same whether or not any iteration runs.
    params = np.zeros(N)

    for i in range(num_iterations):
        params = step_gradient(x, y, learning_rate, params)
        if verbose:
            print(i," Cost : ",cost(x,y,params))

    return params
    

In [144]:
def run_train(x, y, learning_rate=0.1, num_iterations=100):
    """Train the linear model on ``x`` / ``y`` and return its parameters.

    A column of ones is appended as the last column of ``x`` so that the
    last fitted parameter acts as the intercept; the preceding parameters
    are the per-feature coefficients.

    Parameters
    ----------
    x : 2-D array, shape (M, N)
        Feature matrix without a bias column.
    y : array-like, length M
        Target values.
    learning_rate : float, optional
        Gradient-descent step size (default 0.1).
    num_iterations : int, optional
        Number of gradient-descent steps (default 100).

    Returns
    -------
    The value returned by ``gd``: N coefficients followed by the intercept.
    """
    bias_column = np.ones(len(x)).reshape(-1, 1)
    x_with_bias = np.append(x, bias_column, axis=1)

    return gd(x_with_bias, y, learning_rate, num_iterations)


In [8]:
def run_test(m, c):
    """Load ``test.csv`` and return predictions for it using ``pred``.

    The file is read as comma-delimited numeric data and passed to ``pred``
    unchanged (no scaling is applied here), together with the coefficients
    ``m`` and the intercept ``c``.
    """
    return pred(np.loadtxt('test.csv', delimiter=','), m, c)


In [9]:
def get_scores(params, x_train, x_test, y_train, y_test):
    """Score a fitted parameter vector on the train and test sets.

    ``params`` is split into coefficients (every entry but the last) and an
    intercept (the last entry). Predictions from ``pred`` are then evaluated
    with ``score`` on each data set.

    Returns
    -------
    tuple
        ``(train_score, test_score)``.
    """
    # Last entry is the intercept, everything before it the coefficients.
    m, c = params[:-1], params[-1]

    train_score = score(y_train, pred(x_train, m, c))
    test_score = score(y_test, pred(x_test, m, c))

    return train_score, test_score


In [12]:
from sklearn import preprocessing

In [119]:
from sklearn import model_selection

# Load the labelled data: every column but the last is a feature, the last
# column is the target.
data = np.loadtxt('train.csv',delimiter=',')
X = data[:, :-1]
Y = data[:, -1]

# Fixed seed so the split, and therefore the scores printed below, are the
# same on every re-run of the notebook.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# train without feature scaling

params = run_train(X_train,Y_train)
train_score,test_score = get_scores(params,X_train,X_test,Y_train,Y_test)
print("train score : ", train_score," ********* test_score : ", test_score)



0  Cost :  470.18389657187873
1  Cost :  379.01194156389033
2  Cost :  309.02668843283504
3  Cost :  253.45512906268243
4  Cost :  208.99632531293463
5  Cost :  173.33731983198294
6  Cost :  144.69070312113263
7  Cost :  121.64579598914203
8  Cost :  103.08312483191156
9  Cost :  88.11209026828281
10  Cost :  76.02286408148642
11  Cost :  66.24877181281107
12  Cost :  58.336741265483454
13  Cost :  51.924022040482136
14  Cost :  46.719801723513825
15  Cost :  42.49065711835591
16  Cost :  39.04901708810501
17  Cost :  36.243996240455274
18  Cost :  33.954099376920865
19  Cost :  32.081405339019206
20  Cost :  30.546923159006138
21  Cost :  29.286878950854636
22  Cost :  28.24974307687776
23  Cost :  27.39384708411926
24  Cost :  26.685471233759195
25  Cost :  26.097308071374073
26  Cost :  25.607226886448988
27  Cost :  25.197279228352254
28  Cost :  24.852897767123935
29  Cost :  24.562250397210043
30  Cost :  24.315719115006736
31  Cost :  24.105479273914746
32  Cost :  23.9251596600

270  Cost :  21.76339146163688
271  Cost :  21.763127697343382
272  Cost :  21.762867518870856
273  Cost :  21.762610874637623
274  Cost :  21.762357713899753
275  Cost :  21.762107986734193
276  Cost :  21.761861644022698
277  Cost :  21.76161863743594
278  Cost :  21.761378919417968
279  Cost :  21.761142443171288
280  Cost :  21.76090916264203
281  Cost :  21.760679032505763
282  Cost :  21.760452008153386
283  Cost :  21.76022804567755
284  Cost :  21.76000710185932
285  Cost :  21.75978913415518
286  Cost :  21.759574100684386
287  Cost :  21.759361960216488
288  Cost :  21.759152672159384
289  Cost :  21.758946196547317
290  Cost :  21.75874249402953
291  Cost :  21.758541525858984
292  Cost :  21.758343253881165
293  Cost :  21.758147640523543
294  Cost :  21.75795464878496
295  Cost :  21.757764242225424
296  Cost :  21.757576384956042
297  Cost :  21.757391041629216
298  Cost :  21.757208177428993
299  Cost :  21.757027758061927
300  Cost :  21.756849749747474
301  Cost :  21.

540  Cost :  21.744095298151485
541  Cost :  21.744087955806798
542  Cost :  21.74408070959427
543  Cost :  21.7440735582551
544  Cost :  21.744066500546893
545  Cost :  21.744059535243558
546  Cost :  21.74405266113513
547  Cost :  21.744045877027364
548  Cost :  21.74403918174179
549  Cost :  21.74403257411533
550  Cost :  21.7440260530001
551  Cost :  21.744019617263277
552  Cost :  21.74401326578694
553  Cost :  21.744006997467704
554  Cost :  21.744000811216704
555  Cost :  21.74399470595935
556  Cost :  21.74398868063506
557  Cost :  21.74398273419721
558  Cost :  21.743976865612865
559  Cost :  21.743971073862593
560  Cost :  21.743965357940333
561  Cost :  21.74395971685327
562  Cost :  21.743954149621405
563  Cost :  21.74394865527781
564  Cost :  21.74394323286805
565  Cost :  21.743937881450247
566  Cost :  21.74393260009491
567  Cost :  21.743927387884607
568  Cost :  21.743922243914017
569  Cost :  21.743917167289656
570  Cost :  21.743912157129675
571  Cost :  21.74390721

813  Cost :  21.743549833342076
814  Cost :  21.74354963225042
815  Cost :  21.74354943379136
816  Cost :  21.743549237930452
817  Cost :  21.74354904463365
818  Cost :  21.743548853867406
819  Cost :  21.743548665598603
820  Cost :  21.743548479794512
821  Cost :  21.743548296422865
822  Cost :  21.74354811545185
823  Cost :  21.743547936850046
824  Cost :  21.743547760586402
825  Cost :  21.743547586630324
826  Cost :  21.743547414951582
827  Cost :  21.743547245520418
828  Cost :  21.743547078307355
829  Cost :  21.743546913283364
830  Cost :  21.743546750419778
831  Cost :  21.743546589688346
832  Cost :  21.74354643106112
833  Cost :  21.74354627451059
834  Cost :  21.743546120009558
835  Cost :  21.74354596753114
836  Cost :  21.743545817048943
837  Cost :  21.743545668536783
838  Cost :  21.743545521968862
839  Cost :  21.743545377319737
840  Cost :  21.743545234564294
841  Cost :  21.74354509367775
842  Cost :  21.74354495463565
843  Cost :  21.74354481741379
844  Cost :  21.74

1083  Cost :  21.743534910601
1084  Cost :  21.743534904871076
1085  Cost :  21.74353489921616
1086  Cost :  21.743534893635292
1087  Cost :  21.743534888127463
1088  Cost :  21.743534882691762
1089  Cost :  21.74353487732719
1090  Cost :  21.74353487203288
1091  Cost :  21.743534866807874
1092  Cost :  21.74353486165126
1093  Cost :  21.743534856562142
1094  Cost :  21.743534851539696
1095  Cost :  21.743534846582957
1096  Cost :  21.74353484169115
1097  Cost :  21.74353483686336
1098  Cost :  21.74353483209876
1099  Cost :  21.743534827396548
1100  Cost :  21.743534822755887
1101  Cost :  21.743534818176006
1102  Cost :  21.743534813656073
1103  Cost :  21.7435348091953
1104  Cost :  21.74353480479293
1105  Cost :  21.743534800448188
1106  Cost :  21.743534796160326
1107  Cost :  21.743534791928624
1108  Cost :  21.743534787752296
1109  Cost :  21.743534783630647
1110  Cost :  21.743534779562967
1111  Cost :  21.74353477554853
1112  Cost :  21.743534771586646
1113  Cost :  21.7435347

1353  Cost :  21.743534485390867
1354  Cost :  21.74353448522763
1355  Cost :  21.743534485066487
1356  Cost :  21.743534484907478
1357  Cost :  21.74353448475053
1358  Cost :  21.74353448459563
1359  Cost :  21.74353448444278
1360  Cost :  21.74353448429192
1361  Cost :  21.743534484143016
1362  Cost :  21.7435344839961
1363  Cost :  21.743534483851086
1364  Cost :  21.743534483707972
1365  Cost :  21.74353448356674
1366  Cost :  21.74353448342734
1367  Cost :  21.74353448328981
1368  Cost :  21.743534483154036
1369  Cost :  21.743534483020042
1370  Cost :  21.743534482887817
1371  Cost :  21.743534482757315
1372  Cost :  21.74353448262852
1373  Cost :  21.743534482501406
1374  Cost :  21.74353448237598
1375  Cost :  21.743534482252173
1376  Cost :  21.743534482130002
1377  Cost :  21.74353448200942
1378  Cost :  21.74353448189042
1379  Cost :  21.74353448177299
1380  Cost :  21.74353448165706
1381  Cost :  21.743534481542678
1382  Cost :  21.74353448142982
1383  Cost :  21.7435344813

In [126]:

# Re-train and re-score on the X_train / X_test / Y_train / Y_test split that
# an earlier cell left in the kernel; this cell does not create the split itself.
params = run_train(X_train,Y_train)
train_score,test_score = get_scores(params,X_train,X_test,Y_train,Y_test)
print("train score : ", train_score," ********* test_score : ", test_score)

0  Cost :  370.6379255764444
1  Cost :  241.51032308209676
2  Cost :  162.03788992627602
3  Cost :  112.14582836578347
4  Cost :  80.57902379598704
5  Cost :  60.481966095013036
6  Cost :  47.61056592395741
7  Cost :  39.31590908079784
8  Cost :  33.93420977763237
9  Cost :  30.414879883143566
10  Cost :  28.091500688820133
11  Cost :  26.539647113662287
12  Cost :  25.48803044018831
13  Cost :  24.762649401480065
14  Cost :  24.25152078028014
15  Cost :  23.882316758418032
16  Cost :  23.608137212980115
17  Cost :  23.398430810766744
18  Cost :  23.23318642789274
19  Cost :  23.099208029111704
20  Cost :  22.987720341444493
21  Cost :  22.89282651334105
22  Cost :  22.81051234861816
23  Cost :  22.738001856101896
24  Cost :  22.67333902572263
25  Cost :  22.615115549025294
26  Cost :  22.56229287316766
27  Cost :  22.51408535681687
28  Cost :  22.46988309738265
29  Cost :  22.429200587469072
30  Cost :  22.39164224481828
31  Cost :  22.356879010716334
32  Cost :  22.324632246097014
33

274  Cost :  21.74402074290887
275  Cost :  21.744008052469603
276  Cost :  21.74399569323346
277  Cost :  21.743983656555567
278  Cost :  21.7439719340167
279  Cost :  21.743960517417424
280  Cost :  21.743949398772468
281  Cost :  21.743938570304934
282  Cost :  21.74392802444105
283  Cost :  21.74391775380472
284  Cost :  21.743907751212454
285  Cost :  21.743898009668197
286  Cost :  21.743888522358635
287  Cost :  21.743879282648265
288  Cost :  21.743870284074838
289  Cost :  21.74386152034475
290  Cost :  21.74385298532873
291  Cost :  21.743844673057456
292  Cost :  21.743836577717488
293  Cost :  21.74382869364707
294  Cost :  21.74382101533228
295  Cost :  21.743813537403124
296  Cost :  21.743806254629728
297  Cost :  21.743799161918815
298  Cost :  21.743792254309987
299  Cost :  21.743785526972314
300  Cost :  21.743778975200996
301  Cost :  21.743772594414047
302  Cost :  21.743766380148998
303  Cost :  21.74376032805995
304  Cost :  21.7437544339144
305  Cost :  21.74374

In [102]:
def train_and_test_algorithm(test_size=0.3, random_state=None):
    """Compare training with and without feature scaling on a split of ``train.csv``.

    The data in ``train.csv`` is split once into train and test parts. The
    model is trained and scored on the raw features, then trained and scored
    again on standardized features, and both pairs of scores are printed.

    Parameters
    ----------
    test_size : float, optional
        Fraction of rows held out for testing (default 0.3).
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps
        the split random; pass an integer for a reproducible comparison.
    """
    # Imported locally so the function does not depend on another notebook
    # cell having imported these modules first.
    from sklearn import model_selection, preprocessing

    # splitting data in train.csv for (algorithm testing)
    data = np.loadtxt('train.csv',delimiter=',')
    X = data[:, :-1]
    Y = data[:, -1]

    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # train without feature scaling
    params = run_train(X_train, Y_train)
    train_score, test_score = get_scores(params, X_train, X_test, Y_train, Y_test)

    print("***************************")

    # train with feature scaling  (to see if we get better predictions)
    # The scaler is fitted on the training part only and then applied to both.
    scaler = preprocessing.StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    params = run_train(X_train_scaled, Y_train)

    scaled_train_score, scaled_test_score = get_scores(
        params, X_train_scaled, X_test_scaled, Y_train, Y_test
    )

    # comparing scores of training with and without feature scaling
    print("train score : ", train_score," ********* test_score : ", test_score)

    print("scaled train score : ", scaled_train_score,"******** scaled test_score : ", scaled_test_score)


In [103]:
# Run the scaled-vs-unscaled comparison; it prints the cost log of both
# training runs followed by the two pairs of scores.
train_and_test_algorithm()

0  Cost :  488.3403032189098
1  Cost :  394.3972364876695
2  Cost :  321.92370082384764
3  Cost :  264.3220645661006
4  Cost :  218.25990882066367
5  Cost :  181.3467041947057
6  Cost :  151.72190209312933
7  Cost :  127.91467757810166
8  Cost :  108.75777919683738
9  Cost :  93.32311679358386
10  Cost :  80.87163132498723
11  Cost :  70.81395421389546
12  Cost :  62.67944636888842
13  Cost :  56.09178574704552
14  Cost :  50.749685467650565
15  Cost :  46.41163919897625
16  Cost :  42.883833417406116
17  Cost :  40.010554270384944
18  Cost :  37.66656281293709
19  Cost :  35.75102596296132
20  Cost :  34.18267901128708
21  Cost :  32.89596459441311
22  Cost :  31.837947049864624
23  Cost :  30.965843385493763
24  Cost :  30.24504529826706
25  Cost :  29.647532780883246
26  Cost :  29.15060040994907
27  Cost :  28.735833623565952
28  Cost :  28.388285106931498
29  Cost :  28.09581154250432
30  Cost :  27.84853901655561
31  Cost :  27.638431752246834
32  Cost :  27.458943909798712
33  C

75  Cost :  25.830238976285752
76  Cost :  25.820352548532274
77  Cost :  25.810775000930562
78  Cost :  25.801495286982775
79  Cost :  25.792502851358627
80  Cost :  25.78378759588852
81  Cost :  25.775339849555248
82  Cost :  25.76715034184871
83  Cost :  25.75921017896066
84  Cost :  25.751510822387758
85  Cost :  25.744044069587254
86  Cost :  25.736802036389932
87  Cost :  25.72977714092628
88  Cost :  25.72296208886185
89  Cost :  25.716349859772762
90  Cost :  25.70993369451892
91  Cost :  25.70370708349641
92  Cost :  25.697663755667886
93  Cost :  25.69179766828698
94  Cost :  25.686102997243985
95  Cost :  25.68057412797191
96  Cost :  25.67520564685946
97  Cost :  25.669992333126558
98  Cost :  25.664929151122255
99  Cost :  25.660011243011237
100  Cost :  25.655233921819114
101  Cost :  25.650592664809864
102  Cost :  25.646083107172373
103  Cost :  25.64170103599549
104  Cost :  25.63744238451254
105  Cost :  25.63330322659935
106  Cost :  25.62927977151038
107  Cost :  25

# ** analysis : - On applying feature scaling there is either an infinitesimally small improvement in predictions or the predictions remain the same **



# training on train.csv dataset and testing on test.csv with feature scaling

In [140]:
# Labelled training data: the last column is the target, the rest are features.
data = np.loadtxt("train.csv", delimiter=",")
x_train, y_train = data[:, :-1], data[:, -1]

# test.csv is used whole as the feature matrix to predict on.
x_test = np.loadtxt("test.csv", delimiter=",")

# applying feature scaling: the scaler is fitted on the training features only
# and the same transformation is then applied to both sets
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)
x_train_scaled, x_test_scaled = scaler.transform(x_train), scaler.transform(x_test)

param = run_train(x_train_scaled, y_train)

0  Cost :  469.02446650514304
1  Cost :  380.92989505635626
2  Cost :  312.4240324605638
3  Cost :  257.49642447661637
4  Cost :  213.17698851621807
5  Cost :  177.3639160776441
6  Cost :  148.408732497813
7  Cost :  124.98958743086321
8  Cost :  106.04159079877762
9  Cost :  90.70584175442674
10  Cost :  78.28923930110103
11  Cost :  68.23228452370836
12  Cost :  60.083193063818904
13  Cost :  53.477058144362545
14  Cost :  48.11906882274004
15  Cost :  43.770987933478295
16  Cost :  40.24025201887581
17  Cost :  37.371181475144304
18  Cost :  35.037889883311756
19  Cost :  33.13856218594673
20  Cost :  31.590836061308536
21  Cost :  30.32807274985738
22  Cost :  29.29634526193516
23  Cost :  28.452005378322973
24  Cost :  27.759717773901304
25  Cost :  27.190871248143736
26  Cost :  26.722294473696603
27  Cost :  26.33521770747713
28  Cost :  26.014433213796536
29  Cost :  25.747616260160655
30  Cost :  25.52477589222711
31  Cost :  25.337810619067465
32  Cost :  25.180148919917944
3

In [141]:
# extracting m and c from param: the last entry is the intercept c (the weight
# of the ones column that run_train appends), the entries before it are m
m, c = param[:-1], param[-1]

# prediction for data in "test.csv" ---> (final predictions)
y_test_pred = pred(x_test_scaled, m, c)

In [143]:
# Write the final predictions to disk, one value per line with 8 decimal places.
np.savetxt('boston_predictions.csv',y_test_pred,fmt="%.8f")