# Linear Regression Practice

Adapted from http://www.ritchieng.com/machine-learning-linear-regression/

## 1. Reading Data using Panda

In [1]:
# conventional way to import pandas
import pandas as pd
import numpy as np

In [2]:
# Reading train and test csv files, merging after that
data = pd.read_csv('data/train.csv', index_col=0)
Y2=data['SalePrice']
data=data.drop(columns=['SalePrice'])
tdata = pd.read_csv('data/test.csv', index_col=0)
frames = [data, tdata]
adata = pd.concat(frames)
adata.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


## 2. Filling lost data

In [3]:
# Searching at each parameter for Nan data 
for var in adata:
        if adata[var].dtype!='O':
            # For int and floats we use the mean
            adata[var].fillna((adata[var].mean()), inplace=True)
        else:
            # For string data we use the mode
            adata[var].fillna((adata[var].mode()[0]), inplace=True)
adata_m=adata
adata_m.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,0,Ex,MnPrv,Shed,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,Grvl,Reg,Lvl,AllPub,FR2,...,0,0,Ex,MnPrv,Shed,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,Grvl,IR1,Lvl,AllPub,Inside,...,0,0,Ex,MnPrv,Shed,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,Grvl,IR1,Lvl,AllPub,Corner,...,0,0,Ex,MnPrv,Shed,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,Grvl,IR1,Lvl,AllPub,FR2,...,0,0,Ex,MnPrv,Shed,0,12,2008,WD,Normal


## 3. Encoding categorical variables and normalization

### 3.1 Encoding categorical variables

In [4]:
# Creating a list with all the categorical variables in the dataset
categorical_var=[]
for var in adata_m:
    if adata_m[var].dtype=='O':
        categorical_var.append(var)
# Using the function get_dummies we use the list "categorical_var" and encode the categorical variables.
adata_m = pd.get_dummies( adata_m, columns = categorical_var )   
adata_m.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,...,0,0,0,1,0,0,0,0,1,0
2,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,...,0,0,0,1,0,0,0,0,1,0
3,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,...,0,0,0,1,0,0,0,0,1,0
4,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,...,0,0,0,1,1,0,0,0,0,0
5,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,0,0,0,1,0,0,0,0,1,0


### 3.2 Normalization

In [5]:
for var in adata_m:
    adata_m[var]=(adata_m[var]-adata_m[var].min())/(adata_m[var].max()-adata_m[var].min())
adata_m.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.235294,0.150685,0.03342,0.666667,0.5,0.949275,0.883333,0.1225,0.125089,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.202055,0.038795,0.555556,0.875,0.753623,0.433333,0.0,0.173281,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.235294,0.160959,0.046507,0.666667,0.5,0.934783,0.866667,0.10125,0.086109,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.294118,0.133562,0.038561,0.666667,0.5,0.311594,0.333333,0.0,0.038271,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.235294,0.215753,0.060576,0.777778,0.5,0.927536,0.833333,0.21875,0.116052,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## 4. Spliting data

### 4.1 Training-Test data

In [6]:
data2 = adata_m.iloc[:data.shape[0],:]
tdata2 = adata_m.iloc[data.shape[0]:,:]

### 4.2 Input-output

Spliting input-output only for training data

In [7]:
Y=(Y2-Y2.min())/(Y2.max()-Y2.min())
Y=Y.iloc[:].values;
Y=np.reshape(Y, (Y.shape[0], 1))
X=data2.iloc[:,:].values;

### 4.2 Training-Testing

In [8]:
ordering= np.arange(X.shape[0])
np.random.shuffle(ordering)

In [9]:
train=ordering[0:int(X.shape[0]*0.8)]
test=ordering[int(X.shape[0]*0.8):X.shape[0]]
X_train=X[train,:]
X_test=X[test,:]
Y_train=Y[train]
Y_test=Y[test]

## 5. Training

In [10]:
############################################
# PENDIENTE
# normal equation : (X'X)^-1.X'Y
############################################

In [11]:
############################################
# BATCH
############################################
alfa=0.0001
epochs=600000
W=np.random.rand(1,X.shape[1])/X.shape[1]
for epoch in range(epochs):
    err=np.transpose(Y_train)-np.matmul(W,np.transpose(X_train))
    dw=alfa*np.matmul(err,X_train)/X_train.shape[0]
    W+=dw
    if epoch%1000==0:
        y_pred=np.matmul(X_test,np.transpose(W))*(Y2.max()-Y2.min())+Y2.min()
        y_test=Y_test*(Y2.max()-Y2.min())+Y2.min()
        err_ep=np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y_test))**2));
        print('Epoca: ',epoch)
        print('--------------')
        print('error: ',err_ep)
print(W)


Epoca:  0
--------------
error:  0.6717293620129493
Epoca:  1000
--------------
error:  0.37037019761539436
Epoca:  2000
--------------
error:  0.3419048061898878
Epoca:  3000
--------------
error:  0.31838638132732144
Epoca:  4000
--------------
error:  0.3001838253154127
Epoca:  5000
--------------
error:  0.28602210715538
Epoca:  6000
--------------
error:  0.2748595395163276
Epoca:  7000
--------------
error:  0.2659189041561681
Epoca:  8000
--------------
error:  0.25862898948833823
Epoca:  9000
--------------
error:  0.2525738903843273
Epoca:  10000
--------------
error:  0.2474525345918254
Epoca:  11000
--------------
error:  0.24304735971369892
Epoca:  12000
--------------
error:  0.23920077322939434
Epoca:  13000
--------------
error:  0.23579789703533713
Epoca:  14000
--------------
error:  0.232754158822827
Epoca:  15000
--------------
error:  0.23000648941241175
Epoca:  16000
--------------
error:  0.2275071321374391
Epoca:  17000
--------------
error:  0.22521930852963098


Epoca:  148000
--------------
error:  0.1767840924279436
Epoca:  149000
--------------
error:  0.17669635266464223
Epoca:  150000
--------------
error:  0.1766095944554301
Epoca:  151000
--------------
error:  0.17652380549467478
Epoca:  152000
--------------
error:  0.17643897366350694
Epoca:  153000
--------------
error:  0.17635508702605088
Epoca:  154000
--------------
error:  0.17627213382579407
Epoca:  155000
--------------
error:  0.17619010248208544
Epoca:  156000
--------------
error:  0.17610898158675536
Epoca:  157000
--------------
error:  0.1760287599008473
Epoca:  158000
--------------
error:  0.17594942635145752
Epoca:  159000
--------------
error:  0.1758709700286736
Epoca:  160000
--------------
error:  0.17579338018260587
Epoca:  161000
--------------
error:  0.17571664622050803
Epoca:  162000
--------------
error:  0.1756407577039803
Epoca:  163000
--------------
error:  0.17556570434625082
Epoca:  164000
--------------
error:  0.17549147600953086
Epoca:  165000
----

Epoca:  292000
--------------
error:  0.17037232786681
Epoca:  293000
--------------
error:  0.17035396895431587
Epoca:  294000
--------------
error:  0.17033581670284462
Epoca:  295000
--------------
error:  0.17031786916329572
Epoca:  296000
--------------
error:  0.17030012440712852
Epoca:  297000
--------------
error:  0.17028258052609466
Epoca:  298000
--------------
error:  0.17026523563197296
Epoca:  299000
--------------
error:  0.17024808785631088
Epoca:  300000
--------------
error:  0.17023113535016768
Epoca:  301000
--------------
error:  0.1702143762838629
Epoca:  302000
--------------
error:  0.17019780884672883
Epoca:  303000
--------------
error:  0.17018143124686566
Epoca:  304000
--------------
error:  0.17016524171090197
Epoca:  305000
--------------
error:  0.17014923848375726
Epoca:  306000
--------------
error:  0.1701334198284098
Epoca:  307000
--------------
error:  0.17011778402566652
Epoca:  308000
--------------
error:  0.17010232937393738
Epoca:  309000
----

Epoca:  437000
--------------
error:  0.1691634097563877
Epoca:  438000
--------------
error:  0.16916162978233498
Epoca:  439000
--------------
error:  0.16915990382357338
Epoca:  440000
--------------
error:  0.16915823136255462
Epoca:  441000
--------------
error:  0.1691566118858982
Epoca:  442000
--------------
error:  0.16915504488435248
Epoca:  443000
--------------
error:  0.1691535298527574
Epoca:  444000
--------------
error:  0.16915206629000631
Epoca:  445000
--------------
error:  0.16915065369900933
Epoca:  446000
--------------
error:  0.16914929158665576
Epoca:  447000
--------------
error:  0.16914797946377877
Epoca:  448000
--------------
error:  0.16914671684511876
Epoca:  449000
--------------
error:  0.169145503249288
Epoca:  450000
--------------
error:  0.16914433819873592
Epoca:  451000
--------------
error:  0.16914322121971334
Epoca:  452000
--------------
error:  0.1691421518422393
Epoca:  453000
--------------
error:  0.16914112960006644
Epoca:  454000
-----

Epoca:  579000
--------------
error:  0.16926356877027307
Epoca:  580000
--------------
error:  0.1692657494598535
Epoca:  581000
--------------
error:  0.16926793956145758
Epoca:  582000
--------------
error:  0.1692701389015732
Epoca:  583000
--------------
error:  0.16927234730800306
Epoca:  584000
--------------
error:  0.1692745646098549
Epoca:  585000
--------------
error:  0.16927679063753215
Epoca:  586000
--------------
error:  0.16927902522272467
Epoca:  587000
--------------
error:  0.16928126819840003
Epoca:  588000
--------------
error:  0.16928351939879369
Epoca:  589000
--------------
error:  0.16928577865940084
Epoca:  590000
--------------
error:  0.16928804581696666
Epoca:  591000
--------------
error:  0.16929032070947822
Epoca:  592000
--------------
error:  0.16929260317615508
Epoca:  593000
--------------
error:  0.16929489305744094
Epoca:  594000
--------------
error:  0.16929719019499498
Epoca:  595000
--------------
error:  0.16929949443168302
Epoca:  596000
--

In [12]:
y_pred=np.matmul(X_test,np.transpose(W))*(Y2.max()-Y2.min())+Y2.min()
y_test=Y_test*(Y2.max()-Y2.min())+Y2.min()

In [13]:
##### RMSLE-KAGGLE
np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y_test))**2))

0.1693111143453353

## 6. TESTEO

In [14]:
tdata2.shape

(1459, 288)

In [15]:
X_send=tdata2.iloc[:,:].values;
X_send.shape

(1459, 288)

In [16]:
Y_send=np.matmul(X_send,np.transpose(W))*(Y2.max()-Y2.min())+Y2.min()
Y_send.shape

(1459, 1)

In [17]:
df = pd.DataFrame(Y_send,columns = ["SalePrice"])
df.index = np.arange(1461, 1461+len(df))
df.to_csv("outputs/predictions2.csv")