In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training data from train.csv into a DataFrame.
train = pd.read_csv(filepath_or_buffer="train.csv")

In [5]:
# EDA (Exploratory Data Analysis)
# Goal: build a regression model that predicts the sale price of houses.
# Preview the first five rows of the training data.
train.head(n=5)

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,GarageCars,GarageArea,SalePrice,ExterQual_TA,Foundation_PConc,KitchenQual_TA
0,6,1969,1969,663,663,1352,1,7,1,299,158000,1,0,1
1,6,1920,1950,1012,1012,1012,1,6,1,308,118400,1,0,1
2,5,1910,2006,1022,1022,1022,1,4,1,280,85000,1,0,1
3,5,1973,1973,1656,1656,1656,2,8,2,506,135000,1,0,1
4,6,1978,1978,918,918,1683,2,7,2,440,172500,1,0,1


In [4]:
# Summary statistics of the training data, transposed so that each column
# of `train` becomes one row of the table.
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
OverallQual,765.0,6.267974,1.379575,2.0,5.0,6.0,7.0,10.0
YearBuilt,765.0,1972.222222,31.525487,1880.0,1953.0,1976.0,2003.0,2009.0
YearRemodAdd,765.0,1986.067974,20.845557,1950.0,1968.0,1995.0,2005.0,2010.0
TotalBsmtSF,765.0,1095.550327,434.600514,190.0,808.0,1008.0,1324.0,6110.0
1stFlrSF,765.0,1169.427451,399.990712,438.0,879.0,1090.0,1392.0,4692.0
GrLivArea,765.0,1540.55817,545.160103,438.0,1164.0,1477.0,1776.0,5642.0
FullBath,765.0,1.584314,0.564912,0.0,1.0,2.0,2.0,3.0
TotRmsAbvGrd,765.0,6.580392,1.59199,3.0,6.0,6.0,7.0,12.0
GarageCars,765.0,1.871895,0.654671,1.0,1.0,2.0,2.0,4.0
GarageArea,765.0,500.332026,191.179453,160.0,352.0,484.0,595.0,1418.0


In [6]:
# Load the test data from test.csv into a DataFrame.
test = pd.read_csv(filepath_or_buffer="test.csv")

In [8]:
# Once the model has been trained, its predictions are evaluated against this test data.
# Preview the first five rows of the test data.
test.head(n=5)

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,GarageCars,GarageArea,SalePrice,ExterQual_TA,Foundation_PConc,KitchenQual_TA
0,4,1961,1961,1029,1029,1029,1,5,1,261,118500,1,0,1
1,5,1921,1950,731,820,1343,1,7,1,186,154900,1,0,1
2,7,1998,1998,723,767,767,1,4,1,367,133000,1,1,1
3,4,1955,1955,1005,1005,1005,1,5,2,672,115000,1,0,0
4,6,1963,2003,1059,1068,1068,1,6,1,264,154500,1,0,1


In [10]:
# List the column names so we can decide which column is the label (target)
# and which columns are the features.
train.columns

Index(['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF',
       'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea',
       'SalePrice', 'ExterQual_TA', 'Foundation_PConc', 'KitchenQual_TA'],
      dtype='object')

In [11]:
from sklearn.linear_model import LinearRegression

In [13]:
# Create the linear regression model:
#   y = x1*w1 + x2*w2 + x3*w3 + x4*w4 + ... + b
# fit_intercept=True is the library default; it is spelled out here because it
# is what keeps the bias term b in the equation above.
model = LinearRegression(fit_intercept=True)

In [16]:
# "SalePrice" is the target (label). It is a column, so it is removed from the
# feature matrix by column name and kept separately as the target vector.
X_train = train.drop(columns="SalePrice")
y_train = train["SalePrice"]

In [17]:
# Train the model by feeding it the training features and the training target.
# Arguments of fit(), as described in the scikit-learn documentation:
#   X             : {array-like, sparse matrix} of shape (n_samples, n_features).
#                   Training data.
#   y             : array-like of shape (n_samples,) or (n_samples, n_targets).
#                   Target values. Will be cast to X's dtype if necessary.
#   sample_weight : array-like of shape (n_samples,), default=None.
#                   Individual weights for each sample (not used here).
model.fit(X=X_train, y=y_train)

In [18]:
# Split the test data the same way as the training data:
# every column except "SalePrice" is a feature, "SalePrice" is the target.
X_test = test.drop(columns="SalePrice")
y_test = test["SalePrice"]

In [19]:
# Predict the target for the test features; the predicted values can then be
# compared with the expected values in y_test.
predictions = model.predict(X=X_test)

In [20]:
from sklearn.metrics import mean_squared_error

In [21]:
# Root Mean Squared Error: the square root of the mean squared difference
# between the actual test targets and the predictions.
rmse = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=predictions)
)
rmse

33186.384172367696

In [22]:
# Put the actual test targets and the predicted values side by side in a DataFrame.
comparison = pd.DataFrame(
    data={
        "Actual Values": y_test,
        "Predictions": predictions,
    }
)

In [23]:
# First five rows of the actual-vs-predicted table.
comparison.head(n=5)

Unnamed: 0,Actual Values,Predictions
0,118500,83380.944694
1,154900,105974.149765
2,133000,139238.138343
3,115000,104982.049557
4,154500,140473.360146


In [25]:
# Last ten rows of the actual-vs-predicted table, to check how close the
# predictions of the trained model are to the real sale prices.
comparison.tail(n=10)

Unnamed: 0,Actual Values,Predictions
319,192000,225985.782264
320,145000,155152.660818
321,186700,184011.901543
322,145250,158336.300928
323,305900,310423.318537
324,132250,102816.796295
325,123000,121698.649065
326,316600,271745.844407
327,142000,131258.275591
328,250000,263005.372419


In [28]:
# Rank the columns by their correlation with the target variable and show the
# ten highest values (the first entry is SalePrice itself, with correlation 1.0).
# Overall material and finish of the house (OverallQual), ground living area
# square feet (GrLivArea), and garage size in car capacity (GarageCars) are the
# top three features by correlation with the house sale price.
# A strong correlation makes them the most promising predictors, but it is not
# by itself proof that they have the biggest impact on the prediction.
(
    train.corr()
    .loc[:, "SalePrice"]
    .sort_values(ascending=False)
    .head(n=10)
)

SalePrice           1.000000
OverallQual         0.792263
GrLivArea           0.712054
GarageCars          0.658355
GarageArea          0.621354
1stFlrSF            0.621057
TotalBsmtSF         0.612205
FullBath            0.597505
TotRmsAbvGrd        0.573845
Foundation_PConc    0.517222
Name: SalePrice, dtype: float64