In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
df = pd.read_csv('./datasets/train.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               2051 non-null   int64  
 1   PID              2051 non-null   int64  
 2   MS SubClass      2051 non-null   int64  
 3   MS Zoning        2051 non-null   object 
 4   Lot Frontage     1721 non-null   float64
 5   Lot Area         2051 non-null   int64  
 6   Street           2051 non-null   object 
 7   Alley            140 non-null    object 
 8   Lot Shape        2051 non-null   object 
 9   Land Contour     2051 non-null   object 
 10  Utilities        2051 non-null   object 
 11  Lot Config       2051 non-null   object 
 12  Land Slope       2051 non-null   object 
 13  Neighborhood     2051 non-null   object 
 14  Condition 1      2051 non-null   object 
 15  Condition 2      2051 non-null   object 
 16  Bldg Type        2051 non-null   object 
 17  House Style   

In [4]:
correl = df.corr()
correl['SalePrice'].sort_values(ascending=False)

SalePrice          1.000000
Overall Qual       0.800207
Gr Liv Area        0.697038
Garage Area        0.650270
Garage Cars        0.648220
Total Bsmt SF      0.628925
1st Flr SF         0.618486
Year Built         0.571849
Year Remod/Add     0.550370
Full Bath          0.537969
Garage Yr Blt      0.533922
Mas Vnr Area       0.512230
TotRms AbvGrd      0.504014
Fireplaces         0.471093
BsmtFin SF 1       0.423519
Lot Frontage       0.341842
Open Porch SF      0.333476
Wood Deck SF       0.326490
Lot Area           0.296566
Bsmt Full Bath     0.283662
Half Bath          0.283001
2nd Flr SF         0.248452
Bsmt Unf SF        0.190210
Bedroom AbvGr      0.137067
Screen Porch       0.134581
3Ssn Porch         0.048732
Mo Sold            0.032735
Pool Area          0.023106
BsmtFin SF 2       0.016255
Misc Val          -0.007375
Yr Sold           -0.015203
Low Qual Fin SF   -0.041594
Bsmt Half Bath    -0.045328
Id                -0.051398
MS SubClass       -0.087335
Overall Cond      -0

In [6]:
df['Total SF'] = df['1st Flr SF'] + df['BsmtFin SF 1'] + df['2nd Flr SF'] + df['Gr Liv Area']

In [7]:
finalpoke = ['SalePrice', 'Overall Qual', 'Garage Area', 'Total SF', 'Year Built', 'Year Remod/Add', 'Full Bath', 'TotRms AbvGrd', 'Fireplaces']

In [8]:
finalize = df[finalpoke]

In [9]:
finalize.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SalePrice       2051 non-null   int64  
 1   Overall Qual    2051 non-null   int64  
 2   Garage Area     2050 non-null   float64
 3   Total SF        2050 non-null   float64
 4   Year Built      2051 non-null   int64  
 5   Year Remod/Add  2051 non-null   int64  
 6   Full Bath       2051 non-null   int64  
 7   TotRms AbvGrd   2051 non-null   int64  
 8   Fireplaces      2051 non-null   int64  
dtypes: float64(2), int64(7)
memory usage: 144.3 KB


In [10]:
finalize.isnull().sum()

SalePrice         0
Overall Qual      0
Garage Area       1
Total SF          1
Year Built        0
Year Remod/Add    0
Full Bath         0
TotRms AbvGrd     0
Fireplaces        0
dtype: int64

In [13]:
finalize = finalize.apply(lambda x: x.fillna(x.mean()),axis=0)

In [14]:
finalize.isnull().sum()

SalePrice         0
Overall Qual      0
Garage Area       0
Total SF          0
Year Built        0
Year Remod/Add    0
Full Bath         0
TotRms AbvGrd     0
Fireplaces        0
dtype: int64

In [15]:
target = finalize['SalePrice']

In [19]:
finalize.loc[(finalize['SalePrice'] > 580000)]

Unnamed: 0,SalePrice,Overall Qual,Garage Area,Total SF,Year Built,Year Remod/Add,Full Bath,TotRms AbvGrd,Fireplaces
1671,611657,9,820.0,6916.0,2009,2010,2,11,2
1692,584500,9,959.0,7292.0,1993,1994,3,11,1
1796,582933,9,1020.0,5644.0,2008,2009,3,12,1
1964,591587,9,1110.0,5777.0,2006,2007,2,8,2


In [20]:
finalize.loc[(finalize['Overall Qual'] > 9)]

Unnamed: 0,SalePrice,Overall Qual,Garage Area,Total SF,Year Built,Year Remod/Add,Full Bath,TotRms AbvGrd,Fireplaces
41,465000,10,850.0,5538.0,2006,2006,2,7,1
138,545224,10,758.0,6780.0,2006,2006,3,10,1
199,386250,10,968.0,4840.0,2003,2003,2,8,1
201,310000,10,812.0,4779.0,2008,2008,2,10,1
332,460000,10,1150.0,6662.0,2005,2006,2,10,1
517,392000,10,932.0,3648.0,2006,2007,2,8,1
519,345000,10,846.0,4914.0,1998,1999,2,7,1
529,451950,10,842.0,5930.0,2008,2008,2,10,1
622,337500,10,786.0,4608.0,2006,2007,2,7,1
623,552000,10,949.0,7130.0,2004,2005,2,8,1


In [21]:
finalize.drop('SalePrice',axis=1,inplace=True)

In [22]:
lr = LinearRegression()

In [23]:
lr.fit(finalize,target)

LinearRegression()

In [25]:
preds = lr.predict(finalize)

In [26]:
rmse = np.sqrt(mean_squared_error(target,preds))
rmse

35618.80608117907

In [27]:
pd.get_dummies('Neighborhood')

Unnamed: 0,Neighborhood
0,1


In [32]:
df = pd.get_dummies(df['Neighborhood'])

In [33]:
df.shape

(2051, 28)

In [34]:
df.Blueste

0       0
1       0
2       0
3       0
4       0
       ..
2046    0
2047    0
2048    0
2049    0
2050    0
Name: Blueste, Length: 2051, dtype: uint8