In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression


In [2]:
# Load the Ames housing data from the CSV next to the notebook and preview the first rows
df = pd.read_csv(filepath_or_buffer='AmesHousing.csv')
df.head(n=5)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [3]:
# Summarize each column's dtype and non-null count to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order            2930 non-null   int64  
 1   PID              2930 non-null   int64  
 2   MS SubClass      2930 non-null   int64  
 3   MS Zoning        2930 non-null   object 
 4   Lot Frontage     2440 non-null   float64
 5   Lot Area         2930 non-null   int64  
 6   Street           2930 non-null   object 
 7   Alley            198 non-null    object 
 8   Lot Shape        2930 non-null   object 
 9   Land Contour     2930 non-null   object 
 10  Utilities        2930 non-null   object 
 11  Lot Config       2930 non-null   object 
 12  Land Slope       2930 non-null   object 
 13  Neighborhood     2930 non-null   object 
 14  Condition 1      2930 non-null   object 
 15  Condition 2      2930 non-null   object 
 16  Bldg Type        2930 non-null   object 
 17  House Style   

In [4]:
#Removing Order and PID as they are identifiers and do not add any predictive power
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the columns are already gone is a no-op
# rather than a KeyError.
df=df.drop(columns=['Order','PID'],errors='ignore')


In [5]:
#Separate the features (X) from the target variable (y)
X = df.drop(columns=['SalePrice'])
y = df.loc[:, 'SalePrice']


In [6]:
#Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
#Categorize the columns by dtype
# 'number' matches every numeric dtype (any int/float width), not only the ones
# listed by hand. Everything that is not numeric is treated as categorical, so the
# two lists together cover every column of X and none is left out of the
# preprocessor built from them.
# NOTE(review): a non-numeric, non-text column (e.g. datetime) would land in
# categorical_cols and be one-hot encoded - confirm none exist in this data.
#The target is filtered out defensively in case X still carries it
numerical_cols=[col for col in X.select_dtypes(include='number').columns if col!='SalePrice']
categorical_cols=X.select_dtypes(exclude='number').columns.tolist()

In [8]:
#Numerical pipeline: fill missing values with the column median, then standardize
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

#Categorical pipeline: fill missing values with the most frequent category, then one-hot encode
# handle_unknown='ignore' lets transform accept categories that were not seen during fit.
# NOTE(review): some columns are mostly null (e.g. Alley has 198 non-null of 2930 in
# df.info()); most-frequent imputation assigns those rows the modal category -
# confirm this is intended rather than a dedicated "missing" level.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

#Combine the pipelines: each one is applied to its own list of columns
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_cols),
])

In [9]:
#Final pipeline: preprocessing followed by a linear regression model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])

#Fit preprocessing and model together on the training split only
model_pipeline.fit(X_train, y_train)

In [10]:
# Score the fitted pipeline on the training split and on the held-out test split
train_score = model_pipeline.score(X_train, y_train)
print("Train Score:", train_score)
test_score = model_pipeline.score(X_test, y_test)
print("Test Score:", test_score)

Train Score: 0.9394057657338998
Test Score: 0.8906139479164588
