# Prediction of House Prices using advanced regression techniques

## Data Manipulation

In [1]:
import pandas as pd

#train data
df_house_prices_train=pd.read_csv(r"..\01_Data\train.csv")
print(df_house_prices_train.shape)
print(df_house_prices_train.isnull().sum())

print("*"*50)

#test data
df_house_prices_test=pd.read_csv(r"..\01_Data\test.csv")
print(df_house_prices_test.shape)
print(df_house_prices_test.isnull().sum())


(1460, 81)
Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64
**************************************************
(1459, 80)
Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64


In [2]:
def drop_columns_df(df,*columns):
    for column in columns:
        df=df.drop(column,axis=1)
    return df

#train
df_house_prices_train=drop_columns_df(df_house_prices_train,"Id","Alley","PoolQC","Fence","MiscFeature")
print(df_house_prices_train.shape)
print(df_house_prices_train.info())

print("*"*50)

#test
df_house_prices_test=drop_columns_df(df_house_prices_test,"Id","Alley","PoolQC","Fence","MiscFeature")
print(df_house_prices_test.shape)
print(df_house_prices_test.info())

(1460, 76)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   LotShape       1460 non-null   object 
 6   LandContour    1460 non-null   object 
 7   Utilities      1460 non-null   object 
 8   LotConfig      1460 non-null   object 
 9   LandSlope      1460 non-null   object 
 10  Neighborhood   1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Condition2     1460 non-null   object 
 13  BldgType       1460 non-null   object 
 14  HouseStyle     1460 non-null   object 
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 1

In [3]:
def replace_null_by_mean(df,*columns):
    for column in columns:
        df[column]=df[column].fillna(df[column].mean())
    return df

#train
df_house_prices_train=replace_null_by_mean(df_house_prices_train,"LotFrontage","MasVnrArea","GarageYrBlt")
print(df_house_prices_train.info())

print("*"*50)

#test
df_house_prices_test=replace_null_by_mean(df_house_prices_test,"LotFrontage","MasVnrArea","BsmtFinSF1","BsmtFinSF2","BsmtUnfSF","TotalBsmtSF","BsmtFullBath","BsmtHalfBath","GarageYrBlt","GarageCars","GarageArea")
print(df_house_prices_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   LotShape       1460 non-null   object 
 6   LandContour    1460 non-null   object 
 7   Utilities      1460 non-null   object 
 8   LotConfig      1460 non-null   object 
 9   LandSlope      1460 non-null   object 
 10  Neighborhood   1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Condition2     1460 non-null   object 
 13  BldgType       1460 non-null   object 
 14  HouseStyle     1460 non-null   object 
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemo

In [4]:
def fill_null_by_mode(df,*columns):
    for column in columns:
        df[column]=df[column].fillna(df[column].mode()[0])
    return df

#train
df_house_prices_train=fill_null_by_mode(df_house_prices_train,"MasVnrType","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","Electrical","FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond")
df_house_prices_train.info()

print("*"*50)

#test
df_house_prices_test=fill_null_by_mode(df_house_prices_test,"Utilities","Exterior1st","Exterior2nd","MasVnrType","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","KitchenQual","Functional","FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond","SaleType")
df_house_prices_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   LotShape       1460 non-null   object 
 6   LandContour    1460 non-null   object 
 7   Utilities      1460 non-null   object 
 8   LotConfig      1460 non-null   object 
 9   LandSlope      1460 non-null   object 
 10  Neighborhood   1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Condition2     1460 non-null   object 
 13  BldgType       1460 non-null   object 
 14  HouseStyle     1460 non-null   object 
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemo

In [5]:
#train
y_train=df_house_prices_train["SalePrice"]
X_train=df_house_prices_train.drop("SalePrice",axis=1)
print(X_train.shape)
print(y_train.shape)

#test
X_test=df_house_prices_test
print(X_test.shape)

(1460, 75)
(1460,)
(1459, 75)


In [6]:
import numpy as np

#train
X_train=pd.get_dummies(X_train)
print(X_train.shape)

#test
X_test=pd.get_dummies(X_test)
print(X_test.shape)

(1460, 275)
(1459, 259)


# Data Modeling

In [7]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [8]:
model.predict(X_test)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 275 is different from 259)