# House Prices Data Cleaning


In [48]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns

#Libraries additionally added
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import ast

# added while cleaning
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

#Regular Expressions
import re

# Study data files
hp_train_data_path = "../House_Prices_Data/house_prices_train.csv"
hp_test_data_path = "../House_Prices_Data/house_prices_test.csv"


# Read the House Prices Train & Test data 
hp_train_df = pd.read_csv(hp_train_data_path)
hp_test_df = pd.read_csv(hp_test_data_path)


In [12]:
#Display the House Prices Train DataFrame
hp_train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [13]:
#Display the House Prices Test  DataFrame
hp_test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [14]:
# Checking for null values
hp_train_df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [16]:
# dropping the null values
hp_train_df_clean = hp_train_df.copy()


# Impute missing values in numerical columns with the mean
hp_train_df_clean.fillna(hp_train_df_clean.mean(), inplace=True)

# Impute missing values in categorical columns with the mode
hp_train_df_clean.fillna(hp_train_df_clean.mode().iloc[0], inplace=True)

hp_train_df_clean

  hp_train_df_clean.fillna(hp_train_df_clean.mean(), inplace=True)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Grvl,Reg,Lvl,AllPub,...,0,Gd,MnPrv,Shed,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Grvl,Reg,Lvl,AllPub,...,0,Gd,MnPrv,Shed,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,Grvl,IR1,Lvl,AllPub,...,0,Gd,MnPrv,Shed,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,Grvl,IR1,Lvl,AllPub,...,0,Gd,MnPrv,Shed,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,Grvl,IR1,Lvl,AllPub,...,0,Gd,MnPrv,Shed,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,Grvl,Reg,Lvl,AllPub,...,0,Gd,MnPrv,Shed,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,Grvl,Reg,Lvl,AllPub,...,0,Gd,MnPrv,Shed,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,Grvl,Reg,Lvl,AllPub,...,0,Gd,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,Grvl,Reg,Lvl,AllPub,...,0,Gd,MnPrv,Shed,0,4,2010,WD,Normal,142125


In [17]:
hp_train_df_clean.isnull().sum()


Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 81, dtype: int64

In [26]:
hp_train_df_clean.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [27]:
hp_train_df_clean.nunique()

Id               1460
MSSubClass         15
MSZoning            5
LotFrontage       111
LotArea          1073
                 ... 
MoSold             12
YrSold              5
SaleType            9
SaleCondition       6
SalePrice         663
Length: 81, dtype: int64

In [21]:
subset_columns = ['Id']
duplicate_rows_subset = hp_train_df[hp_train_df.duplicated(subset=subset_columns)]
duplicate_rows_subset

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice


In [23]:
hp_test_df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64

In [24]:
# dropping the null values
hp_test_df_clean = hp_test_df.copy()


# Impute missing values in numerical columns with the mean
hp_test_df_clean.fillna(hp_test_df_clean.mean(), inplace=True)

# Impute missing values in categorical columns with the mode
hp_test_df_clean.fillna(hp_test_df_clean.mode().iloc[0], inplace=True)

hp_test_df_clean

  hp_test_df_clean.fillna(hp_test_df_clean.mean(), inplace=True)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,Grvl,Reg,Lvl,AllPub,...,120,0,Ex,MnPrv,Shed,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,Grvl,IR1,Lvl,AllPub,...,0,0,Ex,MnPrv,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,Grvl,IR1,Lvl,AllPub,...,0,0,Ex,MnPrv,Shed,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,Grvl,IR1,Lvl,AllPub,...,0,0,Ex,MnPrv,Shed,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,Grvl,IR1,HLS,AllPub,...,144,0,Ex,MnPrv,Shed,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,Ex,MnPrv,Shed,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,Ex,MnPrv,Shed,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,Ex,MnPrv,Shed,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,Ex,MnPrv,Shed,700,7,2006,WD,Normal


In [25]:
hp_test_df_clean.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 80, dtype: object

In [28]:
hp_test_df_clean.nunique()

Id               1459
MSSubClass         16
MSZoning            5
LotFrontage       116
LotArea          1106
                 ... 
MiscVal            26
MoSold             12
YrSold              5
SaleType            9
SaleCondition       6
Length: 80, dtype: int64

In [29]:
subset_columns = ['Id']
duplicate_rows_subset = hp_train_df[hp_train_df.duplicated(subset=subset_columns)]
duplicate_rows_subset

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice


In [37]:
# Identify categorical columns
categorical_columns = hp_train_df_clean.select_dtypes(include=['object']).columns

# # Handle missing values (if needed)
# # For simplicity, you can fill missing values with a placeholder
# df[categorical_columns] = df[categorical_columns].fillna('Missing')

#columns=['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

# Create dummy variables for categorical columns
Housing_clean = pd.get_dummies(hp_train_df_clean, columns=categorical_columns)
Housing_clean

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,0,...,0,0,0,1,0,0,0,0,1,0
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,790,...,0,0,0,1,0,0,0,0,1,0
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,275,...,0,0,0,1,0,0,0,0,1,0
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,49,...,0,0,0,1,0,0,0,0,1,0


In [38]:
# Separate target variable (Y) and features (X)
X = Housing_clean.drop('SalePrice', axis=1)  # Drop the target variable
y = Housing_clean['SalePrice']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForestRegressor to assess feature importance
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

# Use feature importance scores to select relevant features
sfm = SelectFromModel(rf_model, threshold=0.01)
sfm.fit(X_train, y_train)
selected_features = X_train.columns[sfm.get_support()]

# Display selected features
print("Selected Features:")
print(selected_features)


Selected Features:
Index(['LotArea', 'OverallQual', 'YearBuilt', 'BsmtFinSF1', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageCars', 'GarageArea'],
      dtype='object')


In [41]:
# Assuming X_train is your feature matrix
# Create an instance of the StandardScaler
scaler = StandardScaler()
# Fit the scaler to the training data and transform it
X_train_scaled = scaler.fit_transform(X_train)
# Transform the test data using the same scaler
X_test_scaled = scaler.transform(X_test)

In [42]:
print(X_train_scaled)
print(X_test_scaled)

[[-1.11928402 -0.8667643  -0.01288179 ... -0.12510865  0.46001984
  -0.30263639]
 [ 0.79046412  0.07410996 -0.5027726  ... -0.12510865  0.46001984
  -0.30263639]
 [-0.21615189 -0.63154574 -0.14648837 ... -0.12510865  0.46001984
  -0.30263639]
 ...
 [ 1.32669882 -0.8667643  -0.45823707 ... -0.12510865  0.46001984
  -0.30263639]
 [ 0.30597137 -0.16110861 -0.68091471 ... -0.12510865  0.46001984
  -0.30263639]
 [ 0.93157852  1.48542135 -0.76998577 ... -0.12510865  0.46001984
  -0.30263639]]
[[ 0.38123238 -0.8667643  -0.01288179 ... -0.12510865  0.46001984
  -0.30263639]
 [ 0.88218848  0.07410996  1.23411301 ... -0.12510865  0.46001984
  -0.30263639]
 [-0.74533088 -0.63154574 -0.63637918 ... -0.12510865  0.46001984
  -0.30263639]
 ...
 [ 0.16956079  0.07410996 -0.32463048 ... -0.12510865  0.46001984
  -0.30263639]
 [-0.18557711  0.30932853 -0.45823707 ... -0.12510865  0.46001984
  -0.30263639]
 [-0.01859174 -0.8667643  -0.01288179 ... -0.12510865  0.46001984
  -0.30263639]]


In [45]:
# Instantiate the Logistic Regression model
model = LogisticRegression(random_state=42)

# Train the model on the training data
model.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

Accuracy: 0.00684931506849315
Confusion Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Classification Report:
              precision    recall  f1-score   support

       34900       0.00      0.00      0.00         0
       35311       0.00      0.00      0.00         1
       40000       0.00      0.00      0.00         1
       55000       0.00      0.00      0.00         0
       55993       0.00      0.00      0.00         1
       60000       0.00      0.00      0.00         1
       64500       0.00      0.00      0.00         1
       66500       0.00      0.00      0.00         1
       67000       0.00      0.00      0.00         2
       68400       0.00      0.00      0.00         1
       68500       0.00      0.00      0.00         1
       75000       0.00      0.00      0.00         1
       75500       0.00      0.00      0.00         1
       79500       0.00      0.00      0.00         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [49]:

# Instantiate the Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Display the model coefficients and intercept
print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)

Mean Squared Error: 872530446.3443108
R-squared: 0.8862459792744217
Model Coefficients: [ 4.04968908e-01  4.97635927e+00  9.19869404e+00  6.62633301e-01
  6.65164993e+03  5.83502708e+03  3.18143380e+02  1.30711760e+02
  2.38839451e+01  1.67398323e+01  6.64920777e+00 -1.54671737e+00
  2.18422222e+01  1.69963601e+01  3.25382455e+01 -2.46321988e+01
  2.49023742e+01  1.68402090e+03 -2.47349296e+03  2.98541170e+03
  2.12627669e+03 -2.57266468e+03 -1.27343579e+04  1.97044770e+03
  2.80793681e+03  2.19552118e+01  7.26178149e+02  2.52819259e+01
  1.68524231e+01  4.78087600e-01 -5.19341653e+00  5.09409591e+01
  3.32932154e+01  8.18789224e+01  1.78614443e+00 -4.31147247e+02
 -3.19491754e+02 -1.68285403e+04  1.05613424e+04  1.55165316e+03
  2.88199065e+03  1.83355407e+03 -1.11157578e+04  1.11157578e+04
 -2.47623978e+03  2.47623978e+03 -4.04314602e+03  3.29363652e+03
  2.98661530e+03 -2.23710580e+03 -3.54088232e+03  1.12714380e+04
 -9.61270323e+03  1.88214753e+03  1.81700017e+04 -1.81700017e+04
  