**1. Import Packages**

In [1]:
#'Pandas' is used for data manipulation and analysis
import pandas as pd

#'Numpy' is used for mathematical operations on large, multi-dimensional arrays and matrices
import numpy as np

#'Matplotlib' is a data visualization library for 2D & 3D plots, built on numpy
import matplotlib.pyplot as plt

#'datetime' is used to perform date & time operations
import datetime as dt

#'StandardScaler' from the sklearn.preprocessing module is used to scale the data
from sklearn.preprocessing import StandardScaler

#'eig' from numpy.linalg is used to calculate eigenvalues & eigenvectors
from numpy.linalg import eig

#'PCA' class to perform Principal Component Analysis using the sklearn library
from sklearn.decomposition import PCA

**2. Read the Data**

In [2]:
#load the house price csv into a dataframe (default integer row index)
#NOTE(review): the path is absolute and machine-specific; this cell only runs where that file exists
raw_data = pd.read_csv(
    r"C:\zubeda\PGA02_Zubu\Machine Learning Models\PCA\Principal Component Analysis (PCA)\Dataset\houseprice.csv"
)

#preview the first 5 rows
raw_data.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


**3. Understand and Prepare the Data**

In [3]:
#re-read the same csv, this time using its first column (index_col=0) as the row index
#NOTE(review): the path is absolute and machine-specific; this cell only runs where that file exists
raw_data = pd.read_csv(
    r"C:\zubeda\PGA02_Zubu\Machine Learning Models\PCA\Principal Component Analysis (PCA)\Dataset\houseprice.csv",
    index_col=0,
)

#preview the first 5 rows
raw_data.head(5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


**3.1 Data Types and Dimensions**

In [4]:
#check the data types of the variables
#info() prints the index range plus, for every column, its non-null count and dtype
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [5]:
#print the dimensions of the dataframe as (number of rows, number of columns)
print(raw_data.shape)

(1460, 80)


In [6]:
#these three columns are read in as int64; re-type them as 'object'
#DataFrame.astype() accepts a {column: dtype} mapping, so all columns are converted in one call
raw_data = raw_data.astype({
    'MSSubClass': 'object',
    'OverallQual': 'object',
    'OverallCond': 'object',
})

In [8]:
#recheck the data types after the conversion
#.dtypes returns the dtype of every column, indexed by column name
raw_data.dtypes

MSSubClass        object
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 80, dtype: object

**3.2 Feature Engineering**

In [9]:
#'date.today().year' gives the current calendar year (local time) as an int
#store it as 'current_year'
#NOTE(review): this value depends on when the notebook is run, so anything derived from it changes from year to year
current_year = dt.date.today().year

In [10]:
#build two Series: years elapsed since construction and since the last remodel
#Series.rsub(current_year) computes 'current_year - column' element-wise
Building_age = raw_data['YearBuilt'].rsub(current_year)
Remodel_age = raw_data['YearRemodAdd'].rsub(current_year)

In [11]:
#attach both age Series to the dataframe as new columns (assigned left to right)
raw_data['Building_age'], raw_data['Remodel_age'] = Building_age, Remodel_age

In [12]:
#display the first 5 rows to check that 'Building_age' and 'Remodel_age' were added as columns
raw_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Building_age,Remodel_age
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,,,0,2,2008,WD,Normal,208500,18,18
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,,,0,5,2007,WD,Normal,181500,45,45
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,,,0,9,2008,WD,Normal,223500,20,19
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,,,0,2,2006,WD,Abnorml,140000,106,51
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,,,0,12,2008,WD,Normal,250000,21,21


In [13]:
#dimensions (rows, columns) of the dataframe after adding the two age columns
raw_data.shape

(1460, 82)