In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import zscore

In [4]:
df = pd.read_csv("house-prices.csv")
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1242,20,RL,83.0,9849,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2007,New,Partial,248328
1,1233,90,RL,70.0,9842,Pave,,Reg,Lvl,AllPub,...,0,,,,0,3,2007,WD,Normal,101800
2,1401,50,RM,50.0,6000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,7,2008,WD,Normal,120000
3,1377,30,RL,52.0,6292,Pave,,Reg,Bnk,AllPub,...,0,,,,0,4,2008,WD,Normal,91000
4,208,20,RL,,12493,Pave,,IR1,Lvl,AllPub,...,0,,GdWo,,0,4,2008,WD,Normal,141000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1190,60,RL,60.0,7500,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,189000
996,192,60,RL,,7472,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2007,WD,Normal,184000
997,990,60,FV,65.0,8125,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2006,New,Partial,197000
998,982,60,RL,98.0,12203,Pave,,IR1,Lvl,AllPub,...,0,,,,0,7,2009,WD,Normal,336000


In [13]:
dtypes = df.dtypes
dtypes


Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 80, dtype: object

In [14]:
numeric_df = df.select_dtypes(include=['int64', 'float64'])
numeric_df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1242,20,83.000000,9849,7,6,2007,2007,0.0,0,...,0,56,0,0,0,0,0,6,2007,248328
1,1233,90,70.000000,9842,4,5,1962,1962,0.0,0,...,0,0,0,0,0,0,0,3,2007,101800
2,1401,50,50.000000,6000,6,7,1929,1950,0.0,0,...,0,0,112,0,0,0,0,7,2008,120000
3,1377,30,52.000000,6292,6,5,1930,1950,0.0,384,...,0,141,0,0,0,0,0,4,2008,91000
4,208,20,69.303507,12493,4,5,1960,1960,0.0,419,...,355,0,0,0,0,0,0,4,2008,141000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711,985,90,75.000000,10125,5,5,1977,1977,0.0,0,...,0,0,0,0,0,0,0,8,2009,126000
712,582,20,98.000000,12704,8,5,2008,2009,306.0,0,...,0,90,0,0,0,0,0,8,2009,253293
713,668,20,65.000000,8125,6,5,1994,1998,258.0,1138,...,224,42,0,0,0,0,0,10,2008,193500
714,1190,60,60.000000,7500,7,5,1999,1999,0.0,0,...,140,60,0,0,0,0,0,6,2010,189000


In [5]:
# Extract columns with missing values
missing_cols = df.columns[df.isnull().any()].tolist()
print("Columns with missing values:", missing_cols)

Columns with missing values: ['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']


In [6]:

# Count the number of lines with missing data
num_missing = df.isnull().sum(axis=1).sum()
print("Number of lines with missing data:", num_missing)

Number of lines with missing data: 5434


In [7]:
# Fill in missing values with mean for numeric properties
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# Fill in missing values with median for numeric properties
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill in missing values with mode for categorical properties
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

In [8]:
# Calculate the threshold for number of missing values per row
num_cols = len(df.columns)
thresh = num_cols * 0.5

# Drop rows with more than the threshold number of missing values
df = df.dropna(thresh=thresh)

# Reset the index after dropping rows
df = df.reset_index(drop=True)

In [9]:
# Calculate the threshold for number of missing values per column
num_rows = len(df.index)
thresh = num_rows * 0.5

# Drop columns with more than the threshold number of missing values
df = df.dropna(thresh=thresh, axis=1)

# Reset the index after dropping columns
df = df.reset_index(drop=True)

In [10]:
# Check for duplicates
print("Number of duplicate samples before removal: ", df.duplicated().sum())

# Remove duplicates
df = df.drop_duplicates()

# Reset the index after removing duplicates
df = df.reset_index(drop=True)

# Check for duplicates after removal
print("Number of duplicate samples after removal: ", df.duplicated().sum())

Number of duplicate samples before removal:  284
Number of duplicate samples after removal:  0


In [17]:
# Initialize the MinMaxScaler object
scaler = MinMaxScaler()

# Select the attribute to be normalized
attribute_to_normalize = df['SalePrice']

# Fit and transform the attribute using MinMaxScaler
normalized_attribute = scaler.fit_transform(attribute_to_normalize.values.reshape(-1, 1))

# Replace the original attribute with the normalized attribute
df['SalePrice'] = normalized_attribute
df['SalePrice']

0      0.369599
1      0.115363
2      0.146941
3      0.096624
4      0.183378
         ...   
711    0.157352
712    0.378214
713    0.274469
714    0.266661
715    0.257986
Name: SalePrice, Length: 716, dtype: float64

In [19]:
# Select the attribute to be normalized
attribute_to_normalize = df['SalePrice']

# Normalize the attribute using zscore
normalized_attribute = zscore(attribute_to_normalize)

# Replace the original attribute with the normalized attribute
df['SalePrice'] = normalized_attribute
df['SalePrice']


0      0.895490
1     -0.981527
2     -0.748386
3     -1.119874
4     -0.479377
         ...   
711   -0.671526
712    0.959091
713    0.193146
714    0.135501
715    0.071451
Name: SalePrice, Length: 716, dtype: float64

In [22]:

# Perform addition
addition = numeric_df['OverallQual'] + numeric_df['OverallCond']

# Perform subtraction
subtraction =numeric_df['OverallQual'] - numeric_df['OverallCond']

# Perform multiplication
multiplication = numeric_df['OverallQual'] * numeric_df['OverallCond']

# Perform division
division = numeric_df['OverallQual'] / numeric_df['OverallCond']
