In [1]:
import pandas as pd
# Read the raw house listings from disk and preview the first rows.
data = pd.read_csv('house_data.csv')
print("Original Data:", data.head(), sep="\n")

Original Data:
   House_ID  Bedrooms  Size (sq ft)  Price (INR)      Location  Year_Built
0         1       4.0         855.0   31356226.0          Juhu      2002.0
1         2       5.0        1847.0   27775439.0       Andheri      2004.0
2         3       NaN        2363.0   37325149.0        Bandra      2000.0
3         4       5.0         626.0    6147116.0  South Mumbai      2002.0
4         5       5.0           NaN   49899606.0         Worli         NaN


In [2]:
 # Imputation
 # Numeric gaps are filled with each column's median; categorical gaps are
 # filled with the column's most frequent value. `data` is updated in place.
 from sklearn.impute import SimpleImputer

 num_features = ['Bedrooms', 'Size (sq ft)', 'Price (INR)', 'Year_Built']
 cat_features = ['Location']

 # Build both imputers up front, one per column family.
 num_imputer = SimpleImputer(strategy='median')
 cat_imputer = SimpleImputer(strategy='most_frequent')

 # Fit and fill each family, then write the filled values back.
 numeric_filled = num_imputer.fit_transform(data[num_features])
 categorical_filled = cat_imputer.fit_transform(data[cat_features])
 data[num_features] = numeric_filled
 data[cat_features] = categorical_filled

 print("\nData After Imputation:", data.head(), sep="\n")


Data After Imputation:
   House_ID  Bedrooms  Size (sq ft)  Price (INR)      Location  Year_Built
0         1       4.0         855.0   31356226.0          Juhu      2002.0
1         2       5.0        1847.0   27775439.0       Andheri      2004.0
2         3       5.0        2363.0   37325149.0        Bandra      2000.0
3         4       5.0         626.0    6147116.0  South Mumbai      2002.0
4         5       5.0        1351.0   49899606.0         Worli      2002.0


In [4]:
 # Anomaly Detection
 # Flag anomalous rows using Z-scores. Scores are computed for every column in
 # num_features (not only Price (INR)); a row is marked as an anomaly when any
 # of its numeric values lies more than Z_THRESHOLD standard deviations from
 # that column's mean.
 from scipy import stats
 Z_THRESHOLD = 3  # cut-off, in standard deviations, beyond which a value is anomalous
 z_scores = stats.zscore(data[num_features])
 data['Anomaly'] = (abs(z_scores) > Z_THRESHOLD).any(axis=1)  # True = anomalous row
 print("\nData After Anomaly Detection:")
 print(data.head())


Data After Anomaly Detection:
   House_ID  Bedrooms  Size (sq ft)  Price (INR)      Location  Year_Built  \
0         1       4.0         855.0   31356226.0          Juhu      2002.0   
1         2       5.0        1847.0   27775439.0       Andheri      2004.0   
2         3       5.0        2363.0   37325149.0        Bandra      2000.0   
3         4       5.0         626.0    6147116.0  South Mumbai      2002.0   
4         5       5.0        1351.0   49899606.0         Worli      2002.0   

   Anomaly  
0    False  
1    False  
2    False  
3    False  
4    False  


In [6]:
# Rule-Based Anomaly Detection
# Simple rules where:
# A house with less than 1000 sq ft should have 1 to 2 bedrooms.
# A house with 1000-2000 sq ft should have 2 to 4 bedrooms.
# A house with more than 2000 sq ft should have 3 or more bedrooms.
def is_bedroom_size_reasonable(row):
    """Check a listing's bedroom count against the band allowed for its size.

    Bands (area in sq ft -> allowed bedrooms):
      * under 1000            -> 1 to 2
      * 1000 through 2000     -> 2 to 4 (2000 itself is included)
      * anything above 2000   -> 3 or more

    `row` is indexed by the keys 'Size (sq ft)' and 'Bedrooms'.
    Returns the truth value of the band check for that row.
    """
    area = row['Size (sq ft)']
    bedrooms = row['Bedrooms']

    # Guard clauses, smallest band first; the last return covers area > 2000.
    if area < 1000:
        return 1 <= bedrooms <= 2
    if area <= 2000:
        return 2 <= bedrooms <= 4
    return bedrooms >= 3

# Evaluate the rule row by row, then invert it so that True marks a violation.
reasonable_mask = data.apply(is_bedroom_size_reasonable, axis=1)
data['Bed_Size_Anomaly'] = ~reasonable_mask
print("\nData After Rule-Based Anomaly Detection:", data.head(), sep="\n")


Data After Rule-Based Anomaly Detection:
   House_ID  Bedrooms  Size (sq ft)  Price (INR)      Location  Year_Built  \
0         1       4.0         855.0   31356226.0          Juhu      2002.0   
1         2       5.0        1847.0   27775439.0       Andheri      2004.0   
2         3       5.0        2363.0   37325149.0        Bandra      2000.0   
3         4       5.0         626.0    6147116.0  South Mumbai      2002.0   
4         5       5.0        1351.0   49899606.0         Worli      2002.0   

   Anomaly  Bed_Size_Anomaly  
0    False              True  
1    False              True  
2    False             False  
3    False              True  
4    False              True  


In [7]:
 # Standardization
 # Rescale every numeric feature to zero mean and unit standard deviation.
 # The scaled values are written back into `data`, replacing the originals.
 from sklearn.preprocessing import StandardScaler

 scaler = StandardScaler()
 standardized_values = scaler.fit_transform(data[num_features])
 data[num_features] = standardized_values
 print("\nData After Standardization:", data.head(), sep="\n")


Data After Standardization:
   House_ID  Bedrooms  Size (sq ft)  Price (INR)      Location  Year_Built  \
0         1      -2.0     -0.869514     0.059777          Juhu    0.000000   
1         2       0.5      0.689138    -0.190420       Andheri    1.581139   
2         3       0.5      1.499888     0.476838        Bandra   -1.581139   
3         4       0.5     -1.229324    -1.701637  South Mumbai    0.000000   
4         5       0.5     -0.090188     1.355442         Worli    0.000000   

   Anomaly  Bed_Size_Anomaly  
0    False              True  
1    False              True  
2    False             False  
3    False              True  
4    False              True  


In [8]:
 # Normalization
 # Squash every numeric feature into the range [0, 1] with min-max scaling.
 # Note: data[num_features] already holds the standardized values written by
 # the preceding step, and they are overwritten again here.
 from sklearn.preprocessing import MinMaxScaler

 normalizer = MinMaxScaler()
 normalized_values = normalizer.fit_transform(data[num_features])
 data[num_features] = normalized_values
 print("\nData After Normalization:", data.head(), sep="\n")


Data After Normalization:
   House_ID  Bedrooms  Size (sq ft)  Price (INR)      Location  Year_Built  \
0         1       0.0      0.131836     0.576175          Juhu         0.5   
1         2       1.0      0.702936     0.494334       Andheri         1.0   
2         3       1.0      1.000000     0.712600        Bandra         0.0   
3         4       1.0      0.000000     0.000000  South Mumbai         0.5   
4         5       1.0      0.417386     1.000000         Worli         0.5   

   Anomaly  Bed_Size_Anomaly  
0    False              True  
1    False              True  
2    False             False  
3    False              True  
4    False              True  


In [9]:
 # Encoding
 # One-hot encode the categorical feature Location into indicator columns and
 # build data_encoded, which carries those columns in place of Location.
 from sklearn.preprocessing import OneHotEncoder

 # The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
 # removed in 1.4. Try the current keyword first and fall back to the old one
 # so the cell runs on both older and newer releases.
 try:
     encoder = OneHotEncoder(sparse_output=False)
 except TypeError:
     encoder = OneHotEncoder(sparse=False)
 encoded_location = encoder.fit_transform(data[['Location']])

 # Reuse data's index so the column-wise concat below lines up row for row
 # even when `data` does not carry a default RangeIndex (e.g. after filtering).
 encoded_df = pd.DataFrame(
     encoded_location,
     columns=encoder.get_feature_names_out(['Location']),
     index=data.index,
 )

 data_encoded = pd.concat([data, encoded_df], axis=1).drop('Location', axis=1)

 print("\nData After Encoding:")
 print(data_encoded.head())


Data After Encoding:
   House_ID  Bedrooms  Size (sq ft)  Price (INR)  Year_Built  Anomaly  \
0         1       0.0      0.131836     0.576175         0.5    False   
1         2       1.0      0.702936     0.494334         1.0    False   
2         3       1.0      1.000000     0.712600         0.0    False   
3         4       1.0      0.000000     0.000000         0.5    False   
4         5       1.0      0.417386     1.000000         0.5    False   

   Bed_Size_Anomaly  Location_Andheri  Location_Bandra  Location_Juhu  \
0              True               0.0              0.0            1.0   
1              True               1.0              0.0            0.0   
2             False               0.0              1.0            0.0   
3              True               0.0              0.0            0.0   
4              True               0.0              0.0            0.0   

   Location_South Mumbai  Location_Worli  
0                    0.0             0.0  
1             

