# Handling Missing Values and Cleaning Data

In [32]:
import os

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Location of the raw Ames Housing CSV. Set the AMES_HOUSING_CSV environment
# variable to point at another copy of the file; when it is unset, the
# original hard-coded location is used so existing runs behave as before.
data_path = os.environ.get(
    'AMES_HOUSING_CSV',
    r'C:\Users\test\Downloads\AmesHousing.csv',
)
df = pd.read_csv(data_path)

# Display the first few rows
print(df.head())


   Order        PID  MS SubClass MS Zoning  Lot Frontage  Lot Area Street  \
0      1  526301100           20        RL         141.0     31770   Pave   
1      2  526350040           20        RH          80.0     11622   Pave   
2      3  526351010           20        RL          81.0     14267   Pave   
3      4  526353030           20        RL          93.0     11160   Pave   
4      5  527105010           60        RL          74.0     13830   Pave   

  Alley Lot Shape Land Contour  ... Pool Area Pool QC  Fence Misc Feature  \
0   NaN       IR1          Lvl  ...         0     NaN    NaN          NaN   
1   NaN       Reg          Lvl  ...         0     NaN  MnPrv          NaN   
2   NaN       IR1          Lvl  ...         0     NaN    NaN         Gar2   
3   NaN       Reg          Lvl  ...         0     NaN    NaN          NaN   
4   NaN       IR1          Lvl  ...         0     NaN  MnPrv          NaN   

  Misc Val Mo Sold Yr Sold Sale Type  Sale Condition  SalePrice  
0       

In [33]:
# Drop rows that are exact duplicates across every column, keeping the
# first occurrence of each.
df = df.drop_duplicates(keep='first')


In [34]:
# Cast object-dtype columns with fewer than 50 distinct values to 'category';
# columns with 50 or more distinct values are left unchanged.
object_columns = df.select_dtypes(include='object').columns
for name in object_columns:
    distinct_count = df[name].nunique()
    if distinct_count >= 50:
        continue
    df[name] = df[name].astype('category')

In [35]:
# Collect the names of the int64/float64 columns; these are the candidates
# for Min-Max normalization in the cells below.
numeric_cols = (
    df
    .select_dtypes(include=['int64', 'float64'])
    .columns
)

In [36]:
# Numeric columns that should not be scaled: the author treats these as
# identifiers or as categorical codes stored as numbers.
exclude_cols = [
    'Order',
    'PID',
    'MS SubClass',
    'Mo Sold',
    'Yr Sold',
]

# Every remaining numeric column, in its original order, will be normalized.
normalize_cols = [
    name
    for name in numeric_cols
    if name not in exclude_cols
]

In [38]:
# Impute: replace missing entries in each column to be normalized with that
# column's mean.
column_means = df[normalize_cols].mean()
df[normalize_cols] = df[normalize_cols].fillna(column_means)

# Scale: fit a Min-Max scaler on the imputed columns, then overwrite them
# with the transformed values.
scaler = MinMaxScaler()
scaler.fit(df[normalize_cols])
df[normalize_cols] = scaler.transform(df[normalize_cols])

In [40]:
# Write the cleaned frame to CSV without the index column, then print the
# first five rows as a quick check.
df.to_csv(path_or_buf='cleaned_data.csv', index=False)
print(df.head(5))


   Order        PID  MS SubClass MS Zoning  Lot Frontage  Lot Area Street  \
0      1  526301100           20        RL      0.410959  0.142420   Pave   
1      2  526350040           20        RH      0.202055  0.048246   Pave   
2      3  526351010           20        RL      0.205479  0.060609   Pave   
3      4  526353030           20        RL      0.246575  0.046087   Pave   
4      5  527105010           60        RL      0.181507  0.058566   Pave   

  Alley Lot Shape Land Contour  ... Pool Area Pool QC  Fence Misc Feature  \
0   NaN       IR1          Lvl  ...       0.0     NaN    NaN          NaN   
1   NaN       Reg          Lvl  ...       0.0     NaN  MnPrv          NaN   
2   NaN       IR1          Lvl  ...       0.0     NaN    NaN         Gar2   
3   NaN       Reg          Lvl  ...       0.0     NaN    NaN          NaN   
4   NaN       IR1          Lvl  ...       0.0     NaN  MnPrv          NaN   

   Misc Val Mo Sold Yr Sold Sale Type  Sale Condition  SalePrice  
0  0.00