<a href="https://colab.research.google.com/github/calamistratus/Houses_project/blob/main/Fixing_the_NaNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [122]:
import numpy as np
import pandas as pd

from IPython.display import clear_output

In [101]:
 #  Fixes NaNs according to the parameters provided.
 #  With no 'by' and 'gentle=False', fills NaNs with 0 or 'no'.
 #  With no 'by' and 'gentle=True', fills NaNs with either the most frequent value or the mean of the column.
 #  With a list of 'by' columns and 'gentle=False', groups by those columns and fills NaNs from each group's statistics.
 #  With a truthy 'by', 'gentle=True', a scaler, a model, and a goal (the target variable's column name), fills NaNs with the model's predictions based on the other columns.

def fix_na(series, gentle=False, by=None, dataframe=None, model=None, scaler=None, goal=''):
    """Return a copy of ``series`` with its NaNs filled in.

    The strategy is selected by ``by`` and ``gentle``:

    * no ``by``, ``gentle=False``: fill with ``0`` (numeric) or ``'no'``.
    * no ``by``, ``gentle=True``: fill with the column mean (numeric) or the
      most frequent value.
    * ``by`` columns, ``gentle=False``: fill each row with the mean / most
      frequent value of the rows that share its ``by`` values. Rows whose
      ``by`` values are missing, or whose group has no known value, fall back
      to the whole-column mean / most frequent value.
    * truthy ``by``, ``gentle=True``: fit ``model`` on the rows where the value
      is known and fill the gaps with its predictions.

    Args:
        series: The column to fix, or its name in ``dataframe``.
        gentle: See the strategy list above.
        by: Column name or list of column names to group by; any truthy value
            acts as a flag for the model strategy when ``gentle`` is true.
        dataframe: Frame that holds ``series`` and the ``by`` / feature
            columns. Required when ``by`` is given or ``series`` is a name.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        scaler: Optional transformer exposing ``fit_transform`` and
            ``transform``; features are used unscaled when it is ``None``.
        goal: Name of the target column, excluded from the model features.

    Returns:
        A new series; neither ``series`` nor ``dataframe`` is modified.

    Raises:
        ValueError: If ``dataframe`` or ``model`` is needed but missing.
    """
    if isinstance(series, str):
        if dataframe is None:
            raise ValueError("'dataframe' is required when 'series' is a column name")
        series = dataframe[series]

    if not by:
        return _fill_plain(series, gentle)

    if dataframe is None:
        raise ValueError("'dataframe' is required when 'by' is given")
    if gentle:
        return _fill_by_model(series, dataframe, model, scaler, goal)
    return _fill_by_group(series, dataframe, by)


def _typical_value(series):
    """Return the mean of a numeric series, else its most frequent value (NaN if none)."""
    if pd.api.types.is_numeric_dtype(series.dtype):
        return series.mean()
    counts = series.value_counts()
    return counts.index[0] if len(counts) else np.nan


def _fill_plain(series, gentle):
    """Fill NaNs with a placeholder, or with the typical value when ``gentle``."""
    if gentle:
        value = _typical_value(series)
        # An all-NaN column has no typical value; leave it untouched.
        return series.copy() if pd.isna(value) else series.fillna(value)
    placeholder = 0 if pd.api.types.is_numeric_dtype(series.dtype) else 'no'
    return series.fillna(placeholder)


def _fill_by_group(series, dataframe, by):
    """Fill NaNs with the typical value of the rows sharing the same ``by`` values."""
    columns = [by] if isinstance(by, str) else list(by)
    stats = dataframe.groupby(columns)[series.name].agg(_typical_value).dropna()

    # Normalise group keys to tuples so one and many `by` columns share a path.
    if isinstance(stats.index, pd.MultiIndex):
        group_keys = stats.index.tolist()
    else:
        group_keys = [(key,) for key in stats.index.tolist()]
    lookup = dict(zip(group_keys, stats.tolist()))

    fallback = _typical_value(series)
    row_keys = zip(*(dataframe[col].tolist() for col in columns))
    fills = [lookup.get(key, fallback) for key in row_keys]
    # fillna aligns on labels, so the fills must carry the frame's own index.
    return series.fillna(pd.Series(fills, index=dataframe.index))


def _fill_by_model(series, dataframe, model, scaler, goal):
    """Fill NaNs with predictions of ``model`` trained on the rows with known values."""
    if model is None:
        raise ValueError("'model' is required when both 'by' and 'gentle' are set")

    target = series.name
    # Work on a private copy without the goal column.
    features = dataframe.drop(columns=[goal]) if goal else dataframe.copy()

    # The model cannot train on NaNs, so patch every other column gently first.
    for col in features.columns:
        if col != target and features[col].isnull().any():
            features[col] = fix_na(features[col], gentle=True)

    encoded = pd.get_dummies(features.drop(columns=[target]), drop_first=True)
    known = features[target].notnull()
    x_known = encoded[known]
    x_all = encoded
    if scaler is not None:
        # Fit the scaler once on the training rows and reuse it for prediction.
        x_known = scaler.fit_transform(x_known)
        x_all = scaler.transform(encoded)

    model.fit(x_known, features[target][known])
    # fillna aligns on labels, so the predictions must carry the frame's own index.
    predicted = pd.Series(model.predict(x_all), index=encoded.index)
    return series.fillna(predicted)


def fix_na_set(dataframe, column, gentle = False, by='', model=None, scaler=None, goal=''):
    """Fill the NaNs of ``dataframe[column]`` and store the result back in place.

    All options are forwarded unchanged to ``fix_na``; the series it returns
    replaces the column in ``dataframe``. Nothing is returned.
    """
    options = {'gentle': gentle, 'by': by, 'model': model, 'scaler': scaler, 'goal': goal}
    fixed_column = fix_na(series=dataframe[column], dataframe=dataframe, **options)
    dataframe[column] = fixed_column



def check_na(dataframe, column = ''):
    """Summarise the NaNs of a dataframe, or inspect one column in detail.

    Without ``column``: return the per-column NaN counts, keeping only the
    columns that have at least one NaN.

    With ``column``: print the ten most frequent values and the NaN count of
    that column, then return a frame made of up to five rows where the column
    is NaN, one all-False divider row, and an equally sized random sample of
    rows for comparison.
    """
    if not column:
        na_counts = dataframe.isnull().sum()
        return na_counts[na_counts > 0]

    values = dataframe[column]
    is_missing = values.isnull()
    print('\n', values.value_counts().head(10), '\n\n', 'Number of na:', is_missing.sum(), '\n')

    missing_rows = dataframe[is_missing].head()
    # Nothing compares equal to NaN, so this is a single row of False values.
    divider = dataframe.head(1) == np.nan
    random_rows = dataframe.sample(len(missing_rows))
    return pd.concat([missing_rows, divider, random_rows], axis=0)

In [120]:
def only_important(column, df, gentle=False, num=50):
    """Return ``df[column]`` with its rare values replaced.

    A value is rare when it occurs fewer than ``num`` times in the column.
    Rare values become ``'no'``, or the most frequent value of the column when
    ``gentle`` is set. Values that are absent from the frequency table (the
    KeyError path) always become ``'no'``.
    """
    frequency = df[column].value_counts()

    def replace_rare(value):
        try:
            is_rare = frequency.loc[value] < num
        except KeyError:
            return 'no'
        if not is_rare:
            return value
        return frequency.keys()[0] if gentle else 'no'

    return df[column].apply(replace_rare)

In [121]:
# Folder that holds train.csv and test.csv.
path = '/content/drive/MyDrive/Documents/home-data-for-ml-course/'

# Read both files, train first, then test.
train, test = (pd.read_csv(path + file_name) for file_name in ('train.csv', 'test.csv'))

# Stack them into one frame with a fresh 0..n-1 index so both get the same NaN fixes.
combined_parts = [train, test]
df = pd.concat(combined_parts, axis=0, ignore_index=True)
df.to_csv('df.csv')   # might come in handy for further statistics.

In [86]:
# Print each column's dtype and non-null count to spot the columns with NaNs.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             2919 non-null   int64  
 1   MSSubClass     2919 non-null   int64  
 2   MSZoning       2915 non-null   object 
 3   LotFrontage    2433 non-null   float64
 4   LotArea        2919 non-null   int64  
 5   Street         2919 non-null   object 
 6   Alley          198 non-null    object 
 7   LotShape       2919 non-null   object 
 8   LandContour    2919 non-null   object 
 9   Utilities      2917 non-null   object 
 10  LotConfig      2919 non-null   object 
 11  LandSlope      2919 non-null   object 
 12  Neighborhood   2919 non-null   object 
 13  Condition1     2919 non-null   object 
 14  Condition2     2919 non-null   object 
 15  BldgType       2919 non-null   object 
 16  HouseStyle     2919 non-null   object 
 17  OverallQual    2919 non-null   int64  
 18  OverallC

In [119]:
# Manual check of every column: press Enter for the next column,
# type anything else to stop.
for column_name in df.columns:
    if input():
        break
    clear_output()
    check_na(df, column_name)   # Prints the value counts and NaN count of the column.

stop


In [109]:
df = df.set_index('Id')   # Use the 'Id' column as the row index.

In [110]:
# Columns where a NaN means the house simply lacks the feature, so the default
# fill of fix_na_set (0 for numbers, 'no' for text) is applied to each of them.
absent_feature_columns = [
    # Most houses just don't have an alley.
    'Alley',
    # Most houses just don't have a veneer.
    'MasVnrType', 'MasVnrArea',
    # Around 150 houses don't have a basement.
    'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
    # Most houses don't have a fireplace.
    'FireplaceQu',
    # Around 150 houses don't have a garage.
    'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageArea', 'GarageQual', 'GarageCond',
    # 2348 houses don't have a fence.
    'Fence',
    # Most houses don't have additional miscellaneous features.
    'MiscFeature',
]

for column_name in absent_feature_columns:
    fix_na_set(df, column_name)

In [111]:
# Columns with only a stray NaN, not worth further research: fill them gently
# (mean for numbers, most frequent value for text).
sparse_gap_columns = [
    'Exterior1st',
    'Exterior2nd',
    'KitchenQual',
    'Functional',
    'GarageCars',
    'SaleType',
    'Electrical',
]

for column_name in sparse_gap_columns:
    fix_na_set(df, column_name, gentle=True)

In [112]:
# 'Street': only 12 insignificant entries differ from 'Pave'.
# 'Utilities': only 2 NaNs and one house that's different.
df = df.drop(['Street', 'Utilities'], axis=1)

# Only 8 houses have a pool, so keep just a has-pool flag; it is still worth
# keeping for its value.
df['PoolQC'] = df['PoolQC'].notna()

In [113]:
# These columns have many categories of little significance, with counts of 1-9,
# that litter the data. Categories seen fewer than 10 times are replaced with the
# most frequent value of their column. An exception was made for the quality
# parameters because of their sheer value.
#
# Each column is rebuilt from itself, so a result can never be written into the
# wrong column, and 'gentle=True' is the keyword only_important actually accepts.
rare_category_columns = [
    'Condition2',
    'RoofMatl',
    'RoofStyle',
    'Heating',
    'Electrical',
    'Functional',
]

for column_name in rare_category_columns:
    df[column_name] = only_important(column_name, df, gentle=True, num=10)

In [114]:
# A small number of NaNs, which are related to 'Neighborhood': fill each one
# from the rows of the same neighborhood.
fix_na_set(df, 'MSZoning', gentle=False, by=['Neighborhood'])

In [115]:
from xgboost import XGBRegressor    # Fast and effective in most situations.
from sklearn.preprocessing import StandardScaler

# 'LotFrontage' is hardly predictable, so a model is used to capture as much as
# possible from the other columns; 'SalePrice' is passed as the goal column.
fix_na_set(
    df,
    'LotFrontage',
    gentle=True,
    by=True,
    model=XGBRegressor(),
    scaler=StandardScaler(),
    goal='SalePrice',
)

In [116]:
# NaN count per column, ignoring 'SalePrice'; only columns that still have NaNs are shown.
remaining_na = df.isnull().sum().drop('SalePrice', axis=0)
remaining_na[remaining_na > 0]  # No NaNs left!

Unnamed: 0,0


In [117]:
# Save the cleaned frame to 'fixed_df.csv'.
df.to_csv('fixed_df.csv')