#### A notebook for getting the prepared data easily

Credit goes to this excellent data-exploration [*notebook*](https://www.kaggle.com/code/pmarcelino/comprehensive-data-exploration-with-python)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [3]:

def get_data(train_path,test_path):
    """Load the training and test CSV files.

    Args:
        train_path: Location of the training CSV, passed to ``pd.read_csv``.
        test_path: Location of the test CSV, passed to ``pd.read_csv``.

    Returns:
        A ``(train, test)`` tuple of DataFrames, in that order.
    """
    train_df, test_df = (
        pd.read_csv(csv_path) for csv_path in (train_path, test_path)
    )
    return train_df, test_df


def _fill_missing(df):
    """Return a copy of *df* with null entries imputed column by column.

    Numeric columns are filled with their mean; every other column is filled
    with its most frequent value (first entry of ``value_counts()``).
    """
    df = df.copy()
    null_counts = df.isnull().sum()
    for col in null_counts.index[null_counts > 0]:
        if pd.api.types.is_numeric_dtype(df[col]):
            fill_value = df[col].mean()
        else:
            frequencies = df[col].value_counts()
            if frequencies.empty:
                # Entirely-null non-numeric column: there is no mode to use.
                continue
            fill_value = frequencies.index[0]
        # Assign the result back rather than calling
        # ``df[col].fillna(..., inplace=True)``: the chained in-place form
        # does not update ``df`` when pandas copy-on-write is active.
        df[col] = df[col].fillna(fill_value)
    return df


def _add_basement_features(df):
    """Return a copy of *df* with ``HasBsmt`` added and ``TotalBsmtSF`` logged.

    ``HasBsmt`` is 1 where ``TotalBsmtSF > 0`` and 0 elsewhere. The log is
    applied only to the rows with ``HasBsmt == 1``; the other rows keep
    their original value.
    """
    df = df.copy()
    has_basement = df['TotalBsmtSF'] > 0
    df['HasBsmt'] = has_basement.astype('int64')
    # Cast first so the float logs are not written into an integer column.
    df['TotalBsmtSF'] = df['TotalBsmtSF'].astype('float64')
    # Take the log of the selected rows only, so log(0) is never evaluated.
    df.loc[has_basement, 'TotalBsmtSF'] = np.log(df.loc[has_basement, 'TotalBsmtSF'])
    return df


def clean_data(train_path, test_path, drop_ids=(1299, 524)):
    """Load the train/test CSVs and return model-ready feature matrices.

    Steps, in order:

    1. Drop every column that has more than one null value in the training
       set from both frames, then drop the training rows whose
       ``Electrical`` value is null.
    2. Impute the remaining null values of the test set (mean for numeric
       columns, most frequent value otherwise).
    3. Remove the training rows whose ``Id`` is listed in *drop_ids*.
    4. Log-transform ``SalePrice`` (train only) and ``GrLivArea``.
    5. Add a 0/1 ``HasBsmt`` column and log-transform ``TotalBsmtSF`` where
       it is positive.
    6. One-hot encode both frames and align the test columns to the
       training columns.

    Args:
        train_path: Location of the training CSV.
        test_path: Location of the test CSV.
        drop_ids: ``Id`` values of training rows to discard. The defaults are
            the two Ids that used to be hard-coded here (presumably outliers
            found in the credited exploration notebook -- TODO confirm).

    Returns:
        A tuple ``(X_train, y, X_test)``: the training features, the
        log-transformed ``SalePrice`` target, and the test features with
        exactly the columns of ``X_train`` in the same order.
    """
    train, test = get_data(train_path, test_path)

    # Columns with more than one null value in the training set are removed
    # from both frames.
    train_null_counts = train.isnull().sum()
    sparse_cols = train_null_counts.index[train_null_counts > 1]
    # Keyword form: the positional ``axis`` argument of ``drop`` was removed
    # in pandas 2.0.
    train = train.drop(columns=sparse_cols)
    train = train.drop(index=train.index[train['Electrical'].isnull()])

    # ``errors='ignore'`` tolerates a sparse training column that the test
    # file does not contain.
    test = test.drop(columns=sparse_cols, errors='ignore')
    test = _fill_missing(test)

    train = train.drop(index=train.index[train['Id'].isin(list(drop_ids))])

    # Log transformations.
    train['SalePrice'] = np.log(train['SalePrice'])
    train['GrLivArea'] = np.log(train['GrLivArea'])
    test['GrLivArea'] = np.log(test['GrLivArea'])

    # Binary flag for having a basement, plus log of the basement area.
    train = _add_basement_features(train)
    test = _add_basement_features(test)

    # Convert categorical variables into dummy columns.
    train = pd.get_dummies(train)
    test = pd.get_dummies(test)

    X_train = train.drop(columns=['SalePrice'])
    y = train['SalePrice']
    # Dummy columns missing from the test set are added with value 0, columns
    # unknown to the training set are dropped, and the column order follows
    # the training set.
    X_test = test.reindex(columns=X_train.columns, fill_value=0)

    return X_train, y, X_test