In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the house-prices training CSV into `dataset`; every later "train" cell mutates this frame.
dataset = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

In [None]:
# Read the test CSV and the sample submission, then attach the submission's
# columns to the test rows by Id (inner join keeps only Ids present in both).
# NOTE(review): presumably this is where Test gets its 'SalePrice' column,
# i.e. placeholder predictions rather than real targets - confirm.
Test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
output = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
Test = Test.merge(output, on='Id', how='inner')

In [None]:
# Preview the first five merged test rows.
Test.head()

## 1 - Missing Values

#### 1.1 - Categorical missing values

In [None]:
# Object-dtype (categorical) columns that contain at least one missing value.
# `.any()` also catches columns with exactly one NaN, which a `sum() > 1`
# test would skip.
features_nan = [
    feature for feature in dataset.columns
    if dataset[feature].dtypes == 'O' and dataset[feature].isnull().any()
]
# isnull().mean() is a fraction in [0, 1]; scale by 100 so the printed
# number really is the percentage the label claims.
for feature in features_nan:
    print(feature, np.round(dataset[feature].isnull().mean() * 100, 2), '% missing values')
dataset[features_nan].head(5)

In [None]:
# Categorical NaNs become an explicit 'Missing' label.
def replace_cat_nan(dataset,features_nan):
    """Return a copy of `dataset` with NaNs in the given columns set to 'Missing'.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame; it is copied, not modified.
    features_nan : list
        Names of the columns whose missing entries should be replaced.

    Returns
    -------
    pandas.DataFrame
        The copy, with all other columns left as they were.
    """
    filled = dataset.copy()
    for column in features_nan:
        filled[column] = filled[column].fillna('Missing')
    return filled
# Fill the categorical gaps in the training frame.
dataset = replace_cat_nan(dataset,features_nan)
# Only the last expression of a cell is displayed, so a bare null count here
# would be thrown away; assert the invariant instead.
assert dataset[features_nan].isnull().sum().sum() == 0, 'categorical NaNs remain after imputation'
dataset[features_nan].head(5)

In [None]:
# Same categorical imputation for the test frame.
# `.any()` includes columns with exactly one NaN, which a `sum() > 1` test
# would leave unfilled.
features_nan = [
    feature for feature in Test.columns
    if Test[feature].dtypes == 'O' and Test[feature].isnull().any()
]
Test = replace_cat_nan(Test,features_nan)
# Bare expressions in the middle of a cell are never shown; check explicitly.
assert Test[features_nan].isnull().sum().sum() == 0, 'categorical NaNs remain after imputation'
Test[features_nan].head(5)

#### 1.2 - Numerical missing values

In [None]:
# Non-object (numerical) columns with at least one missing value.
# `.any()` also picks up columns that have exactly one NaN.
numerical_with_nan = [
    feature for feature in dataset.columns
    if dataset[feature].dtypes != 'O' and dataset[feature].isnull().any()
]
# Print the fraction of missing entries per column.
for feature in numerical_with_nan:
    print(feature,np.round(dataset[feature].isnull().mean(),4))

In [None]:
# Replace numerical NaNs with the column median (chosen because the columns
# contain many outliers) and record where the gaps were.
for feature in numerical_with_nan:
    median_value = dataset[feature].median()
    # Indicator column: 1 where the value was missing, 0 otherwise.
    # Must be built before the NaNs are filled.
    dataset[feature+'nan'] = np.where(dataset[feature].isnull(),1,0)
    # Assign the result back: fillna(inplace=True) on a column selection is
    # chained assignment and does not update the frame under Copy-on-Write.
    dataset[feature] = dataset[feature].fillna(median_value)
# A bare null count mid-cell would be discarded, so assert the invariant.
assert not dataset[numerical_with_nan].isnull().any().any(), 'numerical NaNs remain after imputation'
dataset[numerical_with_nan].head(5)

In [None]:
# Numerical imputation for the test frame.
# `.any()` matters here: a `sum() > 1` test skips columns with exactly one
# NaN, which would leave NaNs in the exported test features.
numerical_with_nan = [
    feature for feature in Test.columns
    if Test[feature].dtypes != 'O' and Test[feature].isnull().any()
]
# NOTE(review): medians are taken from Test itself, and the set of '<col>nan'
# indicator columns is derived from Test, so it can differ from the train set.
for feature in numerical_with_nan:
    median_value = Test[feature].median()
    # Indicator column: 1 where the value was missing, 0 otherwise.
    Test[feature+'nan'] = np.where(Test[feature].isnull(),1,0)
    # Assign back instead of chained fillna(inplace=True), which does not
    # update the frame under Copy-on-Write.
    Test[feature] = Test[feature].fillna(median_value)
Test[numerical_with_nan].isnull().sum()

#### 1.3 - Temporal values

In [None]:
# Overwrite each year column with the number of years between that year and
# the sale year (YrSold - year). Re-running this cell would subtract again.
for year_col in ('YearBuilt', 'YearRemodAdd', 'GarageYrBlt'):
    dataset[year_col] = dataset['YrSold'].sub(dataset[year_col])
dataset[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']].head()

In [None]:
# Same year-to-age conversion for the test frame: YrSold minus each year column.
for year_col in ('YearBuilt', 'YearRemodAdd', 'GarageYrBlt'):
    Test[year_col] = Test['YrSold'].sub(Test[year_col])

#### 1.4 - Distribution of numerical values

In [None]:
# These numerical columns are skewed, so replace each with its natural log.
# The head is printed before and after to show the effect.
num_features = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']
print(dataset[num_features].head())
dataset[num_features] = np.log(dataset[num_features])
print(dataset[num_features].head())

In [None]:
# Apply the same natural-log transform to the test frame's columns.
num_features = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']
Test[num_features] = np.log(Test[num_features])

### 2 - Handling Rare Categorical Features

In [None]:
# Names of all object-dtype columns in the training frame, in column order.
categorical_features = [name for name, dtype in dataset.dtypes.items() if dtype == 'O']
dataset[categorical_features].head()

In [None]:
# Collapse infrequent categories: any label whose share of rows is not above
# 1% (including anything absent from the frequent set) becomes 'Rare_var'.
n_rows = len(dataset)
for column in categorical_features:
    share = dataset.groupby(column)['SalePrice'].count() / n_rows
    frequent = share[share.gt(0.01)].index
    dataset[column] = np.where(dataset[column].isin(frequent), dataset[column], 'Rare_var')
dataset[categorical_features].head()

In [None]:
# Ordered label encoding: rank each category by its mean SalePrice
# (lowest mean -> 0) and replace the label with that rank.
for column in categorical_features:
    mean_price = dataset.groupby(column)['SalePrice'].mean()
    ranking = {label: rank for rank, label in enumerate(mean_price.sort_values().index)}
    dataset[column] = dataset[column].map(ranking)
dataset[categorical_features].head()

In [None]:
# Rare-label collapse for the test frame, using the same 1% row-share rule.
# NOTE(review): the frequent-label set is recomputed from Test, so a category
# can be kept in one frame and turned into 'Rare_var' in the other.
categorical_features = [name for name, dtype in Test.dtypes.items() if dtype == 'O']
n_rows = len(Test)
for column in categorical_features:
    share = Test.groupby(column)['SalePrice'].count() / n_rows
    frequent = share[share.gt(0.01)].index
    Test[column] = np.where(Test[column].isin(frequent), Test[column], 'Rare_var')

In [None]:
# Ordered label encoding for the test frame: rank categories by the mean of
# Test['SalePrice'] (lowest mean -> 0) and replace labels with ranks.
# NOTE(review): the ranking is derived from Test, independently of the
# training frame, so the same category can receive a different integer code
# in X_train and X_test. Confirm whether the train mapping should be reused.
for column in categorical_features:
    mean_price = Test.groupby(column)['SalePrice'].mean()
    ranking = {label: rank for rank, label in enumerate(mean_price.sort_values().index)}
    Test[column] = Test[column].map(ranking)
Test[categorical_features].head()

### 3 - Feature Scaling

In [None]:
# Every training column except the identifier and the target gets scaled.
not_scaled = ('Id', 'SalePrice')
scaling_feature = [column for column in dataset.columns if column not in not_scaled]
dataset[scaling_feature].head()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Fit a min-max scaler on the training features (fit returns the estimator
# itself), then display the scaled array as the cell output.
scaler = MinMaxScaler().fit(dataset[scaling_feature])
scaler.transform(dataset[scaling_feature])

In [None]:
# Scale the training features and put the unscaled Id and SalePrice columns
# back in front of them.
scaled_part = pd.DataFrame(scaler.transform(dataset[scaling_feature]), columns=scaling_feature)
id_and_target = dataset[['Id', 'SalePrice']].reset_index(drop=True)
data = pd.concat([id_and_target, scaled_part], axis=1)

In [None]:
# Write the scaled training frame (Id, SalePrice, scaled features) to disk without the index column.
data.to_csv('X_train.csv',index=False)

In [None]:
# Scale the test features and re-attach the unscaled Id and SalePrice columns.
# NOTE(review): a fresh MinMaxScaler is fitted on Test, so test features are
# scaled with Test's own min/max rather than the ranges learned from train.
# This cell also rebinds `scaling_feature`, `scaler` and `data`, replacing
# the training versions.
scaling_feature = [feature for feature in Test.columns if feature not in ['Id','SalePrice']]
scaler = MinMaxScaler()
scaler.fit(Test[scaling_feature])
# Transform once, directly into the output frame; a separate transform call
# whose result is discarded would only repeat the work.
data = pd.concat([Test[['Id', 'SalePrice']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(Test[scaling_feature]), columns=scaling_feature)],
                    axis=1)

In [None]:
# Show (rows, columns) of the assembled frame currently bound to `data`.
data.shape

In [None]:
# Write the frame currently bound to `data` to X_test.csv without the index column.
data.to_csv('X_test.csv',index=False)