In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
!sudo pip install xgboost
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
# Show every column when a DataFrame is displayed (wide frames are not truncated).
# Uses the public `pd.set_option` entry point rather than the `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Read the files

In [None]:
# Load the training and test tables from the competition input directory
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Shape of the training dataset as (number of records, number of columns)
train_df.shape

## Analyze and find out the type of each feature

In [None]:
# Temporal features: columns whose name contains a year marker ("Yr" or "Year")
features = train_df.columns
year_markers = ('Yr', 'Year')
temporal_features = [
    column for column in features
    if any(marker in column for marker in year_markers)
]
temporal_features

In [None]:
# Numerical features: every column whose dtype is not `object`
numerical_features = []
for column in features:
    if train_df[column].dtypes != 'object':
        numerical_features.append(column)
print(f'Number of numerical features: {len(numerical_features)}')
print(numerical_features)

In [None]:
# Categorical features are the union of:
#   * non-temporal numerical columns with fewer than 25 distinct values, and
#   * all string (object dtype) columns.
low_cardinality_numeric = [
    column for column in numerical_features
    if column not in temporal_features and train_df[column].nunique() < 25
]
string_features = [column for column in features if train_df[column].dtypes == 'object']
categorical_features = low_cardinality_numeric + string_features
print(f'Number of categorical features: {len(categorical_features)}')
print(categorical_features)

In [None]:
# Continuous features: whatever is neither categorical, temporal, nor the "Id" column
excluded = set(categorical_features) | set(temporal_features) | {"Id"}
continous_features = [column for column in features if column not in excluded]
print(f'Number of continous features: {len(continous_features)}')
print(continous_features)

## Analyzing the distribution of continuous values

In [None]:
# One 30-bin histogram per continuous feature of the training set
for column in continous_features:
    axes = train_df[column].hist(bins=30)
    axes.set_title(column)
    plt.show()

Most of the continuous features are positively skewed. To bring their distributions closer to normal, I will apply a log transformation.

## Transforming training data with log transformation

In [None]:
# Log-transform every continuous training feature that is strictly positive.
# Columns containing zeros (or negative values) are left untouched because the
# logarithm is undefined there; NaN entries compare False and simply stay NaN.
# NOTE(review): the target appears to be part of `continous_features` at this point
# (predictions are exponentiated later) - confirm.
# NOTE: this cell is not idempotent; running it twice applies the log twice.
for feature in continous_features:
    if (train_df[feature] <= 0).any():
        continue
    train_df[feature] = np.log(train_df[feature])

In [None]:
# Visual check that the features without zero values were transformed as intended
for column in continous_features:
    axes = train_df[column].hist(bins=30)
    axes.set_title(column)
    plt.show()

## Identifying Outliers

In [None]:
# One box plot per continuous feature to make outliers visible
for column in continous_features:
    axes = train_df.boxplot(column=column)
    axes.set_title(column)
    plt.show()

There are a lot of outliers in this dataset, so it is better to use the median/mode (which are robust to outliers) to fill in the missing values.

## Handling Missing Values

In [None]:
# Helper that reports the percentage of missing values per column
def missing_percentage(df, features):
    """Print the share of missing values (in percent) for the given columns.

    Only columns with at least one missing value are printed, sorted in
    ascending order of their missing percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    features : list-like
        Column labels of `df` to check.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    null_counts = df[features].isnull().sum()
    percentages = null_counts * 100 / len(df)
    has_missing = percentages > 0
    print(percentages[has_missing].sort_values())

In [None]:
# Percentage of missing values in each string (object dtype) column of the training set
missing_percentage(train_df, string_features)

In [None]:
# Preview the first rows of the training frame before missing values are filled
train_df.head()

In [None]:
# Missing values in string-valued categorical columns become their own
# category, labelled 'None', in both the training and the test frame
for frame in (train_df, test_df):
    frame[string_features] = frame[string_features].fillna('None')

In [None]:
# Re-check the training set: string columns that still contain missing values are printed
missing_percentage(train_df, string_features)

In [None]:
# Re-check the test set: string columns that still contain missing values are printed
missing_percentage(test_df, string_features)

In [None]:
# Percentage of missing values in each numerical feature of the training set
missing_percentage(train_df, numerical_features)

In [None]:
# Remove the target from the numerical feature list so the list can also be used
# on the test frame. Removing by name (instead of a positional `.pop()`) keeps the
# cell idempotent: re-running it no longer discards a real feature.
# NOTE(review): assumes the element `.pop()` removed was 'SalePrice' (last train column) - confirm.
if 'SalePrice' in numerical_features:
    numerical_features.remove('SalePrice')

In [None]:
# Percentage of missing values in each numerical feature of the test set
missing_percentage(test_df, numerical_features)

In [None]:
# Handling missing values for numerical features:
# missing values are replaced by the median of that feature.
# Each frame uses its own median: at this point some continuous columns of train_df
# have already been log-transformed while those of test_df have not, so the two
# frames are not on the same scale.
# The result is assigned back to the column; calling `fillna(..., inplace=True)` on
# a column selection is chained assignment, which pandas warns about and which does
# not update the parent frame when Copy-on-Write is enabled.

for feature in numerical_features:
    train_df[feature] = train_df[feature].fillna(train_df[feature].median())
    test_df[feature] = test_df[feature].fillna(test_df[feature].median())

In [None]:
# Re-check the training set: numerical features that still contain missing values are printed
missing_percentage(train_df, numerical_features)

In [None]:
# Re-check the test set: numerical features that still contain missing values are printed
missing_percentage(test_df, numerical_features)

In [None]:
# Preview the training frame after imputation
train_df.head()

In [None]:
# Preview the test frame after imputation
test_df.head()

## Transforming test data with log transformation

In [None]:
# Remove the target from the continuous feature list before it is applied to the
# test frame. Removing by name (instead of a positional `.pop()`) keeps the cell
# idempotent: re-running it no longer discards a real feature.
# NOTE(review): assumes the element `.pop()` removed was 'SalePrice' (last train column) - confirm.
if 'SalePrice' in continous_features:
    continous_features.remove('SalePrice')

In [None]:
# Continuous features that remain after the removal in the previous cell
continous_features

In [None]:
# One 30-bin histogram per continuous feature of the test set (before transformation)
for column in continous_features:
    axes = test_df[column].hist(bins=30)
    axes.set_title(column)
    plt.show()

In [None]:
# Log-transform every continuous test feature that is strictly positive.
# Columns containing zeros (or negative values) are left untouched because the
# logarithm is undefined there.
# NOTE(review): the decision is taken from the test values alone; a column that has
# zeros in only one of train/test would end up on different scales in the two frames.
# NOTE: this cell is not idempotent; running it twice applies the log twice.
for feature in continous_features:
    if (test_df[feature] <= 0).any():
        continue
    test_df[feature] = np.log(test_df[feature])

In [None]:
# Histograms of the continuous test features after the log transformation
for column in continous_features:
    axes = test_df[column].hist(bins=30)
    axes.set_title(column)
    plt.show()

## Handling rare categorical values

In [None]:
# Re-define the categorical list as the object-dtype columns of the training frame
# (this replaces the earlier list that also held low-cardinality numerical columns)
categorical_features = [column for column, dtype in train_df.dtypes.items() if dtype == 'O']

In [None]:
# Object-dtype columns that will be checked for rare categories
categorical_features

In [None]:
# Tag each frame with its origin (1 = train, 0 = test) and stack them so that
# category handling and encoding are applied to both parts consistently.
# The original row labels are kept, so index values repeat across the two parts.
for frame, is_train in ((train_df, 1), (test_df, 0)):
    frame["train"] = is_train
concatted = pd.concat([train_df, test_df], axis=0)



In [None]:
# Collapse rare levels: every category that covers at most 1% of all rows
# (train and test together) is replaced by the label 'Other'.
rare_category_min_share = 0.01
for feature in categorical_features:
    # Share of rows per category, computed from the column itself rather than
    # through the non-null count of an unrelated column.
    category_share = concatted[feature].value_counts() / len(concatted)
    # Levels that are frequent enough to be kept as they are.
    frequent_categories = category_share[category_share > rare_category_min_share].index
    concatted[feature] = np.where(concatted[feature].isin(frequent_categories), concatted[feature], 'Other')

In [None]:
# Preview the combined frame after rare categories were replaced
concatted.head()

## Applying one hot encoding

In [None]:
# Store MSSubClass as strings so that it is one-hot encoded with the other categoricals.
# NOTE(review): this happens after the rare-category step, so its levels are not collapsed.
concatted['MSSubClass'] = concatted['MSSubClass'].map(str)

In [None]:
# Split the combined frame by dtype: object columns go to the encoder,
# all remaining columns are kept as they are
is_object_column = concatted.dtypes == 'object'
categorical_concatted_df = concatted.loc[:, is_object_column]
numerical_concatted_df = concatted.loc[:, ~is_object_column]

In [None]:
# Display the object-dtype part of the combined frame
categorical_concatted_df

In [None]:
# One-hot encode every categorical column; drop_first=True omits the first level
# of each column, giving k-1 dummy columns for a feature with k levels
categorical_dummies_concatted = pd.get_dummies(categorical_concatted_df, drop_first=True)

In [None]:
# Put the dummy columns and the non-object columns side by side again
final_combined = pd.concat([categorical_dummies_concatted, numerical_concatted_df], axis=1)

In [None]:
# Split the encoded frame back into its train and test parts using the flag column.
# `.copy()` makes each part an independent frame, so dropping columns in place
# afterwards does not operate on a slice of `final_combined` (SettingWithCopyWarning).
train_df = final_combined[final_combined["train"] == 1].copy()
test_df = final_combined[final_combined["train"] == 0].copy()

In [None]:
# Remove the helper flag from both parts and the target column from the test part.
# The result is re-bound instead of using `inplace=True`, which avoids mutating a
# frame produced by boolean filtering (the source of SettingWithCopyWarning).
train_df = train_df.drop(['train'], axis=1)
test_df = test_df.drop(['train', 'SalePrice'], axis=1)

In [None]:
# Display the final training frame (encoded predictors plus SalePrice)
train_df

## Implementing Gradient Boosting

In [None]:
# Separate the predictors from the target.
# NOTE(review): the 'Id' column is never removed from the frame, so it is used as a
# predictor here - confirm whether that is intended.
X = train_df.drop(['SalePrice'], axis=1)
y = train_df['SalePrice']
# Candidate hyper-parameters: 3 learning rates x 3 ensemble sizes x 4 tree depths
grid_param = {'learning_rate': [0.05, 0.1, 0.2], 'n_estimators': [64,100,128], 'max_depth': [2, 3, 4, 5]}
# A fixed random_state makes the search reproducible across runs;
# n_jobs=-1 evaluates the candidates in parallel on all available cores.
grid_search_obj = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_grid=grid_param,
    n_jobs=-1,
)
grid_search_obj.fit(X, y)

In [None]:
# Hyper-parameter combination with the best cross-validated score
grid_search_obj.best_params_

In [None]:
# Fit the final gradient boosting model on the full training data.
# NOTE(review): the hyper-parameters are hard-coded; presumably they were copied from
# `best_params_` of an earlier run - confirm they still match the search result.
# A fixed random_state makes the fitted model (and the submission) reproducible.
gb_obj = GradientBoostingRegressor(learning_rate=0.1, max_depth=3, n_estimators=128, random_state=42)
gb_obj.fit(X, y)

In [None]:
# Predict the target for the test set with the fitted gradient boosting model
y_pred = gb_obj.predict(test_df)

In [None]:
# Map the predictions back from log space with exp
# (assumes the target was log-transformed during training-data preparation)
y_pred_inverse = np.exp(y_pred)

In [None]:
# Build the submission: Ids from the sample file next to the back-transformed
# predictions, matched by row position, then written without the index
sub_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
pred = pd.DataFrame(y_pred_inverse)
datasets = pd.concat([sub_df['Id'], pred], axis=1).rename(columns={0: 'SalePrice'})
datasets.to_csv('sample_submission.csv', index=False)

## Implementing XGBoost

In [None]:
# Hyper-parameter grid for XGBoost.
# `subsample` and `colsample_bytree` are documented with the range (0, 1]; the value 0
# was removed from both lists because it is outside that range and only wastes fits
# (candidates shrink from 1296 to 900).
param_grid = {
    'n_estimators': [64, 100, 128],
    'max_depth': [2, 3, 4, 5],
    'eta': [0.01, 0.1, 0.3],
    'subsample': [0.1, 0.3, 0.5, 0.7, 1],
    "colsample_bytree": [0.1, 0.3, 0.5, 0.7, 1],
}
# NOTE: this re-binds `grid_search_obj`, replacing the gradient boosting search result
grid_search_obj = GridSearchCV(XGBRegressor(), param_grid=param_grid)
grid_search_obj.fit(X, y)

In [None]:
# Hyper-parameter combination with the best cross-validated score for XGBoost
grid_search_obj.best_params_

In [None]:
# Final XGBoost model with hard-coded hyper-parameters
# NOTE(review): presumably taken from `best_params_` of an earlier run - confirm
xgb_params = {
    'colsample_bytree': 0.5,
    'eta': 0.1,
    'max_depth': 4,
    'n_estimators': 128,
    'subsample': 1,
}
xg_obj = XGBRegressor(**xgb_params)

In [None]:
# Fit the XGBoost model on the full training data
xg_obj.fit(X, y)

In [None]:
# Predict the target for the test set with the fitted XGBoost model
y_pred = xg_obj.predict(test_df)

In [None]:
# Raw model output, before the inverse transformation
y_pred

In [None]:
# Map the predictions back from log space with exp
# (assumes the target was log-transformed during training-data preparation)
y_pred_inverse = np.exp(y_pred)

In [None]:
# Build the XGBoost submission: Ids from the sample file next to the back-transformed
# predictions, matched by row position.
# NOTE: this writes to the same file name as the Gradient Boosting section and
# therefore overwrites that earlier submission.
sub_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
pred = pd.DataFrame(y_pred_inverse)
datasets = pd.concat({'Id': sub_df['Id'], 'SalePrice': pred[0]}, axis=1)
datasets.to_csv('sample_submission.csv', index=False)

In [None]:
# Final XGBoost predictions after applying exp
y_pred_inverse