In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from matplotlib import pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import linear_model
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
import catboost
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from IPython.display import HTML
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Quick sanity check: print the names of the files found in ../input.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training data and separate the target column from the predictors.
df = pd.read_csv('../input/train.csv', index_col='Id')
y_df = df['SalePrice']
X_df = df.drop(columns='SalePrice')

# Split the predictors into a numeric frame and a categorical (non-numeric) frame.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cat_features = X_df.select_dtypes(exclude=numerics)
numeric_features = X_df.select_dtypes(include=numerics)
X_df.shape

In [None]:
# Three pre-processing helper functions

QUANTILE_THRESHOLD = 0.3

def numeric_features_processor(X, quantile_threshold=None):
    """
    Prune a frame of numeric features in two steps.

    1. Drop the single column with the most missing values -- but only if
       that column really has missing values (previously one column was
       always dropped, even from a frame without any nulls).
    2. Keep only the columns whose variance is strictly greater than the
       ``quantile_threshold`` quantile of the remaining columns' variances.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric features. The frame is not modified.
    quantile_threshold : float, optional
        Quantile (between 0 and 1) of the column variances used as the
        cut-off. Defaults to the module-level ``QUANTILE_THRESHOLD``.

    Returns
    -------
    pandas.DataFrame
        The surviving columns, in their original relative order.
    """
    if quantile_threshold is None:
        quantile_threshold = QUANTILE_THRESHOLD

    # Missing-value filter: compute the null counts once.
    null_counts = X.isnull().sum()
    kept = X
    if len(null_counts) > 0 and null_counts.max() > 0:
        worst_column = null_counts.sort_values(ascending=False).index[0]
        kept = X.drop(columns=[worst_column])

    # Variance filter: strictly above the requested quantile of all variances.
    variances = kept.var()
    cutoff = variances.quantile(quantile_threshold)
    return kept[variances[variances > cutoff].index]

def cat_features_processor(X, test, max_missing_fraction=0.15):
    """
    Prepare the categorical features for modelling.

    Steps:
    1. When ``test`` is falsy, keep only the columns whose fraction of
       missing values is below ``max_missing_fraction``. When ``test`` is
       truthy no column is filtered out.
    2. Every column that contains the value ``'Gd'`` is treated as an ordinal
       quality scale and re-mapped to integers (missing -> 0). Values that
       are not part of the scale become NaN.
    3. All remaining non-numeric columns are one-hot encoded with
       ``pandas.get_dummies``.

    The work is done on a copy, so the frame passed in is never modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Categorical features.
    test : bool
        True for the test set (skips the missing-value filter).
    max_missing_fraction : float, optional
        Upper bound (exclusive) on the fraction of nulls a column may have
        to be kept when ``test`` is falsy. Defaults to 0.15.

    Returns
    -------
    pandas.DataFrame
    """
    if test:
        cat_features = X.copy()
    else:
        # Same ordering as before: columns sorted by descending null count.
        null_counts = X.isnull().sum().sort_values(ascending=False)
        null_fraction = null_counts / len(X)
        selected = null_fraction.index[(null_fraction < max_missing_fraction).values]
        cat_features = X[selected].copy()

    # Ordinal encodings shared by the quality-like columns.
    ordinal_scale = {np.nan: 0, 'No': 1, 'Mn': 2, 'Av': 3,
                     'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    for col in cat_features.columns:
        if 'Gd' in cat_features[col].unique():
            cat_features[col] = cat_features[col].map(ordinal_scale)

    # Everything that is still non-numeric becomes dummy columns.
    return pd.get_dummies(cat_features)

def features_pre_processing(X, test=False):
    """
    Split ``X`` by dtype and run each half through its own processor.

    The numeric columns go through ``numeric_features_processor`` and the
    remaining columns go through ``cat_features_processor`` (which also
    receives ``test``). The columns of both results are ordered by name.

    return: tuple ``(numeric DataFrame, categorical DataFrame)``
    """
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    def _ordered_by_column_name(frame):
        # Re-order the columns alphabetically by their names.
        return frame.reindex(sorted(frame.columns), axis=1)

    numeric_part = X.select_dtypes(include=numeric_dtypes)
    categorical_part = X.select_dtypes(exclude=numeric_dtypes)

    df_num = _ordered_by_column_name(numeric_features_processor(numeric_part))
    df_cat = _ordered_by_column_name(cat_features_processor(categorical_part, test))
    return df_num, df_cat

In [None]:
class Feature_selector(BaseEstimator, TransformerMixin):
    """
    Transformer that keeps the most informative columns of a feature matrix,
    combining a random-forest importance ranking with univariate
    ``f_regression`` scores.

    A column is kept when its random-forest importance is greater than zero
    OR its f_regression score is above the 95th percentile of all scores.

    Inherits:
    - BaseEstimator: provides get_params() and set_params()
    - TransformerMixin: provides fit_transform()

    NOTE: the column names are not taken from ``X`` (which arrives as a plain
    array from the previous pipeline step). They are looked up from the
    module-level pipelines ``numeric_transformer`` / ``categiral_transformer``
    and the module-level frames ``train_num_features`` /
    ``train_cat_features``, so those globals must exist when ``fit`` runs.

    Parameters
    ----------
    category : bool, default False
        True when the selector sits in the categorical pipeline, False when
        it sits in the numeric pipeline. Decides which globals are consulted.
    """
    def __init__(self, category=False):
        self.category = category

    @staticmethod
    def _poly_feature_names(poly, input_columns):
        """Return the output column names of a fitted PolynomialFeatures step as a list."""
        # scikit-learn 1.0 introduced get_feature_names_out and 1.2 removed
        # get_feature_names; support both so the notebook runs on either.
        if hasattr(poly, 'get_feature_names_out'):
            return list(poly.get_feature_names_out(input_columns))
        return list(poly.get_feature_names(input_columns))

    def fit(self, X, y=None):
        """
        Rank the columns of ``X`` against the target and remember the names
        of the columns worth keeping.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Output of the preceding PolynomialFeatures step.
        y : array-like, optional
            Target values. When omitted, the module-level ``y_df`` is used
            (this keeps calls such as ``pipeline.fit_transform(X)`` working).

        Returns
        -------
        self
        """
        if self.category:
            self.X_coloums_names = self._poly_feature_names(
                categiral_transformer.steps[0][1], train_cat_features.columns)
        else:
            self.X_coloums_names = self._poly_feature_names(
                numeric_transformer.steps[1][1], train_num_features.columns)

        # Honour an explicitly supplied target; fall back to the global one.
        target = y_df if y is None else y

        forest = RandomForestRegressor(max_depth=4, random_state=42, n_estimators=100)
        forest.fit(X, target)
        feature_importances_by_rand_forrest = pd.Series(
            forest.feature_importances_,
            index=self.X_coloums_names).sort_values(ascending=False)

        # Only scores_ is used, and it does not depend on k; k='all' avoids a
        # failure when X has fewer than 20 columns.
        kbest_selector = SelectKBest(f_regression, k='all')
        kbest_selector.fit(X, target)
        feature_importances_by_kbest = pd.Series(
            kbest_selector.scores_,
            index=self.X_coloums_names).sort_values(ascending=False)

        df_feature_importances = pd.DataFrame(
            [feature_importances_by_rand_forrest, feature_importances_by_kbest],
            index=['rand_forest', 'kbest']).transpose()

        kbest_cutoff = df_feature_importances['kbest'].quantile(0.95)
        self.selected_features_names = df_feature_importances.loc[
            (df_feature_importances['rand_forest'] > 0) |
            (df_feature_importances['kbest'] > kbest_cutoff)].index

        return self

    def transform(self, X, y=None):
        """
        Return a DataFrame holding only the columns selected during ``fit``.

        ``X`` is labelled with the column names recorded in ``fit`` and then
        restricted to the selected names.
        """
        X = pd.DataFrame(X, columns=self.X_coloums_names)
        selected_features = X[self.selected_features_names]
        return selected_features

In [None]:
# Build the pre-processing pipelines.
# NOTE: Feature_selector.fit finds the PolynomialFeatures step by position
# (numeric_transformer.steps[1] and categiral_transformer.steps[0]), so the
# order of the steps below must stay in sync with that lookup.

numeric_transformer = Pipeline([('imput',SimpleImputer(strategy='median')),
                                ('extract',PolynomialFeatures(2 ,include_bias=False)),
                                ('select', Feature_selector())])
categiral_transformer = Pipeline([('extract',PolynomialFeatures(2 ,include_bias=False)),
                                  ('select', Feature_selector(category=True))])

In [None]:
# Create the numeric and categorical feature frames for the training data.
train_num_features, train_cat_features = features_pre_processing(X_df)

In [None]:
# Run the training frames through the pipelines.
# The results get their own names instead of overwriting train_num_features /
# train_cat_features: Feature_selector.fit reads the columns of those frames,
# so overwriting them made this cell give different results when re-run.
# The target is passed explicitly rather than relying on a global inside fit.
train_num_selected = numeric_transformer.fit_transform(train_num_features, y_df)
train_num_features_names = train_num_selected.columns

train_cat_selected = categiral_transformer.fit_transform(train_cat_features, y_df)
train_cat_features_names = train_cat_selected.columns

# Concatenate the selected numeric and categorical features into one frame.
df_X_post_process = pd.concat([train_num_selected, train_cat_selected], axis=1)

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(df_X_post_process, y_df, test_size=0.25, random_state=42)

In [None]:
# Train the XGBoost regressor on the training features, columns ordered by name.
sorted_train_columns = sorted(df_X_post_process.columns)
df_X_post_process = df_X_post_process.reindex(sorted_train_columns, axis=1)

xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=104,
)
xgb_reg.fit(df_X_post_process, y_df)

# Score on the same data the model was fitted on (no hold-out set is used here).
train_score = xgb_reg.score(df_X_post_process, y_df)
print(train_score)

# test

In [None]:
# Load the test data, indexed by the Id column like the training data.
test_df = pd.read_csv('../input/test.csv', index_col='Id')

In [None]:
# Pre-process the test features (test=True skips the missing-value filter on
# the categorical columns) and keep the column names for re-labelling later.
test_num_features, test_cat_features = features_pre_processing(test_df, test=True)
test_num_cols = test_num_features.columns
test_cat_cols = test_cat_features.columns
test_num_features.head()

In [None]:
# Process the numeric test features: impute, expand to degree-2 polynomial
# terms, then keep exactly the columns that were selected on the train set.
# NOTE(review): the imputer is re-fitted on the test data here, so its medians
# are not the ones used for training -- confirm this is intended.
numeric_test_transformer = Pipeline([('imput',SimpleImputer(strategy='median')),
                                ('extract',PolynomialFeatures(2 ,include_bias=False))])
test_num_features = numeric_test_transformer.fit_transform(test_num_features)

# scikit-learn 1.0 introduced get_feature_names_out and 1.2 removed
# get_feature_names; use whichever the installed version provides.
test_num_poly = numeric_test_transformer.steps[1][1]
if hasattr(test_num_poly, 'get_feature_names_out'):
    test_num_poly_names = list(test_num_poly.get_feature_names_out(test_num_cols))
else:
    test_num_poly_names = list(test_num_poly.get_feature_names(test_num_cols))

test_num_features = pd.DataFrame(test_num_features, columns=test_num_poly_names)
test_num_features = test_num_features[train_num_features_names]
test_num_features.shape

In [None]:
# Process the categorical test features: expand to degree-2 polynomial terms.
cat_test_extract = PolynomialFeatures(2 ,include_bias=False)
test_cat_features = cat_test_extract.fit_transform(test_cat_features)

# scikit-learn 1.0 introduced get_feature_names_out and 1.2 removed
# get_feature_names; use whichever the installed version provides.
if hasattr(cat_test_extract, 'get_feature_names_out'):
    test_cat_poly_names = list(cat_test_extract.get_feature_names_out(test_cat_cols))
else:
    test_cat_poly_names = list(cat_test_extract.get_feature_names(test_cat_cols))

test_cat_features = pd.DataFrame(test_cat_features, columns=test_cat_poly_names)

# Keep only the features that were selected on the train set.
# Selecting with isin() silently skips train features that do not exist in the
# test data. That can happen because not every category value occurs in the
# test set, so get_dummies does not always create the same columns as it did
# for the train set.
test_cat_features = test_cat_features.loc[:,test_cat_features.columns.isin(train_cat_features_names)]
test_cat_features.shape

In [None]:
# Combine the processed numeric and categorical test features column-wise.
df_X_test_post_process = pd.concat([test_num_features, test_cat_features], axis=1)

### Create the features that exist in the train data set but are missing from the test data set (all of them dummies)

In [None]:
# Align the test matrix with the training matrix in a single step: every
# training column that is missing from the test data is created and filled
# with 0, and the columns end up ordered by name (the same order the model was
# trained with). One reindex replaces the column-by-column insertion loop,
# which fragments the frame when many columns are added.
df_X_test_post_process = df_X_test_post_process.reindex(
    columns=sorted(df_X_post_process.columns), fill_value=0)

### Write the submission file and create a download link for it

In [None]:
# Predict the sale prices for the test set and save them, keyed by Id.
predicted_prices = xgb_reg.predict(df_X_test_post_process)
df = pd.DataFrame({'SalePrice': predicted_prices}, index=test_df.index)

df.to_csv('submission.csv')

def create_download_link(title = "Download CSV file", filename = "data.csv"):
    """
    Build an HTML anchor that links to ``filename``.

    The href value is escaped and wrapped in double quotes so that file names
    containing spaces, quotes or other special characters still produce a
    valid link. ``title`` is inserted as-is and may therefore contain markup.

    Parameters
    ----------
    title : str
        Text shown for the link.
    filename : str
        Path of the file the link points to.

    Returns
    -------
    IPython.display.HTML
    """
    from html import escape  # local import keeps this cell self-contained

    link_markup = '<a href="{filename}">{title}</a>'.format(
        title=title, filename=escape(str(filename), quote=True))
    return HTML(link_markup)

# Display a link for downloading the submission file written with .to_csv above.
create_download_link(filename='submission.csv')