DataCamp: "Extreme Gradient Boosting with XGBoost in Python"

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the unprocessed Ames data from a CSV expected in the working directory.
df=pd.read_csv('ames_unprocessed_data.csv')

# Preprocessing

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Summarise the frame's columns (dtypes and non-null counts) before encoding.
df.info()

# LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# Missing lot frontage is replaced by 0.
df.LotFrontage = df.LotFrontage.fillna(0)

# Object-dtype columns are the ones treated as categorical.
categorical_mask = (df.dtypes == object)
categorical_columns = list(df.columns[categorical_mask])

# Categorical columns before encoding.
print(df[categorical_columns].head())

# A single encoder instance is re-fitted column by column, so after this
# statement `le` only remembers the classes of the last column it saw.
# The encoded values overwrite the original columns of `df` in place.
le = LabelEncoder()
df[categorical_columns] = df[categorical_columns].apply(le.fit_transform)

# The same columns after encoding.
print(df[categorical_columns].head())

# OneHotEncoding

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder no longer accepts `categorical_features`; selecting which
# columns to encode is ColumnTransformer's job.
#   * remainder="passthrough" keeps the non-categorical columns, stacked to
#     the right of the encoded ones (the layout the old argument produced).
#   * sparse_threshold=0 always returns a dense array, which is what the old
#     `sparse=False` asked for, without depending on the encoder's own
#     sparse/sparse_output keyword.
ohe = ColumnTransformer(
    transformers=[("ohe", OneHotEncoder(), categorical_columns)],
    remainder="passthrough",
    sparse_threshold=0,
)
df_encoded = ohe.fit_transform(df)

# First five encoded rows, then the shape before and after encoding.
print(df_encoded[:5, :])
print(df.shape)
print(df_encoded.shape)

# DictVectorizer

In [None]:
from sklearn.feature_extraction import DictVectorizer

# One dict per row, keyed by column name.
df_dict = df.to_dict(orient='records')

# Vectorise the row dicts with the default DictVectorizer settings.
dv = DictVectorizer()
df_encoded = dv.fit_transform(df_dict)

# First five encoded rows.
print(df_encoded[:5, :])

# Mapping from feature name to its column index in the encoded output.
print(dv.vocabulary_)

# Pipeline

In [None]:
# Features are every column except the last one
# (assumes the target, SalePrice, is the final column -- TODO confirm).
# .copy() gives X its own data, so the later assignment to X.LotFrontage
# neither raises SettingWithCopyWarning nor depends on df.
X = df.iloc[:, :-1].copy()

In [None]:
# Preview the feature frame.
X.head()

In [None]:
# Regression target: the SalePrice column.
y=df['SalePrice']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer

# Replace missing lot frontage with 0 before the rows are turned into dicts.
X["LotFrontage"] = X["LotFrontage"].fillna(0)

# Two stages: vectorise the row dicts into a dense matrix, then fit an
# XGBoost regressor with its default settings.
vectorizer_step = ("ohe_onestep", DictVectorizer(sparse=False))
regressor_step = ("xgb_model", xgb.XGBRegressor())
steps = [vectorizer_step, regressor_step]

xgb_pipeline = Pipeline(steps)

# The pipeline consumes a list of row dicts, not the DataFrame itself.
xgb_pipeline.fit(X.to_dict(orient='records'), y)

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# LotFrontage was already filled with 0 in an earlier cell, so no imputation
# is repeated here.

steps = [
    ("ohe_onestep", DictVectorizer(sparse=False)),
    # "reg:squarederror" is the current name of the squared-error objective;
    # the old alias "reg:linear" is deprecated in XGBoost.
    ("xgb_model", xgb.XGBRegressor(max_depth=2, objective="reg:squarederror")),
]

xgb_pipeline = Pipeline(steps)

# The scorer returns negated MSE per fold (higher is better), hence the
# abs() before taking the square root below.
cross_val_scores = cross_val_score(
    xgb_pipeline,
    X.to_dict('records'),
    y,
    scoring='neg_mean_squared_error',
    cv=10,
)

# Mean of the per-fold RMSE values.
print("10-fold RMSE: ", np.mean(np.sqrt(np.abs(cross_val_scores))))

# Kidney Disease Case Study

In [None]:
# The file has no header row: header=None makes pandas label the columns
# 0..n-1 (header=-1 is rejected by current pandas). "?" marks a missing value.
chronic_disease = pd.read_csv('chronic_kidney_disease.csv', na_values=["?"], header=None)

In [None]:
# Preview the raw kidney-disease rows.
chronic_disease.head()

In [None]:
# Column labels for the 24 feature columns, in file order.
# NOTE(review): confirm this order matches the column order of the CSV.
names = [
    'age', 'bp', 'sg', 'al', 'su', 'bgr',
    'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv',
    'wc', 'rc', 'rbc', 'pc', 'pcc', 'ba',
    'htn', 'dm', 'cad', 'appet', 'pe', 'ane',
]

In [None]:
# Target: the column labelled 24 (the file is read without a header row,
# so columns carry integer labels).
y = chronic_disease[24]

# Features: every column except the last, relabelled with the readable
# names defined above.
X = chronic_disease.iloc[:, :-1]
X.columns = names

In [None]:
# Preview the renamed feature columns.
X.head()

In [None]:
# Column dtypes and non-null counts; object-dtype columns are treated as categorical below.
X.info()

In [None]:
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer

# How many values are missing in each column.
nulls_per_column = X.isnull().sum()
print(nulls_per_column)

# Split the columns by dtype: object columns are categorical, the rest numeric.
categorical_feature_mask = X.dtypes == object
categorical_columns = X.columns[categorical_feature_mask].tolist()
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# sklearn.preprocessing.Imputer and sklearn_pandas.CategoricalImputer have
# both been removed from their libraries; SimpleImputer covers both cases.
# Each column is passed as a one-element list so the imputer receives the
# 2-D input it requires.

# Numeric columns: fill gaps with the column median.
numeric_imputation_mapper = DataFrameMapper(
    [([numeric_feature], SimpleImputer(strategy="median"))
     for numeric_feature in non_categorical_columns],
    input_df=True,
    df_out=True,
)

# Categorical columns: fill gaps with the most frequent value, which is what
# CategoricalImputer did by default.
categorical_imputation_mapper = DataFrameMapper(
    [([category_feature], SimpleImputer(strategy="most_frequent"))
     for category_feature in categorical_columns],
    input_df=True,
    df_out=True,
)

In [None]:
from sklearn.pipeline import FeatureUnion

# Run both imputation mappers on the same input and concatenate their
# outputs side by side: numeric columns first, categorical columns second.
numeric_categorical_union = FeatureUnion(
    transformer_list=[
        ("num_mapper", numeric_imputation_mapper),
        ("cat_mapper", categorical_imputation_mapper),
    ]
)

# Kidney Disease Pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class Dictifier(BaseEstimator, TransformerMixin):
    """Convert tabular data into a list of per-row dicts for DictVectorizer.

    The notebook used this transformer without defining or importing it,
    which raises NameError on a fresh kernel.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return self so the step fits inside a Pipeline."""
        return self

    def transform(self, X):
        """Return one dict per row, keyed by column label.

        Anything that is not already a DataFrame (e.g. the array produced by
        FeatureUnion) is wrapped in one first, so keys are then the integer
        column positions.
        """
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        return frame.to_dict("records")


# Impute -> row dicts -> vectorise (one-hot for string values) -> classify.
# NOTE(review): recent xgboost releases expect class labels encoded as
# 0..n_classes-1 -- confirm `y` is numeric or encode it before fitting.
pipeline = Pipeline([("featureunion", numeric_categorical_union),
                     ("dictifier", Dictifier()),
                     ("vectorizer", DictVectorizer(sort=False)),
                     ("clf", xgb.XGBClassifier(max_depth=3))])

cross_val_scores = cross_val_score(pipeline, X, y, scoring="roc_auc", cv=3)
print("3-fold AUC: ", np.mean(cross_val_scores))

# Kidney Disease Tuning Hyperparameters

In [None]:
# Candidate values for the classifier step; the "clf__" prefix routes each
# parameter to the pipeline step named "clf".
gbm_param_grid = {
    'clf__learning_rate': np.arange(0.05, 1, 0.05),
    'clf__max_depth': np.arange(3, 10, 1),
    'clf__n_estimators': np.arange(50, 200, 50),
}

# Only n_iter=2 candidates are sampled from the grid, so without a fixed
# random_state the candidates tried (and the reported best score) change on
# every run. Fixing the seed makes Restart & Run All reproducible.
randomized_roc_auc = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=gbm_param_grid,
    scoring="roc_auc",
    n_iter=2,
    cv=2,
    verbose=1,
    random_state=42,
)

randomized_roc_auc.fit(X, y)

# Best mean cross-validated AUC and the pipeline refitted with the winning
# parameters.
print(randomized_roc_auc.best_score_)
print(randomized_roc_auc.best_estimator_)