In [2]:
# XGBoost Model Notebook

In [5]:
# --------------------- #

In [6]:
# import functions from model_functions notebook

In [11]:
%run -i model_functions.ipynb

In [120]:
def run_model():
    
    train_data, test_data = get_data()
    
    #modification: remove outliers for 'SalePrice' vs 'GrLivArea'
    train_data.drop(train_data[(train_data['GrLivArea']>4000) & (train_data['SalePrice']<300000)].index)
    
    target = train_data['SalePrice'].apply(lambda x: np.log(x))
    train_data = remove_null_columns(train_data)
    numerical_features, categorical_features = get_feature_lists(train_data)
    
    df_num_train, df_cat_train = impute_features(train_data[numerical_features].copy(), train_data[categorical_features].copy())
    df_num_test, df_cat_test = impute_features(test_data[numerical_features].copy(), test_data[categorical_features].copy())
    
    common_categories = prune_categorical_features(df_cat_train, df_cat_test)
    
    df_train = combine_dataframes(df_num_train, df_cat_train[common_categories])
    df_test = combine_dataframes(df_num_test, df_cat_test[common_categories])
    
    df_train = feature_engineering(df_train.copy())
    df_test = feature_engineering(df_test.copy())
    
    # Change the line below to substitute different models
    model = xgboost_model(df_train, target=target)
    
    predictions = np.exp(model.predict(df_test))
    
    create_submission(test_data, predictions)
    
    return model, df_train, df_test, predictions

In [121]:
optimal_model, df_train, df_test, predictions = run_model()

Grabbing train data from 'data/train.csv'...
Grabbing test data from 'data/test.csv'...
Data successfully loaded.
Removing columns that are >50% null...
4 columns were removed.
Obtaining feature column names...
34 numeric features obtained; 41 categorical features obtained
Imputing data...
Imputation complete.
There are now 34 numerical features and 177 categorical features.
Imputing data...
Imputation complete.
There are now 34 numerical features and 172 categorical features.
Starting feature engineering...
Feature engineering: transforming years to time since...
Feature engineering: log transform of relevant numerical features...
Feature engineering complete.
Starting feature engineering...
Feature engineering: transforming years to time since...
Feature engineering: log transform of relevant numerical features...
Feature engineering complete.
Building xgboost model...
The model's R2 score is: 0.9111948388306389
Generating submission file...
Submission file is ready.


In [112]:
feature_importances = get_rf_feature_importances(df=df_train, rf_model=optimal_model)

Gathering feature importances...
Featue importances obtained.


In [113]:
feature_importances[0:30]

LotArea                 0.064962
LotFrontage             0.058824
BsmtUnfSF               0.047059
GrLivArea               0.045524
OverallQual             0.042967
HouseSFScore            0.040921
BsmtFinSF1              0.036317
GarageArea              0.030691
PorchScore              0.028133
BsmtIncompleteRatio     0.027621
2ndFlrSF                0.026598
GarageYrBlt             0.026598
MasVnrArea              0.026598
overallScore            0.024041
WoodDeckSF              0.023529
1stFlrSF                0.021483
YearBuilt               0.019437
OverallCond             0.018926
ageScore                0.018926
TotalBsmtSF             0.016368
YearRemodAdd            0.013299
BathScore               0.012276
Neighborhood_StoneBr    0.012276
OpenPorchSF             0.011765
YrSold                  0.011253
BsmtFinSF2              0.009719
Neighborhood_Crawfor    0.009207
ScreenPorch             0.008696
Functional_NonTyp       0.008696
Neighborhood_Somerst    0.008696
dtype: flo

In [27]:
list(feature_importances[0:30].index)

['LotArea',
 'LotFrontage',
 'BsmtUnfSF',
 'GrLivArea',
 'OverallQual',
 'HouseSFScore',
 'BsmtFinSF1',
 'GarageArea',
 'PorchScore',
 'BsmtIncompleteRatio',
 '2ndFlrSF',
 'GarageYrBlt',
 'MasVnrArea',
 'overallScore',
 'WoodDeckSF',
 '1stFlrSF',
 'YearBuilt',
 'OverallCond',
 'ageScore',
 'TotalBsmtSF',
 'YearRemodAdd',
 'BathScore',
 'Neighborhood_StoneBr',
 'OpenPorchSF',
 'YrSold',
 'BsmtFinSF2',
 'Neighborhood_Crawfor',
 'ScreenPorch',
 'Functional_NonTyp',
 'Neighborhood_Somerst']

In [None]:
# Tinkering with using xgboost output as input to other models

In [50]:
raw_train, raw_test = get_data()

Grabbing train data from 'data/train.csv'...
Grabbing test data from 'data/test.csv'...
Data successfully loaded.


In [48]:
training_predictions = pd.Series(optimal_model.predict(df_train), index=df_train.index)

In [90]:
testing_predictions = pd.Series(optimal_model.predict(df_test), index=df_test.index)

In [96]:
xgb_train = pd.DataFrame(training_predictions, columns=['xgb_influence'], index=df_train.index)

In [93]:
xgb_test = pd.DataFrame(testing_predictions, columns=['xgb_influence'], index=df_test.index)

In [103]:
xgb_train.to_csv('xgb_influence_train.csv', index=False)

In [104]:
xgb_test.to_csv('xgb_influence_test.csv', index=False)