# PREVIOUS WORK

In [None]:
import datetime
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from scipy.stats import skew
 
def mean_squared_error_(ground_truth, predictions):
    return mean_squared_error(ground_truth, predictions) ** 0.5
RMSE = make_scorer(mean_squared_error_, greater_is_better=False)    
    
def create_submission(prediction,score):
    now = datetime.datetime.now()
    sub_file = 'submission_'+str(score)+'_'+str(now.strftime("%Y-%m-%d-%H-%M"))+'.csv'
    print ('Creating submission: ', sub_file)
    pd.DataFrame({'Id': test['Id'].values, 'SalePrice': prediction}).to_csv(sub_file, index=False)

def data_preprocess(train,test):
    outlier_idx = [4,11,13,20,46,66,70,167,178,185,199, 224,261, 309,313,318, 349,412,423,440,454,477,478, 523,540, 581,588,595,654,688, 691, 774, 798, 875, 898,926,970,987,1027,1109, 1169,1182,1239, 1256,1298,1324,1353,1359,1405,1442,1447]
    train.drop(train.index[outlier_idx],inplace=True)
    all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                          test.loc[:,'MSSubClass':'SaleCondition']))
    
    to_delete = ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
    all_data = all_data.drop(to_delete,axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    #log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice

    return X_train,X_test,y
    
def model_random_forecast(Xtrain,Xtest,ytrain):
    
    X_train = Xtrain
    y_train = ytrain
    rfr = RandomForestRegressor(n_jobs=1, random_state=0)
    param_grid = {'n_estimators': [1000]}
    # 'n_estimators': [1000], 'max_features': [10,15,20,25], 'max_depth':[20,20,25,25,]}
    model = GridSearchCV(estimator=rfr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)
    print('Random forecast regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_


def model_extra_trees_regression(Xtrain,Xtest,ytrain):
    
    X_train = Xtrain
    y_train = ytrain
    
    etr = ExtraTreesRegressor(n_jobs=1, random_state=0)
    param_grid = {}#'n_estimators': [500], 'max_features': [10,15,20]}
    model = GridSearchCV(estimator=etr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)
    print('Extra trees regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_


# read data, build model and do prediction
# read train data
train = pd.read_csv("../../input/train.csv")
# read test data 
test = pd.read_csv("../../input/test.csv") 
Xtrain,Xtest,ytrain = data_preprocess(train,test)


test_predict,score = model_random_forecast(Xtrain,Xtest,ytrain)
# test_predict,score = model_extra_trees_regression(Xtrain,Xtest,ytrain)

create_submission(np.exp(test_predict),score)

In [15]:
# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
plt.style.use('seaborn-whitegrid')

# Preprocessing
from scipy.linalg import svd
import sklearn
from sklearn.feature_selection import SelectKBest

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

# Load data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

In [20]:
def fill_missing_or_nan_values(df):
    fill_with = 0
    
    # Get the most common element by using size(),
    # which returns the element and how common it is
    for column in df:
        
        # Check if the column is an object, float64 or int64
        is_it_float = (df[column].dtype == np.float64)
        is_it_int = (df[column].dtype == np.int64)
        
        # If it is an object,
        # find the most common element and fill missing and NaN values
        if(not is_it_float and not is_it_int):
            fill_with = df[column].mode().item()
                    
        # If it is either a float64 or int64,
        # then calculate the mean and fill missing and NaN values
        else:
            if is_it_float:
                fill_with = np.nanmean(df[column], dtype=np.float64)
            if is_it_int:
                fill_with = np.nanmean(df[column], dtype=np.int64)
        
        # Fill the values in our dataset
        df[column] = df[column].fillna(fill_with)
        fill_with = 0

In [21]:
# Concatenate all of data
cc_data = pd.concat([train, test])
cc_data = cc_data.drop(['Id', 'SalePrice','Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)

# Get the SalePrice as the natural logarithm
train["SalePrice"] = np.log1p(train["SalePrice"])
y = train['SalePrice']

# One-Hot encode all data
cc_data = pd.get_dummies(cc_data, prefix_sep='_')

# Fill missing or NaN values
fill_missing_or_nan_values(cc_data)

# Slice data, train.shape[0] is the observations
# 1) from start to middle of observations
# 2) from middle of observations to end
X_train = cc_data[:train.shape[0]]
X_test = cc_data[train.shape[0]:]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


# RANDOM FOREST WORK

In [7]:
# X is dataframe, y is output, m is how many features you want selected
# returns array of highest scoring features
def feature_selection(X, y, m):
    # Data is standardized here, minus mean and divided by standard deviation
    # The correlation between each regressor and the target is computed
    # It is converted to an F score then to a p-value, which is returned
    f_regression = lambda X,y : sklearn.feature_selection.f_regression(X,y,center=False)

    # removes all but the  highest scoring features
    featureSelector = SelectKBest(score_func=f_regression,k=m)
    featureSelector.fit(X,y)
    high_score_arr = [X.columns[1+zero_based_index] for zero_based_index in list(featureSelector.get_support(indices=True))]
    
    return high_score_arr

selected_features = feature_selection(df_train,y,50)

In [11]:
# Returns the best scores from each k-fold
def random_forest_model(X, y):
    # Perform Grid-Search to find optimal parameters
    # Use RandomForestRegressor as the estimator,
    # Find optimal max_depth and n_estimators
    gsc = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'max_depth': [3, None],
            'n_estimators': (10, 20, 30, 40, 50, 100, 300, 500, 1000),
        },
        cv=5, scoring='neg_mean_squared_log_error', verbose=0, n_jobs=-1)
    
    # Actually run the model and get the best params
    model = gsc.fit(X, y)
    params = model.best_params_
    
    # Take the best params from Grid-Search and use here
    rfr = RandomForestRegressor(max_depth=params["max_depth"], n_estimators=params["n_estimators"],
                                random_state=False, verbose=False)
    
    # Perform K-Fold CV, to find RMSLE
    RMSLE_scores = cross_val_score(rfr, X, y, cv=10, n_jobs=-1, scoring="neg_mean_squared_log_error")
    
    return RMSLE_scores

scores = random_forest_model(df_train[selected_features],y)

In [12]:
# Root Mean Squared Logarithmic Error (RMSLE)
print("RMSLE: %.3f (%.3f)" % (scores.mean()*100.0*-1, scores.std()*100.0))

RMSLE: 0.014 (0.003)


In [33]:
def random_forest_model1(_Xtrain,_Xtest,y):
    gs = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'max_depth': [3, None],
            'n_estimators': (10, 30, 50)
        }, cv=10, n_jobs=-1, scoring='neg_mean_squared_log_error'
    )
    model = gs.fit(_Xtrain,y)
    pred = model.predict(_Xtest)
    
    # return all predictions and mean of all cross validated scores
    return pred, -model.best_score_

y_pred,score = random_forest_model1(X_train, X_test, y)

In [34]:
print(np.exp(y_pred),score)

[126008.52041343 153244.62924091 183281.40511193 ... 157776.07019133
 111311.35136438 233282.04387426] 0.00012299529870924122


In [None]:
'''
# s can be array of functions for different models to run
def kfold_cv_models(k,s):
    best_models_test_error = 0

    for fold in range(k):
        # we have k partitions
        # let df_train and df_test be the next partition
        for model in range(s):
            # run next model with current k with df_train
            # get test error of df_test
            if this_models_test_error < best_models_test_error:
                best_models_test_error = this_models_test_error
    return best_models_test_error
'''

In [38]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels);

In [39]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 17622.44 degrees.


In [40]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 89.19 %.


In [14]:
test_size = 0.33
seed = 7

X_train, X_test, Y_train, Y_test = train_test_split(df_train[selected_features], y, test_size=test_size, random_state=seed)
print(X_train.shape)
print(X_test.shape)
model = LinearRegression() 
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result*100.0))

(978, 50)
(482, 50)
Accuracy: 88.473%


In [150]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(model, df_train, y, cv=kfold, scoring="neg_mean_squared_log_error")
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0*-1, results.std()*100.0))

Accuracy: 0.014% (0.007%)


In [16]:
regr = RandomForestRegressor(max_depth=3, random_state=0, n_estimators=100)
regr.fit(train[selected_features], y)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

print(regr.feature_importances_)

print(regr.predict(test[selected_features]))

[0.00000000e+00 1.30726717e-03 1.09002673e-02 8.22600933e-01
 0.00000000e+00 2.24175335e-04 3.38865375e-04 3.14993144e-03
 0.00000000e+00 1.27771514e-02 2.40628056e-02 4.82529222e-02
 0.00000000e+00 1.97217104e-04 0.00000000e+00 7.64012279e-03
 1.06137293e-02 2.36425450e-04 4.35963273e-02 1.13380047e-02
 1.55509357e-03 2.18581446e-04 0.00000000e+00 2.15142708e-04
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 5.07273835e-05 7.89680401e-05 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 4.47795587e-04 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 1.97546301e-04]


KeyError: "['Utilities_NoSeWa' 'RoofMatl_Membran' 'Heating_Floor'] not in index"