# PREVIOUS WORK

In [222]:
# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
plt.style.use('seaborn-whitegrid')

# Preprocessing
from scipy.linalg import svd
import sklearn
from sklearn.feature_selection import SelectKBest

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

#*** GET DATA AND DROP DATA ***

# Set the data equal to some variables so we can use them later
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

# Reset the index, but drop the column for the index, and modify the dataframe inplace
train.reset_index(drop=True, inplace=True)
# Get the SalePrice as the natural logarithm
train["SalePrice"] = np.log1p(train["SalePrice"])
# Set y = that new logarithmic SalePrice, then reset_index and drop column
y = train['SalePrice'].reset_index(drop=True)

# Remove feautres with missing data, Id and SalePrice (y)
df_train = train.drop(['SalePrice', 'Id', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
df_test = test.drop(['Id', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
#features = pd.concat([df_train, df_test]).reset_index(drop=True)

In [223]:
#*** FILL VALUES, returns df with no missing data or NaN values ***
def fill_missing_or_nan_values(df):
    fill_with = 0
    
    # Get the most common element by using size(),
    # which returns the element and how common it is
    for column in df:
        
        # Check if the column is an object, float64 or int64
        is_it_float = (df[column].dtype == np.float64)
        is_it_int = (df[column].dtype == np.int64)
        
        # If it is an object,
        # find the most common element and fill missing and NaN values
        if(not is_it_float and not is_it_int):
            fill_with = df[column].mode().item()
                    
        # If it is either a float64 or int64,
        # then calculate the mean and fill missing and NaN values
        else:
            if is_it_float:
                fill_with = np.nanmean(df[column], dtype=np.float64)
            if is_it_int:
                fill_with = np.nanmean(df[column], dtype=np.int64)
        
        # Fill the values in our dataset
        df[column] = df[column].fillna(fill_with)
        fill_with = 0

fill_missing_or_nan_values(df_train)
fill_missing_or_nan_values(df_test)
df_train = pd.get_dummies(df_train, prefix_sep='_')
df_test = pd.get_dummies(df_test, prefix_sep='_')

# RANDOM FOREST WORK

In [224]:
# X is dataframe, y is output, m is how many features you want selected
# returns array of highest scoring features
def feature_selection(X, y, m):
    # Data is standardized here, minus mean and divided by standard deviation
    # The correlation between each regressor and the target is computed
    # It is converted to an F score then to a p-value, which is returned
    f_regression = lambda X,y : sklearn.feature_selection.f_regression(X,y,center=False)

    # removes all but the  highest scoring features
    featureSelector = SelectKBest(score_func=f_regression,k=m)
    featureSelector.fit(X,y)
    high_score_arr = [X.columns[1+zero_based_index] for zero_based_index in list(featureSelector.get_support(indices=True))]
    
    return high_score_arr

selected_features = feature_selection(df_train,y,50)

In [225]:
def rfr_model(X, y):
    # Perform Grid-Search
    gsc = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'max_depth': [3, None],
            'n_estimators': (10, 20, 30, 40, 50, 100, 300, 500, 1000),
        },
        cv=5, scoring='neg_mean_squared_log_error', verbose=0, n_jobs=-1)
    
    model = gsc.fit(X, y)
    params = model.best_params_
    
    rfr = RandomForestRegressor(max_depth=params["max_depth"], n_estimators=params["n_estimators"],
                                random_state=False, verbose=False)
    
    # Perform K-Fold CV
    RMSLE_scores = cross_val_score(rfr, X, y, cv=10, scoring="neg_mean_squared_log_error")
    
    return RMSLE_scores

scores = rfr_model(df_train[selected_features],y)

In [228]:
# Root Mean Squared Logarithmic Error (RMSLE)
print("RMSLE: %.3f (%.3f)" % (scores.mean()*100.0*-1, scores.std()*100.0))

RMSLE: 0.013 (0.003)


In [None]:
'''
# s can be array of functions for different models to run
def kfold_cv_models(k,s):
    best_models_test_error = 0

    for fold in range(k):
        # we have k partitions
        # let df_train and df_test be the next partition
        for model in range(s):
            # run next model with current k with df_train
            # get test error of df_test
            if this_models_test_error < best_models_test_error:
                best_models_test_error = this_models_test_error
    return best_models_test_error
'''

In [38]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels);

In [39]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 17622.44 degrees.


In [40]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 89.19 %.


In [80]:
test_size = 0.33
seed = 7

X_train, X_test, Y_train, Y_test = train_test_split(train[selected_features], y, test_size=test_size, random_state=seed)
model = LinearRegression() 
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result*100.0))

(978,)
Accuracy: 83.038%


In [150]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(model, df_train, y, cv=kfold, scoring="neg_mean_squared_log_error")
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0*-1, results.std()*100.0))

Accuracy: 0.014% (0.007%)


In [16]:
regr = RandomForestRegressor(max_depth=3, random_state=0, n_estimators=100)
regr.fit(train[selected_features], y)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

print(regr.feature_importances_)

print(regr.predict(test[selected_features]))

[0.00000000e+00 1.30726717e-03 1.09002673e-02 8.22600933e-01
 0.00000000e+00 2.24175335e-04 3.38865375e-04 3.14993144e-03
 0.00000000e+00 1.27771514e-02 2.40628056e-02 4.82529222e-02
 0.00000000e+00 1.97217104e-04 0.00000000e+00 7.64012279e-03
 1.06137293e-02 2.36425450e-04 4.35963273e-02 1.13380047e-02
 1.55509357e-03 2.18581446e-04 0.00000000e+00 2.15142708e-04
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 5.07273835e-05 7.89680401e-05 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 4.47795587e-04 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 1.97546301e-04]


KeyError: "['Utilities_NoSeWa' 'RoofMatl_Membran' 'Heating_Floor'] not in index"