# PREVIOUS WORK

In [32]:
# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
plt.style.use('seaborn-whitegrid')

# Preprocessing
from scipy.linalg import svd
import sklearn
from sklearn.feature_selection import SelectKBest

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

#*** GET DATA AND DROP DATA ***

# Set the data equal to some variables so we can use them later
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

# We remove the features Alley, FireplaceQu, PoolQC, Fence and MiscFeature from the dataset
# Because of too many missing values
# Drop SalePrice but store it as output variable (Y)
train = train.drop(["Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature"], axis=1)

#*** FILL VALUES ***
# df is used as stand-in name for train
# df is used as stand-in name for train
def fill_missing_or_nan_values(df):
    fill_with = 0
    
    # Get the most common element by using size(),
    # which returns the element and how common it is
    for column in df:
        
        # Check if the column is an object, float64 or int64
        is_it_float = (df[column].dtype == np.float64)
        is_it_int = (df[column].dtype == np.int64)
        
        # If it is an object,
        # find the most common element and fill missing and NaN values
        if(not is_it_float and not is_it_int):
            fill_with = df[column].mode().item()
                    
        # If it is either a float64 or int64,
        # then calculate the mean and fill missing and NaN values
        else:
            if is_it_float:
                fill_with = np.nanmean(df[column], dtype=np.float64)
            if is_it_int:
                fill_with = np.nanmean(df[column], dtype=np.int64)
        
        # Fill the values in our dataset
        df[column] = df[column].fillna(fill_with)
        fill_with = 0

fill_missing_or_nan_values(train)
fill_missing_or_nan_values(test)
#*** ENCODING ***
train = pd.get_dummies(train, prefix_sep='_')
test = pd.get_dummies(test, prefix_sep='_')


# RANDOM FOREST WORK

In [33]:
#DROP SALEPRICE
y = train["SalePrice"]
train = train.drop(["SalePrice"], axis=1)

In [34]:
# X is dataframe, y is output, m is how many features you want selected
# returns array of highest scoring features
def feature_selection(X, y, m):
    # Data is standardized here, minus mean and divided by standard deviation
    # The correlation between each regressor and the target is computed
    # It is converted to an F score then to a p-value, which is returned
    f_regression = lambda X,y : sklearn.feature_selection.f_regression(X,y,center=False)

    # removes all but the  highest scoring features
    featureSelector = SelectKBest(score_func=f_regression,k=m)
    featureSelector.fit(X,y)
    high_score_arr = [X.columns[1+zero_based_index] for zero_based_index in list(featureSelector.get_support(indices=True))]
    
    return high_score_arr

selected_features = feature_selection(train,y,10)

In [35]:
test_size = 0.33
seed = 7

X_train, X_test, Y_train, Y_test = train_test_split(train[selected_features], y, test_size=test_size, random_state=seed)
model = LinearRegression() 
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result*100.0))

Accuracy: 74.113%


In [27]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(model, train[selected_features], y, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

Accuracy: 71.689% (8.617%)


In [44]:
regr = RandomForestRegressor(max_depth=3, random_state=0, n_estimators=100)
regr.fit(train[selected_features], y)
RandomForestRegressor(bootstrap=True, criterion='accuracy', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

print(regr.feature_importances_)

print(regr.predict(test[selected_features]))

[5.33488286e-04 6.61932889e-02 1.30820584e-01 5.25541825e-02
 1.06615899e-07 4.06440575e-03 1.85185095e-02 7.26518039e-01
 2.09855991e-04 5.87539531e-04]
[120860.6003609  130310.19382365 185475.66433872 ... 184958.72481809
 122395.79459593 289686.33354342]
