In [106]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.stats as stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [107]:
# Read the two input tables from the local kaggle_input directory.
# train_data carries the SalePrice target; test_x is the frame to predict on.
test_x = pd.read_csv('./kaggle_input/test.csv')
train_data = pd.read_csv('./kaggle_input/train.csv')

In [108]:
# Any column with at least one missing value in the test set is removed from
# BOTH frames, so the train and test frames keep matching feature columns.
null_col = test_x.columns[test_x.isna().any(axis=0)]
test_x = test_x.drop(columns=null_col)
train_data = train_data.drop(columns=null_col)

In [109]:
# Split the training table into the target and the predictors.
# The target is selected by name rather than by position, so the split stays
# correct even if SalePrice is not the last column of train_data.
train_y = train_data["SalePrice"]
train_x = train_data.drop(columns="SalePrice")

In [110]:
# Partition the predictors by dtype: object columns on one side,
# int64/float64 columns on the other.
train_x_categorical = train_x.select_dtypes(include='object')
train_x_numeric = train_x.select_dtypes(include=['int64', 'float64'])

In [111]:
# Discard numeric columns that still contain a missing value in the training
# set; the correlation ranking computed from this frame needs complete columns.
train_x_numeric = train_x_numeric.dropna(axis='columns', how='any')

In [112]:
# Score every numeric column by its squared Pearson correlation with the
# target, then list the [column, r^2] pairs from strongest to weakest.
r_vals = [
    [name, stats.pearsonr(train_x_numeric[name], train_y)[0] ** 2]
    for name in train_x_numeric.columns
]
r_vals.sort(key=lambda pair: -pair[1])
for pair in r_vals:
    print(pair)

['OverallQual', 0.625651892462126]
['GrLivArea', 0.502148650271803]
['1stFlrSF', 0.3670568696959659]
['FullBath', 0.3143438548592399]
['TotRmsAbvGrd', 0.2848604068044375]
['YearBuilt', 0.2734216207324904]
['YearRemodAdd', 0.25715139084530136]
['Fireplaces', 0.2180225385901365]
['WoodDeckSF', 0.10524408301655817]
['2ndFlrSF', 0.10197407763119197]
['OpenPorchSF', 0.09976515620798856]
['HalfBath', 0.08071717133187055]
['LotArea', 0.06961331538211156]
['BedroomAbvGr', 0.0282956652798042]
['KitchenAbvGr', 0.018470813449223335]
['EnclosedPorch', 0.016532291264409225]
['ScreenPorch', 0.012420338219512135]
['PoolArea', 0.008538415958697119]
['MSSubClass', 0.007103815434038222]
['OverallCond', 0.006061540238119059]
['MoSold', 0.00215595339652489]
['3SsnPorch', 0.001987703214770043]
['YrSold', 0.0008365159328424604]
['LowQualFinSF', 0.0006556738936116987]
['Id', 0.00048034259116207077]
['MiscVal', 0.0004489982853327445]


In [146]:
# Number of top-ranked columns (by squared correlation with SalePrice) that
# are fed to the model. Tune here rather than editing the expression below.
N_FEATURES = 7

# r_vals is already sorted strongest-first; slicing keeps this cell working
# even when fewer than N_FEATURES numeric columns are available.
features = [entry[0] for entry in r_vals[:N_FEATURES]]

In [147]:
# Fit a random forest on the selected features and report (MSE, R^2) on the
# training data itself. There is no hold-out split here, so these numbers
# describe fit quality, not generalisation.
# A LinearRegression() baseline can be swapped in for comparison.
train_features = np.array(train_x[features])
train_target = np.array(train_y)

# random_state pins the forest's bootstrap/feature sampling so that a
# Restart & Run All reproduces the same model and the same predictions.
model = RandomForestRegressor(n_estimators=1000, random_state=42)
model.fit(train_features, train_target)

# Scored with sklearn.metrics instead of the notebook-level `eval` helper:
# that helper is defined in a later cell, so on a fresh kernel the name `eval`
# is still the Python builtin and calling it here raises a TypeError.
# r2_score equals 1 - MSE / population variance of the target.
train_pred = model.predict(train_features)
(mean_squared_error(train_target, train_pred), r2_score(train_target, train_pred))

(139243952.94706967, 0.9779215759236747)

In [148]:
# Predict on the test rows, using the same feature columns (as a plain array)
# that the model was fitted on.
test_features = test_x[features].to_numpy()
test_y = model.predict(test_features)

In [149]:
# Two-column result frame: the Id of each test row next to its predicted
# SalePrice. assign() returns a new frame, so test_x itself is not modified.
out_df = test_x[['Id']].assign(SalePrice=np.array(test_y))

In [150]:
# Write the predictions to output.csv in the working directory; the
# DataFrame index is left out so the file holds only Id and SalePrice.
out_df.to_csv(path_or_buf='output.csv', index=False)

In [151]:
def eval(model, train_x, train_y):
    """Score a fitted model against known targets.

    NOTE: this name shadows Python's builtin ``eval`` once the definition has
    run. The name is kept so existing calls keep working.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)`` that returns one prediction per row.
    train_x : array-like
        Inputs handed unchanged to ``model.predict``.
    train_y : array-like
        True target values, one per prediction.

    Returns
    -------
    tuple
        ``(mse, r2)`` where ``mse`` is the mean squared error and ``r2`` is
        ``1 - mse / var(train_y)`` using the population variance.

    Raises
    ------
    ValueError
        If ``train_y`` is empty (the mean error would be undefined).
    """
    target = np.asarray(train_y, dtype=float)
    if target.size == 0:
        raise ValueError("train_y is empty; cannot compute an error metric")

    residual = np.asarray(model.predict(train_x), dtype=float) - target
    # np.mean is vectorised; the builtin sum() would iterate over the array
    # one element at a time in the interpreter.
    mse = np.mean(residual ** 2)
    tot_var = np.var(target)
    return mse, 1 - mse / tot_var

In [None]:
# Side-by-side histograms of YearBuilt for the training and test frames.
fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.hist(train_x["YearBuilt"])
ax1.set(title="Train", xlabel="YearBuilt", ylabel="Count")
ax2.hist(test_x["YearBuilt"])
ax2.set(title="Test", xlabel="YearBuilt", ylabel="Count")
# Ends on a call that returns None so the cell shows only the figure.
fig.tight_layout()