In [1]:
import pandas as pd
import numpy as np
import missingno as msno
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest

# Read data from dataset
data = pd.read_csv('train.csv')
data = data.drop('Id', axis=1)

# Encode categorical features
object_cols = data.select_dtypes(include='object').columns.tolist()
for feature in object_cols:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature])

# Check missing values
nullCols = pd.DataFrame(data.isnull().sum())
for col in nullCols.index:
    if nullCols.loc[col][0] > 0:
        data[col]  = data[col].fillna(data[col].median())

# Check outliers
iso_forest = IsolationForest(n_estimators=500, contamination=0.01)
for feature in data.columns:
    iso_forest.fit(data[feature].values.reshape(-1, 1))
    mask = iso_forest.predict(data[feature].values.reshape(-1, 1)) == -1
    data.loc[mask, feature] = data[feature].median()


In [2]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Correlation matrix
corr_matrix = data.corr()
corr_values = corr_matrix['SalePrice'].sort_values(ascending=False)

X_train = data.copy()

# Drop features with correlation less than 0.5
for i in corr_values.index:
    if corr_values[i] > -0.25 and corr_values[i] < 0.25:
        X_train.drop(i, axis=1, inplace=True)  

y_train = data['SalePrice']
X_train.drop('SalePrice', axis=1, inplace=True)

# Scale data
xscaler = MinMaxScaler(feature_range=(-1, 1))
yscaler = MinMaxScaler(feature_range=(-1, 1))

X_train = xscaler.fit_transform(X_train)

y_train = y_train.values.reshape(-1, 1)
y_train = yscaler.fit_transform(y_train)

In [30]:
from sklearn.linear_model import Ridge, ElasticNet

# Choose and train model
ridge = ElasticNet()
ridge.fit(X_train, y_train)

In [31]:
from sklearn.metrics import mean_squared_error

# Predict and evaluate
y_pred = ridge.predict(X_train)

print('MSE: ', mean_squared_error(y_train, y_pred))

y_predE = yscaler.inverse_transform(np.array([y_pred]))
y_trainE = yscaler.inverse_transform(y_train) 

print('MSE: ', mean_squared_error(y_trainE, y_predE[0])/1e6)



MSE:  0.11359729437272961
MSE:  4868.706269572158


In [5]:
# Predict test data
data_test = pd.read_csv('test.csv')
data_test = data_test.drop('Id', axis=1)
data_test_benchmark = pd.read_csv('sample_submission.csv')

object_cols = data_test.select_dtypes(include='object').columns.tolist()
for feature in object_cols:
    le = LabelEncoder()
    data_test[feature] = le.fit_transform(data_test[feature])

nullCols = pd.DataFrame(data_test.isnull().sum())
for col in nullCols.index:
    if nullCols.loc[col][0] > 0:
        data_test[col]  = data_test[col].fillna(data_test[col].median())        

iso_forest = IsolationForest(n_estimators=500, contamination=0.01)

for feature in data_test.columns:
    iso_forest.fit(data_test[feature].values.reshape(-1, 1))
    mask = iso_forest.predict(data_test[feature].values.reshape(-1, 1)) == -1
    data_test.loc[mask, feature] = data_test[feature].median()

In [32]:
X_test = data_test.copy()
y_test = data_test_benchmark['SalePrice']

for i in corr_values.index:
    if corr_values[i] > -0.25 and corr_values[i] < 0.25:
        X_test.drop(i, axis=1, inplace=True)

X_test = xscaler.fit_transform(X_test)

In [33]:
ppred = ridge.predict(X_test)
ppred = yscaler.inverse_transform(np.array([ppred]))

print('MSE: ', mean_squared_error(y_test, ppred[0])/1e6)

MSE:  275.48625541774044
