In [132]:
import sys

import sklearn
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

pd.set_option('display.max_columns', None)

In [133]:
# read the data

train = pd.read_csv('data/train.csv', index_col=['Id'])
test = pd.read_csv('data/test.csv', index_col=['Id'])

In [134]:
from sklearn.model_selection import train_test_split

def split_data_by(data, target):

    # split the dataset to with and without labels
    train_no_sale_price = data.drop(labels=[target], axis=1)
    train_sale_price = data[target]

    # split the train set to 70/30 train/test
    return train_test_split(train_no_sale_price, 
                            train_sale_price, 
                            test_size=0.3, 
                            random_state=4330)

x_train, x_test, y_train, y_test = split_data_by(train, 'SalePrice')

# Make sure correct data fell int the correct variables
for df in [x_train, x_test, y_train, y_test]:
    print(df.shape)

(1022, 79)
(438, 79)
(1022,)
(438,)


In [135]:
numeric_column_names = x_train.describe().columns
x_train[numeric_column_names] = x_train[numeric_column_names].fillna(0)
x_test[numeric_column_names] = x_test[numeric_column_names].fillna(0)

In [77]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

categorical_columns = train.dtypes[train.dtypes == 'object'].index
train = train.fillna("")
encoders = {col: LabelEncoder().fit(train[col]) for col in categorical_columns}

In [78]:
def encode_categorical(data, columns, encoders):
    data = data.fillna("")
    return pd.DataFrame({col: encoders[col].transform(data[col]) for col in columns},
                        index = data.index)
train_encoded = encode_categorical(train, categorical_columns, encoders)

one_hot_encoder = OneHotEncoder().fit(train_encoded)
one_hot_x_train = one_hot_encoder.transform(encode_categorical(x_train[categorical_columns], categorical_columns, encoders))
one_hot_x_test = one_hot_encoder.transform(encode_categorical(x_test[categorical_columns], categorical_columns, encoders))

In [79]:
new_x_train = pd.np.concatenate([one_hot_x_train.todense(), x_train[numeric_column_names]], axis=1)
new_x_test = pd.np.concatenate([one_hot_x_test.todense(), x_test[numeric_column_names]], axis=1)

In [80]:
# regularization
from sklearn.linear_model import Ridge

alpha = [0.01, 0.1, 1, 10, 100]

for a in alpha:
    all_data_lr = Ridge(alpha = a).fit(new_x_train, y_train)
    print('alpha:', a)
    print(all_data_lr.score(new_x_train, y_train), all_data_lr.score(new_x_test, y_test))
    print()

alpha: 0.01
0.9374607177132871 0.848821474014292

alpha: 0.1
0.9364454642419521 0.8596282428267933

alpha: 1
0.9227360656014941 0.8824267159987822

alpha: 10
0.8933107135132801 0.8922404784820502

alpha: 100
0.8524424288746798 0.882169210027329



In [156]:
# let's fix null columns better
# read the data

train_vnrarea = pd.read_csv('data/train.csv', index_col=['Id'])
numeric_column_names_vnrarea = train_vnrarea.describe().columns

nulls = train_vnrarea[numeric_column_names_vnrarea].isnull().sum()
nulls[nulls > 0]

LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64

In [157]:
# LotFrontage
train_vnrarea = train_vnrarea[pd.notnull(train_vnrarea['MasVnrArea'])]
nulls = train_vnrarea[numeric_column_names_vnrarea].isnull().sum()
nulls[nulls > 0]

LotFrontage    257
GarageYrBlt     81
dtype: int64

In [158]:
x_train_vnrarea, x_test_vnrarea, y_train_vnrarea, y_test_vnrarea = split_data_by(train_vnrarea, 'MasVnrArea')
for df in [x_train_vnrarea, x_test_vnrarea, y_train_vnrarea, y_test_vnrarea]:
    print(df.shape)

(1016, 79)
(436, 79)
(1016,)
(436,)


In [159]:
numeric_column_names_vnrarea = x_train_vnrarea.describe().columns

x_train_vnrarea[numeric_column_names_vnrarea] = x_train_vnrarea[numeric_column_names_vnrarea].fillna(0)
x_test_vnrarea[numeric_column_names_vnrarea] = x_test_vnrarea[numeric_column_names_vnrarea].fillna(0)

categorical_columns_vnrarea = train_vnrarea.dtypes[train.dtypes == 'object'].index
train_vnrarea = train_vnrarea.fillna("")
encoders_vnrarea = {col: LabelEncoder().fit(train_vnrarea[col]) for col in categorical_columns_vnrarea}

train_encoded_vnrarea = encode_categorical(train_vnrarea, categorical_columns_vnrarea, encoders_vnrarea)

one_hot_encoder_vnrarea = OneHotEncoder().fit(train_encoded_vnrarea)
one_hot_x_train_vnrarea = one_hot_encoder_vnrarea.transform(encode_categorical(x_train_vnrarea[categorical_columns_vnrarea], categorical_columns_vnrarea, encoders_vnrarea))
one_hot_x_test_vnrarea = one_hot_encoder_vnrarea.transform(encode_categorical(x_test_vnrarea[categorical_columns_vnrarea], categorical_columns_vnrarea, encoders_vnrarea))

new_x_train_vnrarea = pd.np.concatenate([one_hot_x_train_vnrarea.todense(), x_train_vnrarea[numeric_column_names_vnrarea]], axis=1)
new_x_test_vnrarea = pd.np.concatenate([one_hot_x_test_vnrarea.todense(), x_test_vnrarea[numeric_column_names_vnrarea]], axis=1)

print(new_x_train_vnrarea.shape, new_x_test_vnrarea.shape)

(1016, 303) (436, 303)


In [160]:
# regularization
from sklearn.linear_model import Ridge

alpha = [0.01, 0.1, 1, 10, 100]

for a in alpha:
    all_data_lr = Ridge(alpha = a).fit(new_x_train_vnrarea, pd.np.log10(y_train_vnrarea))
    print('alpha:', a)
    print(all_data_lr.score(new_x_train_vnrarea, y_train_vnrarea), all_data_lr.score(new_x_test_vnrarea, pd.np.log10(y_test_vnrarea)))
    print()
    
r = Ridge(alpha = 10).fit(new_x_train_vnrarea, y_train_vnrarea)
r.predict(new_x_test_vnrarea).shape

  import sys


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [116]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(new_x_train, y_train)

In [117]:
lr.score(new_x_test, y_test)

-1331949095673.0933