In [46]:
import sys

import sklearn
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

pd.set_option('display.max_columns', None)

In [47]:
# read the data

train = pd.read_csv('data/train.csv', index_col=['Id'])
test = pd.read_csv('data/test.csv', index_col=['Id'])

In [48]:
from sklearn.model_selection import train_test_split

def split_data_by(data, target):

    # split the dataset to with and without labels
    train_no_sale_price = data.drop(labels=[target], axis=1)
    train_sale_price = data[target]

    # split the train set to 70/30 train/test
    return train_test_split(train_no_sale_price, 
                            train_sale_price, 
                            test_size=0.3, 
                            random_state=4330)

x_train, x_test, y_train, y_test = split_data(train, 'SalePrice')

# Make sure correct data fell int the correct variables
for df in [x_train, x_test, y_train, y_test]:
    print(df.shape)

TypeError: split_data() takes 1 positional argument but 2 were given

In [49]:
numeric_column_names = x_train.describe().columns
x_train[numeric_column_names] = x_train[numeric_column_names].fillna(0)
x_test[numeric_column_names] = x_test[numeric_column_names].fillna(0)

In [50]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

categorical_columns = train.dtypes[train.dtypes == 'object'].index
train = train.fillna("")

In [51]:
def encode_categorical(data, columns, encoders):
    data = data.fillna("")
    return pd.DataFrame({col: encoders[col].transform(data[col]) for col in columns},
                        index = data.index)
train_encoded = encode_categorical(train, categorical_columns, encoders)

one_hot_encoder = OneHotEncoder().fit(train_encoded)
one_hot_x_train = one_hot_encoder.transform(encode_categorical(x_train[categorical_columns], categorical_columns, encoders))
one_hot_x_test = one_hot_encoder.transform(encode_categorical(x_test[categorical_columns], categorical_columns, encoders))

In [52]:
new_x_train = pd.np.concatenate([one_hot_x_train.todense(), x_train[numeric_column_names]], axis=1)
new_x_test = pd.np.concatenate([one_hot_x_test.todense(), x_test[numeric_column_names]], axis=1)

In [53]:
# regularization
from sklearn.linear_model import Ridge

alpha = [0.01, 0.1, 1, 10, 100]

for a in alpha:
    all_data_lr = Ridge(alpha = a).fit(new_x_train, y_train)
    print('alpha:', a)
    print(all_data_lr.score(new_x_train, y_train), all_data_lr.score(new_x_test, y_test))
    print()

alpha: 0.01
0.9374607177132871 0.848821474014292

alpha: 0.1
0.9364454642419521 0.8596282428267933

alpha: 1
0.9227360656014941 0.8824267159987822

alpha: 10
0.8933107135132801 0.8922404784820502

alpha: 100
0.8524424288746798 0.882169210027329



In [54]:
# let's fix null columns better
# read the data

train = pd.read_csv('data/train.csv', index_col=['Id'])

nulls = train[numeric_column_names].isnull().sum()
nulls[nulls > 0]

LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64

In [56]:
# LotFrontage
train = train[pd.notnull]
x_train, x_test, y_train, y_test = split_data_by(train, 'LotFrontage')
for df in [x_train, x_test, y_train, y_test]:
    print(df.shape)

(1022, 79)
(438, 79)
(1022,)
(438,)


In [58]:
numeric_column_names = x_train.describe().columns
print(numeric_column_names)

Index(['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')


In [59]:
print(y_test)

Id
479      79.0
1129     59.0
385       NaN
1123      NaN
1217     68.0
1426     80.0
1419     71.0
19       66.0
245       NaN
813      66.0
1141     60.0
483      50.0
12       85.0
1157     85.0
526      62.0
1440     80.0
374      79.0
375      65.0
300      80.0
181       NaN
66       76.0
471       NaN
352       NaN
443      52.0
224      70.0
718      80.0
561       NaN
96        NaN
1377     52.0
564      66.0
        ...  
651      65.0
11       70.0
191      70.0
844      80.0
1147      NaN
501      21.0
1111      NaN
338      70.0
1155      NaN
235       NaN
1398     51.0
976       NaN
1134     80.0
225     103.0
121       NaN
974      95.0
198     174.0
649      70.0
233      21.0
289       NaN
670      80.0
717      60.0
901       NaN
272      73.0
91       60.0
648      85.0
184      63.0
800      60.0
365       NaN
1175     80.0
Name: LotFrontage, Length: 438, dtype: float64
