In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [3]:
houses = pd.read_csv('AmesHousing.tsv', delimiter = '\t')

In [61]:
def transform_features(df):
    unnecessary_columns = ['Order', 'PID']
    
    # removing columns with many nulls
    size = houses.shape[0]
    null_values = houses.isnull().sum()
    selected_columns = list(null_values[null_values < size*0.05].index)
    df_clean = df[selected_columns].copy()
    
    # filling num columns with 
    num_cols = df_clean.select_dtypes(['float64','int32']).columns
    df_clean[num_cols] = df_clean[num_cols].fillna(df_clean[num_cols].mode().loc[0])
    
    # removing unnecessary columns
    df_clean.drop(unnecessary_columns, axis = 1, inplace = True)
        
    # geting dummies
    object_columns = df_clean.select_dtypes('object').columns
    df_clean = pd.concat([df_clean, pd.get_dummies(df_clean[object_columns])], axis = 1)
    df_clean.drop(object_columns, axis = 1, inplace = True)
        
    df_clean['years_until_remod'] = df_clean['Year Remod/Add'] - df_clean['Year Built']
    
    return df_clean

def select_features(df):
    df_sorted = df.corr()['SalePrice'].abs().sort_values().copy()
    columns_selected = df_sorted[df_sorted > 0.4].index
    return columns_selected

def train_and_test(df, k = 0):
    df_transformed = transform_features(df)
    
    features = select_features(df_transformed)
    features = list(features.drop('SalePrice'))
    
    if k == 0:
        train = df_transformed.iloc[:1460]
        test = df_transformed.iloc[1460:]
        
        lr = LinearRegression()
        lr.fit(train[features], train['SalePrice'])
        predict = lr.predict(test[features])
        rmse = mean_squared_error(predict, test['SalePrice'])**0.5
        return rmse
    else:
        kf = KFold(n_splits = k, shuffle = True)
        rmse = list()
        
        for train_index, test_index in kf.split(df):
            train = df_transformed.iloc[train_index]
            test = df_transformed.iloc[test_index]
            
            lr = LinearRegression()
            lr.fit(train[features], train['SalePrice'])
            predict = lr.predict(test[features])
            rmse.append(mean_squared_error(predict, test['SalePrice'])**0.5)
        return rmse
    



In [62]:
train_and_test(houses, k = 4)

[46153.69522499673, 27617.350255322803, 29637.192336036835, 27315.015542772904]