In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import VarianceThreshold
import pickle
import csv
# Seed NumPy's global random number generator with a fixed value so that any
# NumPy-driven randomness in this notebook repeats from run to run.
np.random.seed(42)
%matplotlib inline

# Load in cleaned train data and cleaned test data

In [54]:
# Cleaned training data; the 'Id' column becomes the row index.
df = pd.read_csv(
    '../data/clean_data.csv',
    index_col='Id',
)

In [55]:
# Cleaned test data; the 'Id' column becomes the row index.
test = pd.read_csv(
    '../data/clean_data_test.csv',
    index_col='Id',
)

In [56]:
# Preview the first rows of the training frame.
df.head()

Unnamed: 0_level_0,PID,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,...,Yr Sold_2010,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,533352170,RL,0.0,13517,Pave,No Alley,IR1,Lvl,AllPub,CulDSac,...,1,0,0,0,0,0,0,0,0,1
544,531379050,RL,43.0,11492,Pave,No Alley,IR1,Lvl,AllPub,CulDSac,...,0,0,0,0,0,0,0,0,0,1
153,535304180,RL,68.0,7922,Pave,No Alley,Reg,Lvl,AllPub,Inside,...,1,0,0,0,0,0,0,0,0,1
318,916386060,RL,73.0,9802,Pave,No Alley,Reg,Lvl,AllPub,Inside,...,1,0,0,0,0,0,0,0,0,1
255,906425045,RL,82.0,14235,Pave,No Alley,IR1,Lvl,AllPub,Inside,...,1,0,0,0,0,0,0,0,0,1


In [57]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0_level_0,PID,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,902301120,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,0,0,0,1
2718,905108090,RL,0.0,9662,Pave,No Alley,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,0,0,0,0,1
2414,528218130,RL,58.0,17104,Pave,No Alley,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,0,1,0,0,0
1989,902207150,RM,60.0,8520,Pave,No Alley,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,0,0,0,1
625,535105100,RL,0.0,9500,Pave,No Alley,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,0,0,0,0,1


# Creating a helper function that returns the set difference of two lists

In [58]:
def Diff(li1, li2):
    """Return the items of ``li1`` that do not appear in ``li2``.

    Each surviving item is returned exactly once, in the order in which it
    first occurs in ``li1``. A bare ``set`` difference yields the same items
    but in an arbitrary, hash-dependent order, which made the order of any
    columns created from the result vary between runs.

    Parameters
    ----------
    li1 : iterable of hashable
        Candidate items.
    li2 : iterable of hashable
        Items to exclude.

    Returns
    -------
    list
        De-duplicated items of ``li1`` that are not in ``li2``, in
        first-occurrence order.
    """
    # `seen` starts as the exclusion set and also absorbs emitted items,
    # so one membership test handles both exclusion and de-duplication.
    seen = set(li2)
    remaining = []
    for item in li1:
        if item not in seen:
            seen.add(item)
            remaining.append(item)
    return remaining

Creating columns in df found in test data but not found in train data

In [59]:
# Give df a zero-filled column for every column that exists only in test.
test_only_columns = Diff(list(test.columns), list(df.columns))
for column_name in test_only_columns:
    df[column_name] = 0

Creating columns in test dataframe found in train data but not in test data

In [60]:
# Give test a zero-filled column for every training column it lacks.
# 'SalePrice' is skipped, so it is never created on the test frame.
for x in Diff(list(df.columns), list(test.columns)):
    if x != 'SalePrice':
        test[x] = 0

# The two fill loops append columns at the end of each frame, so df and test
# end up holding the same feature columns in different orders. Reorder test to
# follow df's feature order so that any later step relying on column position
# sees the two frames line up. Columns found only in test (none are expected
# at this point) are kept at the end rather than dropped.
train_feature_order = [col for col in df.columns if col != 'SalePrice']
known_features = set(train_feature_order)
extra_test_columns = [col for col in test.columns if col not in known_features]
test = test[train_feature_order + extra_test_columns]

In [61]:
# Sanity check: test should have one column fewer than df (no 'SalePrice').
test.shape

(879, 431)

In [62]:
# Sanity check: df should have one column more than test ('SalePrice').
df.shape

(2051, 432)

# Setting my X and y

In [63]:
# Features are every column except the target; the target is 'SalePrice'.
# `columns=` replaces the positional axis argument (`df.drop('SalePrice', 1)`),
# which pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onward.
X = df.drop(columns='SalePrice')
y = df['SalePrice']

# Train test split

In [64]:
# Hold out part of the labelled data for evaluation. No test_size is passed,
# so scikit-learn's default split proportion applies; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# Setting and fitting a variance threshold

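The variance threshold removes columns whose variance falls below a chosen cutoff (0.05 here). Near-constant columns carry little information, so dropping them reduces the number of features. Only non-object columns are passed to it, so text columns are left out of the result as well.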

In [65]:
# Variance threshold: fit on the non-object columns of the training split.
numeric_train = X_train.select_dtypes(exclude='object')
threshold = VarianceThreshold(threshold=0.05)
X_train_var = threshold.fit_transform(numeric_train)
# Show the names of the columns the filter kept.
numeric_train.columns[threshold.get_support()]

Index(['PID', 'Lot Frontage', 'Lot Area', 'Overall Qual', 'Overall Cond',
       'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'Bsmt Unf SF',
       'Total Bsmt SF',
       ...
       'Mo Sold_7', 'Mo Sold_8', 'Mo Sold_10', 'Yr Sold_2006', 'Yr Sold_2007',
       'Yr Sold_2008', 'Yr Sold_2009', 'Yr Sold_2010', 'Sale Type_New',
       'Sale Type_WD '],
      dtype='object', length=158)

In [66]:
# Variance threshold: reuse the filter fitted on the training split (no
# refitting) on the non-object columns of the hold-out split.
numeric_holdout = X_test.select_dtypes(exclude='object')
X_test_var = threshold.transform(numeric_holdout)

# Scaling my data

In [67]:
# Single scaler instance: fitted on the training split below, reused on the
# hold-out split, and pickled later in this notebook.
ss = StandardScaler()

In [68]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to the hold-out split.
X_train_sc = ss.fit_transform(X_train_var)
X_test_sc = ss.transform(X_test_var)

# Exporting my data and scaler for future use

In [69]:
# Write the scaled training matrix as header-less CSV, one row per sample.
# newline='' hands line-ending control to the csv module, as its docs require;
# without it, Windows writes '\r\r\n' and every row is followed by a blank line.
with open('../data/X_train_sc.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(X_train_sc)

In [70]:
# Write the scaled hold-out matrix as header-less CSV, one row per sample.
# newline='' hands line-ending control to the csv module, as its docs require;
# without it, Windows writes '\r\r\n' and every row is followed by a blank line.
with open('../data/X_test_sc.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(X_test_sc)

In [71]:
# Write the variance-filtered (unscaled) training matrix as header-less CSV.
# newline='' hands line-ending control to the csv module, as its docs require;
# without it, Windows writes '\r\r\n' and every row is followed by a blank line.
with open('../data/X_train.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(X_train_var)

In [72]:
# Write the variance-filtered (unscaled) hold-out matrix as header-less CSV.
# newline='' hands line-ending control to the csv module, as its docs require;
# without it, Windows writes '\r\r\n' and every row is followed by a blank line.
with open('../data/X_test.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(X_test_var)

In [73]:
# Save the training targets with neither index nor header, i.e. values only.
y_train.to_csv('../data/y_train.csv', index=False, header=False)

In [74]:
# Save the hold-out targets with neither index nor header, i.e. values only.
y_test.to_csv('../data/y_test.csv', index=False, header=False)

In [75]:
# Persist the fitted scaler so the same scaling can be reapplied elsewhere.
with open('../assets/scaler.pkl', 'wb+') as scaler_file:
    pickle.dump(ss, scaler_file)

# Exporting my columns for future use

In [76]:
# The CSV exports carry no header row, so the names of the columns that
# survived the variance filter are pickled separately as a plain list.
with open('../assets/columns.pkl', 'wb+') as columns_file:
    numeric_columns = X_train.select_dtypes(exclude='object').columns
    kept_columns = list(numeric_columns[threshold.get_support()])
    pickle.dump(kept_columns, columns_file)

In [77]:
# Write the test frame, including the zero-filled columns added earlier in
# this notebook, to kaggle.csv.
test.to_csv('../data/kaggle.csv')

In [78]:
# Write the full training frame (features plus 'SalePrice'), including the
# zero-filled columns added earlier in this notebook, to df_clean_final.csv.
df.to_csv('../data/df_clean_final.csv')