In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error
import pickle
import csv
np.random.seed(42)
%matplotlib inline

# Load in cleaned train data and cleaned test data

In [2]:
df = pd.read_csv('../data/clean_data.csv', index_col = 'Id') 

In [3]:
test = pd.read_csv('../data/clean_data_test.csv' ,index_col = 'Id')

In [4]:
df.head()

Unnamed: 0_level_0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,Yr Sold_2010,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,533352170,60,RL,0.0,13517,Pave,No Alley,IR1,Lvl,AllPub,...,1,0,0,0,0,0,0,0,0,1
544,531379050,60,RL,43.0,11492,Pave,No Alley,IR1,Lvl,AllPub,...,0,0,0,0,0,0,0,0,0,1
153,535304180,20,RL,68.0,7922,Pave,No Alley,Reg,Lvl,AllPub,...,1,0,0,0,0,0,0,0,0,1
318,916386060,60,RL,73.0,9802,Pave,No Alley,Reg,Lvl,AllPub,...,1,0,0,0,0,0,0,0,0,1
255,906425045,50,RL,82.0,14235,Pave,No Alley,IR1,Lvl,AllPub,...,1,0,0,0,0,0,0,0,0,1


In [5]:
test.head()

Unnamed: 0_level_0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,0,0,0,0,0,0,0,1
2718,905108090,90,RL,0.0,9662,Pave,No Alley,IR1,Lvl,AllPub,...,0,0,0,0,0,0,0,0,0,1
2414,528218130,60,RL,58.0,17104,Pave,No Alley,IR1,Lvl,AllPub,...,0,0,0,0,0,0,1,0,0,0
1989,902207150,30,RM,60.0,8520,Pave,No Alley,Reg,Lvl,AllPub,...,0,0,0,0,0,0,0,0,0,1
625,535105100,20,RL,0.0,9500,Pave,No Alley,IR1,Lvl,AllPub,...,0,0,0,0,0,0,0,0,0,1


# Creating a function that spits out a set difference

In [6]:
def Diff(li1, li2):
    return (list(set(li1) - set(li2)))

Creating columns in df found in test data but not found in train data

In [7]:
for x in Diff(list(test.columns), list(df.columns)):
    df[x] = 0

Creating columns in test dataframe found in train data but not in test data

In [8]:
for x in Diff(list(df.columns), list(test.columns)):
    if x != 'SalePrice':
        test[x] = 0

In [9]:
test.shape

(879, 445)

In [10]:
df.shape

(2051, 446)

# Setting my X and y

In [11]:
X = df.drop('SalePrice', 1)
y = df['SalePrice']

# Train test split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state= 42)

# Dropping columns from train and test that are multicolinear with other columns

Total Bsmt SF is multicolinear with BsmtFin SF 1, and I split Overall Qual into categories along with Overall Cond

In [13]:
X_train.drop(['BsmtFin SF 1', 'Overall Qual','Overall Cond'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [14]:
X_test.drop(['BsmtFin SF 1', 'Overall Qual','Overall Cond'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


# Scaling my data

In [15]:
ss = StandardScaler()

In [16]:
X_train_sc = ss.fit_transform(X_train.select_dtypes(exclude = 'object'))
X_test_sc = ss.transform(X_test.select_dtypes(exclude = 'object'))

# Exporting my data and scaler for future use

In [17]:
with open('../data/X_train_sc.csv', 'w+') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(X_train_sc)

In [18]:
with open('../data/X_test_sc.csv', 'w+') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(X_test_sc)

In [19]:
with open('../data/X_train.csv', 'w+') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(X_train)

In [20]:
X_train.to_csv('../data/X_train.csv', index=False, header=False)

In [21]:
X_test.to_csv('../data/X_test.csv', index=False, header=False)

In [22]:
y_train.to_csv('../data/y_train.csv', index=False, header=False)

In [23]:
y_test.to_csv('../data/y_test.csv', index=False, header=False)

In [24]:
with open('../assets/scaler.pkl', 'wb+') as f:
    pickle.dump(ss, f)

# Exporting my columns for future use

In [25]:
with open('../assets/columns.pkl', 'wb+') as f:
    pickle.dump(list(X_train.select_dtypes(exclude = 'object').columns), f)

In [26]:
test.to_csv('../data/kaggle.csv')

In [27]:
df.to_csv('../data/df_clean_final.csv')