In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, Lasso, LinearRegression,LassoCV
from sklearn.metrics import mean_squared_error
#from sklearn.cross_validation import KFold
plt.style.use('ggplot')
%matplotlib inline

In [18]:
# Location of the Ames housing CSV used throughout this notebook.
# NOTE(review): this is an absolute path on one machine; consider a path
# relative to the repository so the notebook runs elsewhere.
AMES_CSV_PATH = r"C:\nydsa bootcamp slides\Projects\3\git\machine-learning_house-prices\Chaoran\AmesDummiesOrdinal.csv"

# The first CSV column becomes the row index.
df = pd.read_csv(AMES_CSV_PATH, index_col=0)

In [19]:
# (rows, columns) of the loaded frame, including the SalePrice column.
df.shape

(1458, 186)

In [4]:
# SalePrice is the response; every other column is a predictor.
SalePrice = df.SalePrice
lasso_df = df.drop('SalePrice', axis=1)

# Hold out 20% of the rows as a test set and train on the remaining 80%.
# The sizes are derived from the data instead of being hard-coded, so the
# split stays valid if the CSV gains or loses rows (1458 rows -> 292 test rows).
#
# Several seeds were tried; seed 19 gave the best overall result when alpha,
# score and RMSE are considered together. A different seed (i.e. a different
# split) changes the selected lambda and shifts the results slightly, probably
# because of outliers in some features. Cleaning the data further should make
# the result more stable.
SPLIT_SEED = 19
TEST_FRACTION = 0.2
n_rows = len(lasso_df)
n_test = int(round(TEST_FRACTION * n_rows))

np.random.seed(SPLIT_SEED)
testIdxes = np.random.choice(np.arange(n_rows), size=n_test, replace=False)
# setdiff1d returns the remaining row positions in sorted order, so the
# training-row order does not depend on set iteration order.
trainIdxes = np.setdiff1d(np.arange(n_rows), testIdxes).tolist()
# Every row must land in exactly one of the two partitions.
assert len(trainIdxes) + len(testIdxes) == n_rows

trainX = lasso_df.iloc[trainIdxes]
testX  = lasso_df.iloc[testIdxes]
trainY = SalePrice.iloc[trainIdxes]
testY  = SalePrice.iloc[testIdxes]

# Candidate penalties: 40 evenly spaced alphas between 0.001 and 200.
alphaRange = np.linspace(1e-3, 200, 40)

### Start by finding the best lambda (alpha), still using cross-validation

In [5]:
# 10-fold cross-validated Lasso over the candidate alpha grid; fit() returns
# the fitted estimator, so construction and fitting are chained.
lasso_cv = LassoCV(alphas=alphaRange, cv=10).fit(trainX, trainY)

# Keep the selected penalty, rounded to three decimals, for the final model.
lasso_best_alpha = round(lasso_cv.alpha_, 3)



In [6]:
# With 10-fold CV on the seed-19 split, the selected alpha is 66.667
# (see the cell output; the value changes with the seed/split).
lasso_best_alpha

66.667

In [7]:
# Score of the CV-selected model on the training rows themselves
# (scikit-learn regressors report R^2 here; in-sample, so optimistic).
lasso_cv.score(trainX,trainY)

0.9185382606328704

In [8]:
# Fit the final Lasso on the training rows using the (rounded) alpha chosen
# by cross-validation, then display the fitted estimator.
lasso = Lasso(alpha=lasso_best_alpha).fit(trainX, trainY)
lasso

Lasso(alpha=66.667, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [9]:
# Intercept term of the fitted Lasso model.
lasso_inte=lasso.intercept_
lasso_inte

-154670.2574514244

In [10]:
# Label each fitted coefficient with its feature name.
lasso_coef = pd.Series(lasso.coef_, index=lasso_df.columns)

# Rank features by coefficient magnitude, largest first.
lasso_coef_sorted = lasso_coef.abs().sort_values(ascending=False)

# Features that the Lasso penalty shrank to exactly zero.
is_zero = lasso_coef_sorted == 0
zero_coef = lasso_coef_sorted[is_zero]

In [11]:
# Number of features whose Lasso coefficient is exactly zero.
len(zero_coef)

79

### With seed 19, 79 columns have a zero coefficient and can be removed

In [12]:
# Predict sale prices for the held-out rows.
lasso_predict = lasso.predict(testX)

# Root-mean-squared error on the test set, in the units of SalePrice.
test_mse = mean_squared_error(testY, lasso_predict)
RMSE = np.sqrt(test_mse)
RMSE

22497.227012275238

In [13]:
# Save the zero-coefficient features (feature name -> |coefficient| = 0)
# to zero_coef_lasso.csv in the current working directory.
zero_coef.to_csv("zero_coef_lasso.csv")

In [14]:
# Remove every zero-coefficient feature from the full frame. SalePrice is not
# among them (the coefficients are indexed by lasso_df's columns), so it stays.
# The shape is checked in the next cell.
dropped_features = list(zero_coef.index)
leftcol = df.drop(columns=dropped_features)

In [15]:
# Shape after dropping the zero-coefficient columns; the column count should
# equal df's column count minus len(zero_coef).
leftcol.shape

(1458, 107)

In [16]:
# Shape of the original frame, shown for comparison with leftcol
# (df.drop returned a new frame, so df itself is unchanged).
df.shape

(1458, 186)

In [20]:
# Write the reduced frame (remaining features plus SalePrice) to lasso.csv
# in the current working directory.
leftcol.to_csv("lasso.csv")