In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Suppress all warning messages.
# NOTE(review): this blanket filter also hides numpy RuntimeWarnings, e.g. from
# np.log on non-positive predictions in the metric cells further down.
import warnings
warnings.filterwarnings("ignore")

## 数据集的准备

In [18]:
# Load the housing table and pull out SalePrice as the regression target.
train = pd.read_csv('datas/house_data.csv')
y = train['SalePrice']
train.shape

(1460, 82)

In [20]:
# Remove the Id column and the SalePrice target from the feature frame.
train1 = train.drop(['Id', 'SalePrice'], axis=1)
train1.shape

(1460, 80)

In [19]:
# One-hot encode the categorical columns: every column becomes numeric and the
# original categorical columns are replaced by their indicator columns.
X = pd.get_dummies(train1).reset_index(drop=True)
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test split; random_state fixes the shuffle.
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2,random_state=123)

In [12]:
X_train.shape

(1168, 303)

In [13]:
X_test.shape

(292, 303)

<br><br>

## 基础线性回归

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error  # mean squared error (MSE), not variance

In [23]:
lm=LinearRegression()

In [24]:
lm.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [25]:
pred=lm.predict(X_test)

In [26]:
# RMSE in log space: sqrt of the MSE between log(y_test) and log(pred).
np.sqrt(mean_squared_error(np.log(y_test), np.log(pred)))

0.12627809622157107

In [27]:
# RMSE in the original SalePrice units.
np.sqrt(mean_squared_error(y_test, pred))

24973.913406557556

In [28]:
def benchmark(model):
    """Score a fitted model by log-RMSE on the notebook-level test split.

    Predictions are made on the global ``X_test`` and compared with the
    global ``y_test`` after taking the natural log of both.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.

    Returns
    -------
    The square root of the mean squared error between ``log(y_test)`` and
    ``log(predictions)``.
    """
    predicted = model.predict(X_test)
    # Mean squared error in log space; the root is taken on return.
    log_mse = mean_squared_error(np.log(y_test), np.log(predicted))
    return np.sqrt(log_mse)

In [29]:
benchmark(lm)

0.12627809622157107

## 数据预处理 Preprocessing

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

In [None]:
# Pipeline: RobustScaler on the features, then an ordinary LinearRegression.
lm_model = make_pipeline(RobustScaler(), LinearRegression())

In [None]:
lm_model.fit(X_train, y_train)

In [None]:
# 评测模型
benchmark(lm_model)

## Ridge Regression

朴素的Ridge回归

In [None]:
from sklearn.linear_model import Ridge

In [None]:
ridge_model = Ridge(alpha=0.1)

In [None]:
ridge_model.fit(X_train, y_train)

In [None]:
benchmark(ridge_model)

## 带有预处理（RobustScaler）的Ridge回归

In [None]:
# Ridge with alpha=0.1 again, this time with RobustScaler applied first.
ridge_model_pipe=make_pipeline(RobustScaler(), Ridge(alpha=0.1))

In [None]:
ridge_model_pipe.fit(X_train, y_train)

In [None]:
benchmark(ridge_model_pipe)

<br><br>

## 带有CV的回归

交叉验证（cross-validation）：用 K 折交叉验证在候选 alpha 中选择正则化强度。

In [None]:
from sklearn.model_selection import KFold

In [None]:
# 10-fold splitter with shuffling; random_state keeps the folds reproducible.
kfolds=KFold(n_splits=10, shuffle=True, random_state=123)

In [None]:
from sklearn.linear_model import RidgeCV

In [None]:
r_alphas=[0.01, 0.1, 1, 3, 5, 7, 10, 100]

In [None]:
# RobustScaler followed by RidgeCV, which chooses alpha from r_alphas using the kfolds splitter.
ridge_model_cv=make_pipeline(RobustScaler(), RidgeCV(alphas=r_alphas, cv=kfolds))

In [None]:
# RidgeCV()

In [None]:
ridge_model_cv.fit(X_train, y_train)

In [None]:
benchmark(ridge_model_cv)

In [None]:
def benchmark2(model, X_test, y_test):
    """Print the RMSE and log-RMSE of a fitted model on the given test data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test
        Feature matrix to predict on.
    y_test
        True target values aligned with ``X_test``.

    Notes
    -----
    Prints ``'Neg Value'`` when any prediction is negative. The log-RMSE is
    only defined for strictly positive predictions; when that does not hold
    it is reported as NaN so the plain RMSE is still printed, instead of the
    metric call failing on the NaN/inf produced by ``np.log``.
    """
    pred = np.asarray(model.predict(X_test))
    if (pred < 0.).any():
        print('Neg Value')
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    if (pred > 0.).all():
        lrmse = np.sqrt(mean_squared_error(np.log(y_test), np.log(pred)))
    else:
        # log of a non-positive prediction is NaN/-inf, which the metric rejects.
        lrmse = float('nan')

    print('RMSE:', rmse)
    print('LRMSE:', lrmse)

In [None]:
benchmark2(ridge_model_cv, X_test, y_test)

In [None]:
# Wider candidate grid for alpha (1e-4 up to 80); this replaces the earlier r_alphas list.
r_alphas=[.0001, .0003, .0005, .0007, .0009, .01, 0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 20, 30, 50, 60, 70, 80]

In [None]:
def ridge_train_test(alpha):
    """Fit a scaled Ridge model for one alpha and return its test log-RMSE.

    Builds a RobustScaler + RidgeCV pipeline restricted to the single
    candidate ``alpha``, fits it on the notebook-level ``X_train``/``y_train``,
    prints RMSE and log-RMSE via ``benchmark2``, and returns the log-RMSE on
    the notebook-level ``X_test``/``y_test``.

    Parameters
    ----------
    alpha
        Regularisation strength passed to RidgeCV as its only candidate.

    Returns
    -------
    The square root of the mean squared error between ``log(y_test)`` and
    the log of this model's predictions.
    """
    m = make_pipeline(RobustScaler(), RidgeCV(alphas=[alpha], cv=kfolds))
    m.fit(X_train, y_train)
    benchmark2(m, X_test, y_test)
    # Predict with the model fitted here. Without this local assignment the
    # name `pred` resolves to a notebook-level variable left over from an
    # earlier cell, so every alpha would get the same score.
    pred = m.predict(X_test)
    lrmse = np.sqrt(mean_squared_error(np.log(y_test), np.log(pred)))
    return lrmse

In [None]:
ridge_train_test(.0001)

In [None]:
# Evaluate every candidate alpha in order and collect its log-RMSE.
scores = [ridge_train_test(alpha) for alpha in r_alphas]

In [None]:
# The alpha grid spans several orders of magnitude, so a log-scaled x-axis
# keeps the small alphas from collapsing onto the left edge of the plot.
fig, ax = plt.subplots()
ax.plot(r_alphas, scores, marker='o')
ax.set_xscale('log')
ax.set(xlabel='alpha', ylabel='log-RMSE (test split)', title='Ridge: test log-RMSE vs. alpha');