In [1]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,\
LogisticRegression,ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing,\
fetch_20newsgroups_vectorized
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [2]:
import numpy as np
import pandas as pd

In [3]:
def evalue(model,y_test,x_test):
    y_predict=model.predict(x_test)
    mse=mean_squared_error(y_test,y_predict)
    mae=mean_absolute_error(y_test,y_predict)
    score=model.score(x_test,y_test)
    r2score=r2_score(y_test,y_predict)
    print("mse:{} \t mae:{}\t score:{}\t r2_score:{}\n".format(mse,mae,score,r2score))

## 导入房价数据

In [4]:
data=fetch_california_housing()
data_x=data.data
data_y=data.target
feature=data.feature_names

In [5]:
x_train,x_test,y_train,y_test=train_test_split(\
                        data_x,data_y,test_size=0.2,random_state=1)

In [6]:
show_data=pd.DataFrame(data_x,columns=feature)

In [7]:
show_data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


## 定义模型

In [8]:
lr=LinearRegression()
l_ridge=Ridge(alpha=1.0)
l_lasso=Lasso(alpha=0.1)
l_elas=ElasticNet(random_state=0)


In [9]:
lr.fit(x_train,y_train)
l_ridge.fit(x_train,y_train)
l_lasso.fit(x_train,y_train)
l_elas.fit(x_train,y_train)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=0, selection='cyclic', tol=0.0001, warm_start=False)

In [10]:
evalue(lr,y_test,x_test)
evalue(l_ridge,y_test,x_test)
evalue(l_lasso,y_test,x_test)
evalue(l_elas,y_test,x_test)

mse:0.5291402345397405 	 mae:0.5328685121248196	 score:0.596596837481228	 r2_score:0.596596837481228

mse:0.5291416267227023 	 mae:0.53287857441135	 score:0.5965957761160681	 r2_score:0.5965957761160681

mse:0.6007141890466327 	 mae:0.5783835328131752	 score:0.5420306606582358	 r2_score:0.5420306606582358

mse:0.751378625967539 	 mae:0.6759151237526986	 score:0.42716789580749576	 r2_score:0.4271678958074957



## 参数调优

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
parameter={"alpha":[0.1,0.5,1.0,0.05],"max_iter":[100,500,1000]}
# 损失函数中的alpha选择 和最大迭代次数选择
clf=GridSearchCV(Lasso(),parameter,cv=5)

In [13]:
clf.fit(x_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.1, 0.5, 1.0, 0.05], 'max_iter': [100, 500, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
evalue(clf,y_test,x_test)

mse:0.5535382433525465 	 mae:0.5496426955000939	 score:0.5779964112202995	 r2_score:0.5779964112202995



In [15]:
# 自动调优后的最佳参数选择
print("best params:{}".format(clf.best_params_))

best params:{'alpha': 0.05, 'max_iter': 100}


# logistic 回归模型

## 导入新闻数据

In [16]:
data_logistic=fetch_20newsgroups_vectorized()
data_log_x=data_logistic.data
data_log_y=data_logistic.target
names=data_logistic.target_names

In [17]:
names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [18]:
show=pd.DataFrame(list(data_log_x))

In [19]:
data_log_x.shape

(11314, 130107)

In [20]:
#拆分训练测试集
x_train,x_test,y_train,y_test=train_test_split(data_log_x,data_log_y,test_size=0.2,random_state=1)

In [21]:
#定义模型
logist=LogisticRegression()

In [22]:
logist.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
evalue(logist,y_test,x_test)

mse:11.820150243040212 	 mae:1.2342023862129916	 score:0.7817057003977022	 r2_score:0.6188089405616757

