## 导入数据

In [1]:
#导入相关python库
#coding:utf-8 
import warnings
warnings.filterwarnings('ignore')#忽略一些警告

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the training and test sets, and remember the number of training rows
# so the combined feature frame can be split back apart later.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
train_len = train.shape[0]


In [3]:
# Preview the first rows of the training set.
train.head()

Unnamed: 0,时间,小区名,小区房屋出租数量,楼层,总楼层,房屋面积,房屋朝向,居住状态,卧室数量,厅的数量,卫的数量,出租方式,区,位置,地铁线路,地铁站点,距离,装修情况,月租金
0,1,3072,0.128906,2,0.236364,0.008628,东南,,1,1,1,,11.0,118.0,2.0,40.0,0.764167,,5.602716
1,1,3152,0.132812,1,0.381818,0.017046,东,,1,0,0,,10.0,100.0,4.0,58.0,0.709167,,16.977929
2,1,5575,0.042969,0,0.290909,0.010593,东南,,2,1,2,,12.0,130.0,5.0,37.0,0.5725,,8.998302
3,1,3103,0.085938,2,0.581818,0.019199,南,,3,2,2,,7.0,90.0,2.0,63.0,0.658333,,5.602716
4,1,5182,0.214844,0,0.545455,0.010427,东北,,2,1,1,,3.0,31.0,,,,,7.300509


In [4]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,id,时间,小区名,小区房屋出租数量,楼层,总楼层,房屋面积,房屋朝向,居住状态,卧室数量,厅的数量,卫的数量,出租方式,区,位置,地铁线路,地铁站点,距离,装修情况
0,1,4,6011,0.382812,1,0.6,0.007117,东,3.0,2,1,1,1.0,10.0,5.0,,,,6.0
1,2,4,1697,0.152344,1,0.472727,0.007448,东,,2,1,1,,3.0,0.0,,,,
2,3,4,754,0.207031,2,0.709091,0.014068,东南,,3,2,2,,10.0,9.0,4.0,74.0,0.400833,
3,4,4,1285,0.011719,0,0.090909,0.008937,南,,2,1,1,,6.0,96.0,5.0,17.0,0.384167,
4,5,4,4984,0.035156,1,0.218182,0.008606,东南,,2,1,1,,6.0,61.0,3.0,114.0,0.598333,


In [5]:
# Stack the training and test rows into one frame (fresh 0..n-1 index), then
# remove the target column and the test-only "id" column so that only
# features remain.
all_data = pd.concat([train, test], axis=0, ignore_index=True)
all_data = all_data.drop(columns=["月租金", "id"])


## 查看房价分布

# Compare the rent distribution in the training set: raw values on the left,
# log1p-transformed values on the right. Each legend reports the skewness.
raw_rent = train['月租金']
log_rent = np.log1p(raw_rent)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

g1 = sns.distplot(raw_rent, hist=True, label='skewness:{:.2f}'.format(raw_rent.skew()), ax=ax1)
g1.legend()
g1.set(xlabel='Price')

g2 = sns.distplot(log_rent, hist=True, label='skewness:{:.2f}'.format(log_rent.skew()), ax=ax2)
g2.legend()
g2.set(xlabel='log(Price+1)')

plt.show()


# The rent target is skewed, so replace it with its log1p transform.
# NOTE(review): this overwrites train['月租金'] in place, so running the cell
# twice applies the transform twice.
train['月租金'] = np.log1p(train['月租金'])

# Apply log1p to every non-object feature whose skewness (NaNs ignored)
# exceeds 0.75. Skewness is evaluated per column, before any transform.
num_features_list = all_data.dtypes[all_data.dtypes != "object"].index.tolist()
skewed_features = [
    name for name in num_features_list
    if all_data[name].dropna().skew() > 0.75
]
for name in skewed_features:
    all_data[name] = np.log1p(all_data[name])

# One-hot encode the remaining object-typed columns.
# NOTE(review): a later cell one-hot encodes '房屋朝向' by name; if this call
# has already expanded that column the later lookup cannot find it — confirm
# which of the two encoding steps is intended.
all_data = pd.get_dummies(all_data)

In [6]:
# Count the missing values in each column of the combined feature frame.
all_data.isnull().sum()

位置              41
出租方式        223617
区               41
卧室数量             0
卫的数量             0
厅的数量             0
地铁站点        134546
地铁线路        134546
小区名              0
小区房屋出租数量      1023
居住状态        228197
总楼层              0
房屋朝向             0
房屋面积             0
时间               0
楼层               0
装修情况        230119
距离          134546
dtype: int64

In [6]:
from scipy.stats import mode

In [None]:
# Fill missing values. Code-like / categorical columns receive the column's
# most frequent value; continuous columns receive the column mean.
#
# pandas' Series.mode() ignores NaN by default, so the fill value is always an
# observed value. The previous scipy.stats.mode(...).mode[0] form propagates
# NaN by default in recent SciPy releases and returns a scalar for 1-D input,
# which cannot be indexed with [0].
mode_filled_cols = ['装修情况', '居住状态', '出租方式', '地铁线路', '地铁站点', '区', '位置']
mean_filled_cols = ['距离', '小区房屋出租数量']

for col in mode_filled_cols:
    observed_modes = all_data[col].mode()
    # An all-NaN column has no mode; leave it untouched rather than crash.
    if not observed_modes.empty:
        all_data[col] = all_data[col].fillna(observed_modes.iloc[0])

for col in mean_filled_cols:
    all_data[col] = all_data[col].fillna(all_data[col].mean())

In [9]:
# Columns whose values are treated as category codes. Each one is replaced by
# indicator columns named "<column>#<value>", appended after the remaining
# columns in the order listed here.
ordinal_cols = ['区','地铁线路','卧室数量','厅的数量','卫的数量','房屋朝向']

indicator_frames = [
    pd.get_dummies(all_data[col], drop_first=False).add_prefix("{}#".format(col))
    for col in ordinal_cols
]
all_data = pd.concat([all_data.drop(columns=ordinal_cols)] + indicator_frames, axis=1)

In [10]:
# Add squared and cubed floor-area terms so that models can capture a
# non-linear effect of area. Vectorized arithmetic replaces the per-element
# apply(lambda ...) calls.
all_data['房屋面积的平方'] = all_data['房屋面积'] ** 2
all_data['房屋面积的立方'] = all_data['房屋面积'] ** 3

In [11]:
# Re-check the per-column missing-value counts after imputation and encoding.
print(all_data.isnull().sum()) 

位置                0
出租方式              0
地铁站点              0
小区名               0
小区房屋出租数量          0
居住状态              0
总楼层               0
房屋面积              0
时间                0
楼层                0
装修情况              0
距离                0
区#0.0             0
区#1.0             0
区#2.0             0
区#3.0             0
区#4.0             0
区#6.0             0
区#7.0             0
区#8.0             0
区#9.0             0
区#10.0            0
区#11.0            0
区#12.0            0
区#13.0            0
区#14.0            0
地铁线路#1.0          0
地铁线路#2.0          0
地铁线路#3.0          0
地铁线路#4.0          0
                 ..
房屋朝向#北 西          0
房屋朝向#南            0
房屋朝向#南 东          0
房屋朝向#南 东北         0
房屋朝向#南 东南         0
房屋朝向#南 北          0
房屋朝向#南 北 东北       0
房屋朝向#南 西          0
房屋朝向#南 西 北        0
房屋朝向#南 西北         0
房屋朝向#南 西南         0
房屋朝向#南 西南 北       0
房屋朝向#南 西南 西       0
房屋朝向#南 西南 西 西北    0
房屋朝向#西            0
房屋朝向#西 北          0
房屋朝向#西 西北         0
房屋朝向#西 西北 北       0
房屋朝向#西北           0


In [12]:
# Split the combined feature frame back into its training and test rows (the
# first train_len rows came from the training set) and take the target from
# the training frame. The shapes are printed as a sanity check.
X_train, X_test = all_data.iloc[:train_len], all_data.iloc[train_len:]
Y_train = train['月租金']

for size_info in (all_data.shape, train_len, X_train.shape, X_test.shape, Y_train.shape):
    print(size_info)

(252818, 128)
196539
(196539, 128)
(56279, 128)
(196539,)


In [1]:
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global RNG before fitting.
np.random.seed(21)

# Fit a 50-tree random forest on every feature.
rf_reg = RandomForestRegressor(n_estimators=50)
rf_reg.fit(X_train, Y_train)

# Pair each column name with its importance score and rank the pairs from most
# to least important.
importance_pairs = [
    [name, score]
    for name, score in zip(X_train.columns, rf_reg.feature_importances_)
]
feature_importance = pd.DataFrame(importance_pairs, columns=['feature', 'importance'])
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

  _nan_object_mask = _nan_object_array != _nan_object_array


ImportError: cannot import name '_joblib_parallel_args'

In [14]:
# Show the ten features with the highest random-forest importance.
feature_importance.head(10)

Unnamed: 0,feature,importance
7,房屋面积,0.280284
126,房屋面积的平方,0.166738
6,总楼层,0.09288
3,小区名,0.053831
0,位置,0.052351
23,区#12.0,0.043882
4,小区房屋出租数量,0.042568
11,距离,0.035452
21,区#10.0,0.029187
2,地铁站点,0.023706


In [16]:
# Keep only the names of features whose importance exceeds 0.001.
filter_feature = feature_importance.loc[
    feature_importance['importance'] > 0.001, 'feature'
].tolist()

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def simple_linear_model(X_train, Y_train, X_test, input_feature):
    """Fit a linear regression on selected columns and write test predictions.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    Y_train : array-like
        Training target, aligned with ``X_train``.
    X_test : pandas.DataFrame
        Test features; must contain the same selected columns.
    input_feature : list of str
        Column names used as model inputs.

    Returns
    -------
    int
        Always 0. The predictions are printed and written to ``result.csv``
        together with the ``id`` column of the notebook-level ``test`` frame.

    Raises
    ------
    KeyError
        If any name in ``input_feature`` is not a column of the frames.
    """
    # Column selection via indexing replaces DataFrame.as_matrix, which newer
    # pandas no longer provides. The same flat list of names is used for both
    # fit and predict; the prediction call previously wrapped the list in a
    # second list.
    reg = LinearRegression()
    reg.fit(X_train[input_feature].values, Y_train)
    Y_test = reg.predict(X_test[input_feature].values)
    print(Y_test)
    print(Y_test.shape)
    # NOTE(review): predictions are on the same scale as Y_train; if the target
    # was log1p-transformed upstream, apply np.expm1 before submitting — confirm.
    result = pd.DataFrame({"id": test.id, "price": Y_test})
    result.to_csv('result.csv', index=False)
    return 0

# Base features requested for the linear model. Several of them ('区',
# '地铁线路', '卧室数量', '厅的数量', '卫的数量') were replaced by
# "<name>#<value>" indicator columns in the one-hot step, so each base name is
# expanded to whichever columns of X_train now carry it.
base_features = ['房屋面积','卫的数量','区','位置','装修情况','出租方式','卧室数量','厅的数量','地铁站点',
                 '地铁线路','小区名','小区房屋出租数量','居住状态','总楼层','楼层','距离']
linear_features = [
    col for col in X_train.columns
    if col.split('#')[0] in base_features
]

simple_linear_model(X_train, Y_train, X_test, linear_features)

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
np.random.seed(21)

def other_regression_model(X_train, Y_train, X_test, input_feature):
    """Fit a random forest on selected columns and write test predictions.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    Y_train : array-like
        Training target, aligned with ``X_train``.
    X_test : pandas.DataFrame
        Test features; must contain the same selected columns.
    input_feature : list of str
        Column names used as model inputs.

    Returns
    -------
    int
        Always 0. The predictions are printed and written to ``result.csv``
        together with the ``id`` column of the notebook-level ``test`` frame.

    Raises
    ------
    KeyError
        If any name in ``input_feature`` is not a column of the frames.
    """
    # Only the random forest is fitted; the AdaBoost and gradient-boosting
    # estimators that used to be constructed here were never used.
    reg = RandomForestRegressor(n_estimators=100)

    # Column selection via indexing replaces DataFrame.as_matrix, which newer
    # pandas no longer provides.
    reg.fit(X_train[input_feature].values, Y_train)
    Y_test = reg.predict(X_test[input_feature].values)

    print(Y_test)
    print(Y_test.shape)
    # NOTE(review): predictions are on the same scale as Y_train; if the target
    # was log1p-transformed upstream, apply np.expm1 before submitting — confirm.
    result = pd.DataFrame({"id": test.id, "price": Y_test})
    result.to_csv('result.csv', index=False)
    return 0
   
# Train the random forest on the importance-filtered features and write result.csv.
other_regression_model(X_train, Y_train, X_test, filter_feature)


[  4.23146576   5.57894737  12.25806452 ...,   7.68534239  11.68421053
   4.79864177]
(56279,)


0

In [9]:
from sklearn.linear_model import Ridge, LassoCV
from sklearn.model_selection import cross_val_score

# Cross-validation helper: model fit is judged by root-mean-squared error.
def rmse_cv(model):
    """Return the per-fold RMSE of ``model`` under 5-fold cross-validation.

    The model is evaluated on the notebook-level ``X_train`` / ``Y_train``.
    scikit-learn reports negated MSE for this scorer, so the sign is flipped
    before taking the square root.
    """
    neg_mse_per_fold = cross_val_score(
        model, X_train, Y_train, scoring='neg_mean_squared_error', cv=5
    )
    return np.sqrt(-neg_mse_per_fold)


In [10]:
# Ridge regression: compute the mean 5-fold CV RMSE for each candidate alpha.
model_ridge = Ridge()
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
mean_rmse_per_alpha = list(map(lambda a: rmse_cv(Ridge(alpha=a)).mean(), alphas))
cv_ridge = pd.Series(mean_rmse_per_alpha, index=alphas)

# Plot the cross-validation score against alpha.
fig, ax = plt.subplots(figsize=(8, 5))
cv_ridge.plot(ax=ax, title='Cross Validation Score with Model Ridge')
ax.set_xlabel("alpha")
ax.set_ylabel("rmse")
plt.show()


SyntaxError: invalid syntax (<ipython-input-10-b98a8834cc69>, line 2)

In [None]:
# Smallest mean CV RMSE over the alpha grid. The alpha that attains it is
# cv_ridge.idxmin().
# NOTE(review): the original note said the minimum occurs at alpha = 0, but 0
# is not among the alphas evaluated above — confirm.
cv_ridge.min()

In [11]:
# Lasso with alpha chosen by LassoCV from a small grid: fit on the full
# training set, then report its mean 5-fold CV RMSE.
# NOTE(review): the original note says Lasso was selected because its mean
# RMSE is lower than Ridge's; the recorded run of this cell failed, so that
# comparison cannot be confirmed from the outputs.
lasso_alpha_grid = [1, 0.1, 0.001, 0.0005]
model_lasso = LassoCV(alphas=lasso_alpha_grid)
model_lasso.fit(X_train, Y_train)
rmse_cv(model_lasso).mean()

ValueError: could not convert string to float: '南'

In [None]:
# Inspect the fitted coefficients. Lasso performs feature selection by driving
# the coefficients of unhelpful features to exactly zero.
coef = pd.Series(model_lasso.coef_, index=X_train.columns)
n_kept = (coef != 0).sum()
n_eliminated = (coef == 0).sum()
print("Lasso picked {} variables and eliminated the other {} variables".format(n_kept, n_eliminated))


In [12]:
# Plot the ten most negative and the ten most positive Lasso coefficients.
ranked_coef = coef.sort_values()
imp_coef = pd.concat([ranked_coef.head(10), ranked_coef.tail(10)])

fig, ax = plt.subplots(figsize=(6, 8))
imp_coef.plot(kind="barh", ax=ax)
ax.set_title("Coefficients in the Lasso Model")
plt.show()


NameError: name 'coef' is not defined

In [13]:
# Residual plot: fitted value on the x-axis, (true - fitted) on the y-axis.
fitted_values = model_lasso.predict(X_train)
est = pd.DataFrame({"est": fitted_values, "true": Y_train})
est["resi"] = est["true"] - est["est"]

plt.rcParams["figure.figsize"] = [6, 6]
est.plot.scatter(x="est", y="resi")
plt.show()


NameError: name 'model_lasso' is not defined

In [14]:
# Predict on the test set with the Lasso model and undo the log1p transform of
# the target with expm1.
log_scale_preds = model_lasso.predict(X_test)
lasso_preds = np.expm1(log_scale_preds)

predictions = pd.DataFrame({"lasso": lasso_preds})


NameError: name 'model_lasso' is not defined

In [15]:

# Write the submission file: one row per test id with its predicted price.
preds = lasso_preds
result = pd.DataFrame({"id": test["id"], "price": preds}, columns=["id", "price"])
result.to_csv('result.csv', index=False)


NameError: name 'lasso_preds' is not defined