In [None]:
# Numerical and tabular libraries.
import numpy as np
import pandas as pd
# Plotting libraries.
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Show at most 18 columns when a DataFrame is displayed.
pd.set_option('display.max_columns',18)

1.读入数据

In [None]:
# Load the training and testing sets; each frame is copied right after it is read.
train = pd.read_csv('case2_training.csv').copy()
test = pd.read_csv('case2_testing.csv').copy()
# Preview the first rows of the training set.
train.head()

2.描述性统计分析

In [None]:
# (rows, columns) of the training set followed by the testing set.
tuple(frame.shape for frame in (train, test))


In [None]:
# Column labels of the training set followed by the testing set.
tuple(frame.columns for frame in (train, test))

In [None]:
# Per-column dtypes of the training set.
train.dtypes

In [None]:
# Print the DataFrame.info() summary of the training set.
train.info()

In [None]:
# Relative frequency of each value of the 'Accept' target.
train.Accept.value_counts(normalize=True)

In [None]:
# Bar chart of the raw count of each 'Accept' value.
train.Accept.value_counts().plot(kind='bar')

3.Independent Variable (Categorical)

In [None]:
# Visualize the categorical features: one normalized bar chart per column,
# placed in the first four slots of a 2x3 grid on a 20x10 inch figure.
categorical_columns = ['Region', 'Weekday', 'Apartment', 'Beds']
fig = plt.figure(figsize=(20, 10))
for slot, column in enumerate(categorical_columns, start=1):
    panel = fig.add_subplot(2, 3, slot)
    train[column].value_counts(normalize=True).plot.bar(ax=panel, title=column)

plt.show()

4.Independent Variable (Numerical)

In [None]:
# 'Review': distribution on the left, box plot on the right of a 16x5 inch figure.
fig = plt.figure(figsize=(16, 5))
sns.distplot(train['Review'], ax=fig.add_subplot(1, 2, 1))
train['Review'].plot.box(ax=fig.add_subplot(1, 2, 2))

plt.show()

In [None]:
# Distribution of 0.75 raised to the power of each 'Pic Quality' value.
a = 0.75 ** train['Pic Quality']
sns.distplot(a);

log变换

In [None]:
# Add the natural log of 'Review' as 'Review_log' to both the training and testing sets.
for frame in (train, test):
    frame['Review_log'] = np.log(frame['Review'])

In [None]:
# Distribution of the log-transformed 'Review' column: training set on the left,
# testing set on the right.
fig, (train_ax, ax1) = plt.subplots(1, 2, figsize=(14, 4))
sns.distplot(train['Review_log'], ax=train_ax)
train_ax.set_title("Train")

sns.distplot(test['Review_log'], ax=ax1)
ax1.set_title("Test")

In [None]:
from scipy import stats

# Box-Cox transform of 'Review'. With no lambda supplied, boxcox returns the
# transformed values together with the fitted lambda.
train['Review_box'], lambda_ = stats.boxcox(train['Review'])

# Compare the column before (left) and after (right) the transform. Previously
# only the untransformed column was drawn, so the result was never shown.
fig = plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
sns.distplot(train['Review'])
plt.subplot(1, 2, 2)
sns.distplot(train['Review_box'])
plt.show()

In [None]:
# 'Pic Quality': distribution on the left, box plot on the right of a 16x5 inch figure.
fig = plt.figure(figsize=(16, 5))
sns.distplot(train['Pic Quality'], ax=fig.add_subplot(1, 2, 1))
train['Pic Quality'].plot.box(ax=fig.add_subplot(1, 2, 2))

plt.show()

In [None]:
from scipy import stats

# Replace 'Pic Quality' with its Box-Cox transform and keep the fitted lambda.
# NOTE(review): the column is overwritten in place, so running this cell twice
# transforms already-transformed values; the `test` frame is not transformed here.
pic_quality_transformed, lambda_ = stats.boxcox(train['Pic Quality'])
train['Pic Quality'] = pic_quality_transformed

# Draw the transformed column in the left half of a 1x2 layout.
fig = plt.figure(figsize=(15, 5))
plt.subplot(121)
sns.distplot(train['Pic Quality'])

In [None]:
# Preview the first rows of the current training frame.
train.head()

In [None]:
# 'Price': distribution on the left, box plot on the right of a 16x5 inch figure.
fig = plt.figure(figsize=(16, 5))
sns.distplot(train['Price'], ax=fig.add_subplot(1, 2, 1))
train['Price'].plot.box(ax=fig.add_subplot(1, 2, 2))

plt.show()

box-cox变换

In [None]:
from scipy.stats import boxcox
import matplotlib.pyplot as plt

标准化'Price'变量

In [None]:
# Standardize 'Price' (subtract the mean, divide by the standard deviation) and plot it.
# The standardized values live in a local Series named 'normalized_price' rather than
# in a `train` column: the previous code read train['normalized_price'] although the
# line creating it was commented out, which raises a KeyError on a fresh kernel.
# `train` itself is left untouched, so the later modelling cells see the same features.
normalized_price = (
    (train['Price'] - train['Price'].mean()) / train['Price'].std()
).rename('normalized_price')

fig = plt.figure(figsize=(16, 5))
sns.distplot(normalized_price, ax=fig.add_subplot(1, 2, 1))
normalized_price.plot.box(ax=fig.add_subplot(1, 2, 2))
plt.show()

特征离散化

In [None]:

# Remove the 'ID' column from the training set (in place).
train.drop(columns='ID', inplace=True)
# Convert a day of the year (365 days) into its quarter (1, 2, 3 or 4).
def assign_label(h):
    """Return the quarter label for the day number ``h``.

    Days in [0, 92) map to 1, [92, 183) to 2, [183, 275) to 3 and
    [275, 366) to 4. Any other value (negative, 366 or above, NaN)
    yields ``None``.
    """
    if not h >= 0:
        return None
    # (exclusive upper bound, quarter) pairs checked in ascending order.
    quarter_bounds = ((92, 1), (183, 2), (275, 3), (366, 4))
    for upper_bound, quarter in quarter_bounds:
        if h < upper_bound:
            return quarter
    return None

# Add the quarter label derived from 'Date', then discard the raw 'Date' column.
train = train.assign(date_label=train["Date"].apply(assign_label)).drop(columns="Date")

# Group Sunday through Thursday into category 1, Friday and Saturday into category 2.
def assign_week(h):
    """Return 2 when ``5 <= h < 7``, 1 for any other ``h >= 0``, otherwise ``None``."""
    if 5 <= h < 7:
        return 2
    if h >= 0:
        return 1
    return None

# Replace the raw 'Weekday' column with its weekday/weekend category.
train["week_label"] = train["Weekday"].apply(assign_week)
del train['Weekday']
# One-hot encode the categorical 'Region' feature of the training set.
dum = pd.get_dummies(train['Region'], prefix="Region")
train = pd.concat([train, dum], axis=1)
del train['Region']

# Keep the transformed frame for the other algorithms. This is an independent copy:
# a plain assignment would only alias `train`, so any later in-place change to
# `train` would silently alter the saved frame as well.
transform_train = train.copy()

# Show the first five rows after the feature transformations.
print(train.head())


In [None]:
# Print the first rows of the saved transformed training frame.
print(transform_train.head())

用逻辑回归算法来训练模型

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Separate the features from the 'Accept' target. The column is dropped by keyword:
# the positional form drop('Accept', 1) is rejected by pandas 2.x, where `axis`
# is keyword-only.
X = train.drop(columns='Accept')
y = train.Accept
# adding dummies to the dataset
X = pd.get_dummies(X)
train = pd.get_dummies(train)
test = pd.get_dummies(test)
print(X.shape, train.shape, test.shape)
print(X.head())

# split the data into train and cross validation set
x_train, x_cv, y_train, y_cv = train_test_split(X, y, test_size=0.3, random_state=0)

# take a look at the dimension of the data
print(x_train.shape, x_cv.shape, y_train.shape, y_cv.shape)

# Fit the model; class 1 is weighted 2.3 against 1 for class 0.
penalty = {
    0: 1,
    1: 2.3
}
model = LogisticRegression(class_weight=penalty)
model.fit(x_train, y_train)
# Predict on the held-out split and report accuracy.
pred_cv = model.predict(x_cv)
print('logistic预测准确率：',accuracy_score(y_cv, pred_cv))

# Confusion matrix, printed and drawn as an annotated heatmap.
cm = confusion_matrix(y_cv, pred_cv)
print(cm)
sns.heatmap(cm, annot=True, fmt="d")
plt.title('Confusion matrix of the classifier')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Classification report for the held-out split.
print(classification_report(y_cv, pred_cv))

k 折交叉验证的逻辑回归

In [None]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation with shuffling.
mean_accuracy = []
kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

# `split` yields positional row numbers, so rows are selected with `.iloc`.
# Label-based selection (`.loc` / `y[...]`) only gives the same rows while the
# index happens to be 0..n-1.
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    model1 = LogisticRegression(random_state=1)
    model1.fit(xtr, ytr)
    pred_test = model1.predict(xvl)
    score = accuracy_score(yvl, pred_test)
    mean_accuracy.append(score)
    print('accuracy_score', score)
print("\n LR Mean validation accuracy: ", sum(mean_accuracy)/len(mean_accuracy))

使用决策树算法来训练模型

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict

# Rebind `train` to the transformed frame saved earlier.
train=transform_train

# Class weights: class 1 is weighted 2.3 against 1 for class 0.
penalty = {
    0: 1,
    1: 2.3
}
mean_accuracy = []
kf = StratifiedKFold(n_splits=5,random_state=1,shuffle=True)

# `split` yields positional row numbers, so rows are selected with `.iloc`.
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i,kf.n_splits))
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    model2 = DecisionTreeClassifier(random_state=1,class_weight=penalty)
    model2.fit(xtr, ytr)
    # Score the tree fitted on this fold. The previous code predicted with `model`,
    # the logistic regression from an earlier cell, so the reported accuracy did
    # not belong to the decision tree.
    pred_test = model2.predict(xvl)
    score = accuracy_score(yvl,pred_test)
    mean_accuracy.append(score)
    print('accuracy_score',score)

print("\nMean validation accuracy: ", sum(mean_accuracy)/len(mean_accuracy))

使用 CatBoost 算法来训练模型

In [None]:
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score

# 5-fold stratified cross-validation of a CatBoost classifier.
mean_accuracy = []
kf = StratifiedKFold(n_splits=5,random_state=1,shuffle=True)

# `split` yields positional row numbers, so rows are selected with `.iloc`.
# Label-based `.loc` only gives the same rows while the index happens to be 0..n-1.
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i,kf.n_splits))
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    # NOTE(review): this rebinds `model`, which earlier held the logistic regression.
    model = CatBoostClassifier(learning_rate=0.03)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = accuracy_score(yvl,pred_test)
    mean_accuracy.append(score)
    print('accuracy_score',score)

print("\n CatBoost Mean validation accuracy: ", sum(mean_accuracy)/len(mean_accuracy))