In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib
import pandas as pd
import time
import logging

In [2]:
data = pd.read_csv('data_all.csv')
y_data = pd.DataFrame(data['status'])
x_data = data.drop(columns='status')
y_data = y_data.values  #转化成矩阵
x_data = x_data.values
y_data = y_data.ravel() #转成1维数组

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=2018)
print(data.shape)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(4754, 85)
(3327, 84) (3327,) (1427, 84) (1427,)


# LR

In [40]:
from sklearn.linear_model import  LogisticRegression

t_start = time.time()

lr = LogisticRegression(penalty='l1', class_weight='balanced', max_iter=100, random_state=2018)
lr.fit(X=x_train, y=y_train)
y_valid = lr.predict_proba(x_test)
y_valid = pd.DataFrame(y_valid)
y_valid = y_valid[0]
roc_score = roc_auc_score(y_test, y_valid)
print(roc_score)
joblib.dump(lr, 'lr_model.m')

t_end = time.time()
print('LR训练结束,耗时:{}min'.format((t_end - t_start) / 60))

0.23474747790888129
LR训练结束,耗时:0.2152789831161499min


# SVM

In [4]:
from sklearn.svm import SVC

t_start = time.time()

svc = SVC(class_weight='balanced', probability=True, random_state=2018)
svc.fit(X=x_train, y=y_train)

y_valid = svc.predict_proba(x_test)
y_valid = pd.DataFrame(y_valid)
y_valid = y_valid[0]

roc_score = roc_auc_score(y_test, y_valid)
print(roc_score)
joblib.dump(svc, 'svc_model.m')

t_end = time.time()
print('SVM训练结束,耗时:{}min'.format((t_end - t_start) / 60))

0.5
SVM训练结束,耗时:0.6427367647488912min


# Tree

In [6]:
from sklearn.tree import DecisionTreeClassifier

t_start = time.time()

dtree = DecisionTreeClassifier(max_depth=30, min_samples_split=3, max_features='log2', random_state=2018, \
                               max_leaf_nodes=8, class_weight='balanced')
dtree.fit(X=x_train, y=y_train)

y_valid = dtree.predict_proba(X=x_test)
y_valid = pd.DataFrame(y_valid)
y_valid = y_valid[0]

roc_score = roc_auc_score(y_test, y_valid)
print(roc_score)
joblib.dump(dtree, 'dtree_model.m')
t_end = time.time()
print('Tree训练结束,耗时:{}min'.format((t_end - t_start) / 60))

0.30753341053488154
Tree训练结束,耗时:0.0006667017936706543min
