# AUC with Python
    1.几何意义
    2.概率意义--排序-》统计
    3.概率意义--直方图-》统计

# 数据准备

In [8]:
import random
import time
import pandas as pd
import numpy as np

In [9]:
def gen_label_pred(n_sample):
    """
    随机生成n个样本的标签和预测值
    """
    labels = [0 for _ in range(n_sample//2)] + [1 for _ in range(n_sample//2)]
    preds = [random.random()*0.7 for _ in range(n_sample//2)] + [(random.random() + 1)/2 for _ in range(n_sample//2)]


    return labels,preds


In [3]:
def timeit(func):
    """
    装饰器，计算函数执行时间
    """
    def wrapper(*args, **kwargs):
        time_start = time.time()
        result = func(*args, **kwargs)
        time_end = time.time()
        exec_time = time_end - time_start
        print("{function} exec time: {time}s".format(function=func.__name__,time=exec_time))
        return result
    return wrapper

# 法1 -- 面积法

In [10]:
@timeit
def area_auc(labels,preds):

    data=pd.DataFrame(index=range(0,n_sample),columns=('probability','label'))
    data['label']=np.array(labels)
    data['probability']=np.array(preds)
    
    #计算 TN/FP/TP/FN
    cm=np.arange(4).reshape(2,2)
    cm[0,0]=len(data[data['label']==0][data['probability']<0.5]) #TN
    cm[0,1]=len(data[data['label']==0][data['probability']>=0.5])#FP
    cm[1,0]=len(data[data['label']==1][data['probability']<0.5]) #FN
    cm[1,1]=len(data[data['label']==1][data['probability']>=0.5])#TP

    #计算TPR,FPR
    data.sort_values('probability',inplace=True,ascending=False)
    TPRandFPR=pd.DataFrame(index=range(len(data)),columns=('TPR','FPR'))
    for j in range(len(data)):
        #以每一个概率为分类的阈值，统计此时正例和反例的数量，
        data1=data.head(n=j+1)

        FPR=len(data1[data1['label']==0])/float(len(data[data['label']==0]))  #假正例的数量/反例的数量
        TPR=len(data1[data1['label']==1])/float(len(data[data['label']==1])) #真正例的数量/正例的数量

        TPRandFPR.iloc[j]=[TPR,FPR]

    # 计算AUC  ,计算小矩形的面积之和
    auc = 0.
    prev_x = 0
    for x,y in zip(TPRandFPR.FPR,TPRandFPR.TPR):
        if x != prev_x:
            auc += (x - prev_x) * y
            prev_x = x

    return auc

# 法二:概率意义--排序+统计

In [5]:
@timeit
def naive_auc(labels,preds):
    """
      先排序，然后统计有多少正负样本对满足：正样本预测值>负样本预测值, 再除以总的正负样本对个数
     复杂度 O(NlogN), N为样本数
    """
    n_pos = sum(labels)
    n_neg = len(labels) - n_pos
    total_pair = n_pos * n_neg
    
    labels_preds = zip(labels,preds)
    labels_preds = sorted(labels_preds,key=lambda x:x[1]) #对概率值排序
    accumulated_neg = 0 #统计负样本出现的个数
    satisfied_pair = 0  #统计满足条件的样本对的个数
    for i in range(len(labels_preds)):
        if labels_preds[i][0] == 1:
            satisfied_pair += accumulated_neg
        else:
            accumulated_neg += 1

    return satisfied_pair / float(total_pair)

# 法三 -- 概率意义--直方图+统计
![image.png](attachment:image.png)

In [6]:
@timeit
def approximate_auc(labels,preds,n_bins=100):
    """
    近似方法，将预测值分桶(n_bins)，对正负样本分别构建直方图，再统计满足条件的正负样本对
    复杂度 O(N)
    这种方法有什么缺点？怎么分桶？一般100-1000效果都不错
    
    """
    n_pos = sum(labels)
    n_neg = len(labels) - n_pos
    total_pair = n_pos * n_neg
    
    pos_histogram = [0 for _ in range(n_bins)]
    neg_histogram = [0 for _ in range(n_bins)]
    bin_width = 1.0 / n_bins
    for i in range(len(labels)):
        nth_bin = int(preds[i]/bin_width)
        if labels[i]==1:
            pos_histogram[nth_bin] += 1
        else:
            neg_histogram[nth_bin] += 1
    
    accumulated_neg = 0
    satisfied_pair = 0
    for i in range(n_bins):
        satisfied_pair += (pos_histogram[i]*accumulated_neg + pos_histogram[i]*neg_histogram[i]*0.5)
        accumulated_neg += neg_histogram[i]
    
    return satisfied_pair / float(total_pair)

# 测试

In [15]:
if __name__ == "__main__":
    n_sample = 10000
    labels,preds = gen_label_pred(n_sample)
    area_auc_rst = area_auc(labels,preds)
    naive_auc_rst = naive_auc(labels,preds)
    approximate_auc_rst = approximate_auc(labels,preds)
    print ("area auc result:%f"%area_auc_rst)
    print ("naive auc result:%f"%naive_auc_rst)
    print ("approximate auc result:%f"%approximate_auc_rst)

    """
    area_auc exec time: 41.72503924369812s
    naive_auc exec time: 0.00701904296875s
    approximate_auc exec time: 0.004010915756225586s
    area auc result:0.945862
    naive auc result:0.945862
    approximate auc result:0.945906
    """

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


area_auc exec time: 41.72503924369812s
naive_auc exec time: 0.00701904296875s
approximate_auc exec time: 0.004010915756225586s
area auc result:0.945862
naive auc result:0.945862
approximate auc result:0.945906
