In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix, f1_score, recall_score, precision_score, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

https://archive.ics.uci.edu/ml/datasets/Spambase

In [3]:
df = pd.read_csv('./spambase.data', header=None)

In [4]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [5]:
df[57].value_counts()

0    2788
1    1813
Name: 57, dtype: int64

In [6]:
def perform_cross_validation(classifier, df, target, cv=16, scoring="roc_auc", beta=1, rfe=None):
    X_train, X_test, y_train, y_test = train_test_split(df.drop(target, 1),
                                                        df[target], random_state=43)
    
    if rfe:
        X_train = select.transform(X_train)
        X_test = select.transform(X_test)
        
    cv_scores = cross_val_score(classifier, X_train, y_train, cv=cv, scoring=scoring)
    #запустим кросс-валидацию

    cv_score = np.mean(cv_scores)
    cv_score_std = np.std(cv_scores)
    print('CV score is {}+-{}'.format(cv_score, cv_score_std))

    #обучим пайплайн на всем тренировочном датасете
    classifier.fit(X_train, y_train)
    y_score = classifier.predict_proba(X_test)[:, 1]
    
    
    precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)
    fscore = (1+beta**2)*(precision * recall) / (beta**2*precision + recall)
    ix = np.argmax(fscore)
    print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))
    return cv_score, fscore[ix], precision[ix], recall[ix]

In [7]:
classifier = CatBoostClassifier(verbose = False)

In [9]:
result = perform_cross_validation(classifier, df, 57)

CV score is 0.9861585912123351+-0.00683633942871823
Best Threshold=0.367950, F-Score=0.954, Precision=0.943, Recall=0.966


Попытаемяся выянснить признаки наиболее влияющие на классификацию

In [52]:
select = RFE(RandomForestClassifier(n_estimators=50, random_state=42), n_features_to_select=45)

In [53]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=[57]), df[57], random_state=42)
select.fit(X_train, y_train)

RFE(estimator=RandomForestClassifier(n_estimators=50, random_state=42),
    n_features_to_select=45)

In [54]:
result_rfe = perform_cross_validation(classifier, df, 57, rfe=select)

CV score is 0.9861864792036407+-0.006752147617035473
Best Threshold=0.367377, F-Score=0.956, Precision=0.942, Recall=0.970


Наш feature engineering дал небольшой положительный результат

### Далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть

In [55]:
def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

In [56]:
mod_data = df.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 454/1813 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [57]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    4147
 1     454
Name: class_test, dtype: int64


In [58]:
x_data = mod_data.iloc[:,:-2].values # just the X 
x_data = select.transform(x_data)
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

### 1. random negative sampling

In [59]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(454, 59) (454, 59)


In [60]:
classifier.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)

y_predict = classifier.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results:
f1: 89.61%
roc: 93.45%
recall: 95.25%
precision: 84.60%


##### Поставим долю P в 10%

In [71]:
mod_data = df.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 10% of the positives marked
pos_sample_len = int(np.ceil(0.1 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 182/1813 as positives and unlabeling the rest


In [72]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    4419
 1     182
Name: class_test, dtype: int64


In [73]:
x_data = mod_data.iloc[:,:-2].values # just the X 
x_data = select.transform(x_data)
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [74]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(182, 59) (182, 59)


In [75]:
classifier.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)

y_predict = classifier.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results:
f1: 87.20%
roc: 90.90%
recall: 95.71%
precision: 80.09%


##### Поставим долю P в 50%

In [76]:
mod_data = df.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 50% of the positives marked
pos_sample_len = int(np.ceil(0.5 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 907/1813 as positives and unlabeling the rest


In [77]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    3694
 1     907
Name: class_test, dtype: int64


In [78]:
x_data = mod_data.iloc[:,:-2].values # just the X 
x_data = select.transform(x_data)
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [79]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(907, 59) (907, 59)


In [80]:
classifier.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)

y_predict = classifier.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results:
f1: 89.85%
roc: 95.05%
recall: 95.98%
precision: 84.45%


В случае unlabled результаты классификации хуже, чем на полностью размеченных данных. При этом они тем уже, чем меньшая доля P используется