1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
2. сделать feature engineering
3. обучить любой классификатор (какой вам нравится)
4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
5. применить random negative sampling для построения классификатора в новых условиях
6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
7. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)



In [50]:
import pandas as pd
import numpy as np
from prettytable import PrettyTable
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

In [51]:
def evaluate_results(y_test, y_predict):
    array = []
    array.append(f1_score(y_test, y_predict))
    array.append(roc_auc_score(y_test, y_predict))
    array.append(recall_score(y_test, y_predict, average='binary'))
    array.append(precision_score(y_test, y_predict, average='binary'))
    return array


In [52]:
# загружаем данные
data = pd.read_csv("datatraining.txt", header=0)
data

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
1,2015-02-04 17:51:00,23.18,27.2720,426.0,721.250000,0.004793,1
2,2015-02-04 17:51:59,23.15,27.2675,429.5,714.000000,0.004783,1
3,2015-02-04 17:53:00,23.15,27.2450,426.0,713.500000,0.004779,1
4,2015-02-04 17:54:00,23.15,27.2000,426.0,708.250000,0.004772,1
5,2015-02-04 17:55:00,23.10,27.2000,426.0,704.500000,0.004757,1
...,...,...,...,...,...,...,...
8139,2015-02-10 09:29:00,21.05,36.0975,433.0,787.250000,0.005579,1
8140,2015-02-10 09:29:59,21.05,35.9950,433.0,789.500000,0.005563,1
8141,2015-02-10 09:30:59,21.10,36.0950,433.0,798.500000,0.005596,1
8142,2015-02-10 09:32:00,21.10,36.2600,433.0,820.333333,0.005621,1


In [53]:
data['Occupancy'].value_counts()

0    6414
1    1729
Name: Occupancy, dtype: int64

### AdaBoost классификатор

In [54]:
# делаем feature engineering
X = data[['Temperature','Humidity','Light','CO2','HumidityRatio']]
X = StandardScaler().fit_transform(X)
y = data['Occupancy']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [55]:
# обучаем AdaBoost классификатор
clf = AdaBoostClassifier()
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)

In [56]:
score = []

In [57]:
score.append(['AdaBoostClassifier']+evaluate_results(y_test, y_predict))

### PU learning

In [58]:
mod_data = data.copy()
mod_data = mod_data.drop(columns='date')
#найдем индексы позитивных элементов
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#перемешаем их
np.random.shuffle(pos_ind)
# возьмем первые четверть позитивных индексов
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[1:pos_sample_len]

Using 433/1729 as positives and unlabeling the rest


In [59]:
# Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    7711
 1     432
Name: class_test, dtype: int64


In [60]:
mod_data

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,class_test
1,23.18,27.2720,426.0,721.250000,0.004793,1,1
2,23.15,27.2675,429.5,714.000000,0.004783,1,1
3,23.15,27.2450,426.0,713.500000,0.004779,1,1
4,23.15,27.2000,426.0,708.250000,0.004772,1,-1
5,23.10,27.2000,426.0,704.500000,0.004757,1,-1
...,...,...,...,...,...,...,...
8139,21.05,36.0975,433.0,787.250000,0.005579,1,-1
8140,21.05,35.9950,433.0,789.500000,0.005563,1,-1
8141,21.10,36.0950,433.0,798.500000,0.005596,1,1
8142,21.10,36.2600,433.0,820.333333,0.005621,1,-1


In [61]:
X_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [62]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(432, 7) (432, 7)


In [63]:
model = AdaBoostClassifier()

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
score.append(['PU learning']+evaluate_results(sample_test.iloc[:,-2].values, y_predict))

In [64]:
mytable = PrettyTable()
mytable.align = "r"
values = ['fscore', 'roc auc', 'recall', 'precision']
for val in values:
    mytable.float_format[val] = ".5"
# имена полей таблицы
mytable.field_names = ['algoritm'] + values 
# добавление данных по одной строке за раз
for i in range(2):
    mytable.add_row([elem for elem in score[i]])
# вывод таблицы 
print(mytable)

+--------------------+---------+---------+---------+-----------+
|      algoritm      |  fscore | roc auc |  recall | precision |
+--------------------+---------+---------+---------+-----------+
| AdaBoostClassifier | 0.97110 | 0.98791 | 0.98824 |  0.95455  |
|    PU learning     | 0.95044 | 0.98676 | 0.99357 |  0.91090  |
+--------------------+---------+---------+---------+-----------+


### Поэкспериментируем с долей P 

In [67]:
score = []
for c in range(1,6):
    mod_data = data.copy()
    mod_data = mod_data.drop(columns='date')
    pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
    np.random.shuffle(pos_ind)
    pos_sample_len = int(np.ceil(c * 0.1 * len(pos_ind)))
    pos_sample = pos_ind[:pos_sample_len]
    mod_data['class_test'] = -1
    mod_data.loc[pos_sample,'class_test'] = 1
    X_data = mod_data.iloc[:,:-2].values # just the X 
    y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
    y_positive = mod_data.iloc[:,-2].values # original class
    mod_data = mod_data.sample(frac=1)
    neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
    sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
    pos_sample = mod_data[mod_data['class_test']==1]
    sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)
    model = AdaBoostClassifier()
    model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
    y_predict = model.predict(sample_test.iloc[:,:-2].values)
    score.append(['Доля P '+str(c*10)+ '%']+evaluate_results(sample_test.iloc[:,-2].values, y_predict))
    
mytable = PrettyTable()
mytable.align = "r"
values = ['fscore', 'roc auc', 'recall', 'precision']
for val in values:
    mytable.float_format[val] = ".5"
# имена полей таблицы
mytable.field_names = ['algoritm'] + values 
# добавление данных по одной строке за раз
for i in range(5):
    mytable.add_row([elem for elem in score[i]])
# вывод таблицы 
print(mytable)

+------------+---------+---------+---------+-----------+
|  algoritm  |  fscore | roc auc |  recall | precision |
+------------+---------+---------+---------+-----------+
| Доля P 10% | 0.96830 | 0.98962 | 0.99343 |  0.94441  |
| Доля P 20% | 0.96496 | 0.98751 | 0.98792 |  0.94304  |
| Доля P 30% | 0.96682 | 0.98821 | 0.98681 |  0.94764  |
| Доля P 40% | 0.96911 | 0.99085 | 0.99068 |  0.94846  |
| Доля P 50% | 0.95547 | 0.98645 | 0.98298 |  0.92946  |
+------------+---------+---------+---------+-----------+
