## Урок 6. #Задача lookalike (Positive Unlabeled Learning)

### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)


### Ссылка на источник данных - https://www.kaggle.com/ronitf/heart-disease-uci

#### Attribute Information

* age
* sex
* chest pain type (4 values)
* resting blood pressure
* serum cholestoral in mg/dl
* fasting blood sugar > 120 mg/dl
* resting electrocardiographic results (values 0,1,2)
* maximum heart rate achieved
* exercise induced angina
* oldpeak = ST depression induced by exercise relative to rest
* the slope of the peak exercise ST segment
* number of major vessels (0-3) colored by flourosopy
* thal: 3 = normal; 6 = fixed defect; 7 = reversable defect

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

import xgboost as xgb

import warnings
warnings.filterwarnings("ignore")

In [2]:
def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0))

    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0))

    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 

    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0))

    return f1, roc, rec, prc

In [3]:
df = pd.read_csv("data/web6/heart.csv")
df.head(3)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1


In [4]:
print(df.shape)

(303, 14)


## Соотношение классов

In [5]:
df['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [6]:
X = df.drop('target', axis=1)
y = df['target']

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
model = xgb.XGBClassifier(eval_metric='error')
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

f1, roc, rec, prc = evaluate_results(y_test, y_predict)

results = dict()
results['XGB'] = {
    'roc_auc': roc,
    'f1': f1,
    'precision': prc,
    'recall': rec,
}

Classification results:
f1: 81.97%
roc: 82.17%
recall: 78.12%
precision: 86.21%


## PU learning

In [8]:
mod_data = df.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data['target'].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 42/165 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [9]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data['class_test'].value_counts())

target variable:
 -1    261
 1     42
Name: class_test, dtype: int64


In [10]:
mod_data.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,class_test
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,-1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,-1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,-1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,-1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,-1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1,-1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1,-1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1,-1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1,1


In [11]:
x_data = mod_data.drop(['target', 'class_test'], axis=1).values # just the X 
y_labeled = mod_data['class_test'].values # new class (just the P & U)
y_positive = mod_data['target'].values # original class

## Random negative sampling

In [12]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(42, 15) (42, 15)


In [13]:
model = xgb.XGBClassifier(eval_metric='error')
model.fit(
    sample_train.drop(['target', 'class_test'], axis=1).values,
    sample_train['target'].values
)
y_predict = model.predict(sample_test.drop(['target', 'class_test'], axis=1).values)

f1, roc, rec, prc = evaluate_results(sample_test['target'].values, y_predict)
results['XGB_RNS'] = {
    'roc_auc': roc,
    'f1': f1,
    'precision': prc,
    'recall': rec,
}

Classification results:
f1: 79.83%
roc: 79.06%
recall: 89.42%
precision: 72.09%


In [14]:
pd.DataFrame.from_dict(results, orient='index').sort_values(by='roc_auc', ascending=False)

Unnamed: 0,roc_auc,f1,precision,recall
XGB,0.821659,0.819672,0.862069,0.78125
XGB_RNS,0.790594,0.798283,0.72093,0.894231


<b>Бонусный вопрос:</b>

Как вы думаете, какой из методов на практике является более предпочтительным: random negative sampling или 2-step approach?

Ваш ответ здесь:

2-step approach, так как позволяет получить более стабильную выборку, нежели при RNS (здесь на удачу).