### Урок 6. Задача look-alike

#### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один с https://archive.ics.uci.edu/ml/datasets.php)
   Выбран набор данных Bank Marketing: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
2. сделать feature engineering
3. обучить любой классификатор (какой вам нравится)
4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
5. применить random negative sampling для построения классификатора в новых условиях
6. сравнить качество с решением из пункта 3 (построить отчет - таблицу метрик)
7. поэкспериментировать с долей P на шаге 4 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import xgboost as xgb

from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

In [2]:
# Load the Bank Marketing data set; the CSV file is semicolon-separated.
df = pd.read_csv('bank-full.csv', sep=';')
# Preview the first 10 rows.
df.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no


In [3]:
# (rows, columns) of the raw frame
df.shape

(45211, 17)

In [4]:
# Class balance of the raw target column
df['y'].value_counts()

no     39922
yes     5289
Name: y, dtype: int64

### сделать feature engineering

In [5]:
# Encode the target as 0/1. `map` + `astype(int)` always produces an integer
# column and fails loudly on an unexpected label, whereas `replace` on an
# object column relies on implicit downcasting that pandas has deprecated
# (an object-typed `y` would then be one-hot encoded by get_dummies below).
df['y'] = df['y'].map({'yes': 1, 'no': 0}).astype(int)
# One-hot encode the remaining categorical (string) columns. The dtype is
# pinned so the indicators are numeric 0/1 regardless of the pandas default
# (bool since pandas 2.0), which keeps `.values` a numeric array later on.
df = pd.get_dummies(df, dtype=np.uint8)

In [6]:
# Inspect the frame after encoding
df.head(5)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


### обучить любой классификатор (какой вам нравится)

In [7]:
def evaluate_results(y_test, y_predict):
    """Print and return F1, ROC AUC, recall and precision for a prediction.

    Each metric is computed from ``y_test`` and ``y_predict`` exactly as they
    are passed in and printed as a percentage with two decimals.

    Args:
        y_test: Reference labels.
        y_predict: Predicted labels to score against ``y_test``.

    Returns:
        Tuple ``(f1, roc, recall, precision)`` with the raw (unscaled) scores.
    """
    print('Classification results:')
    # (print template, metric function, extra keyword arguments), in report order
    metric_specs = (
        ("f1: %.2f%%", f1_score, {}),
        ("roc: %.2f%%", roc_auc_score, {}),
        ("recall: %.2f%%", recall_score, {'average': 'binary'}),
        ("precision: %.2f%%", precision_score, {'average': 'binary'}),
    )
    scores = []
    for template, metric, extra_kwargs in metric_specs:
        # Compute and print one metric at a time so output appears in order
        value = metric(y_test, y_predict, **extra_kwargs)
        print(template % (value * 100.0))
        scores.append(value)

    return tuple(scores)

In [8]:
# Separate the target column from the features without touching `df`
y_df = df['y'].copy()
x_df = df.drop(columns=['y'])

# Collected rows of [method name, f1, roc, recall, precision]
result = list()

In [9]:
# Split the data into train/test (30% held out, fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.3, random_state=42)

In [10]:
%%time
# Baseline: XGBClassifier with default parameters, fitted on the true labels
model = xgb.XGBClassifier()

model.fit(X_train, y_train)
# Predictions for the held-out split
y_predict = model.predict(X_test)

CPU times: user 35.7 s, sys: 1.42 s, total: 37.2 s
Wall time: 2.54 s


In [11]:
# Print the baseline metrics and store them under the name 'only_xgb'
result.append(['only_xgb', *evaluate_results(y_test, y_predict)])

Classification results:
f1: 55.67%
roc: 72.69%
recall: 49.00%
precision: 64.44%


### далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть

### Представим, что нам неизвестны негативы и часть позитивов

In [12]:
mod_data = df.copy()
# Share of the true positives that keep their label; the rest are unlabeled.
# Change this value to experiment with the size of P.
pos_fraction = 0.25
# Positional indices of the positive samples
pos_ind = np.flatnonzero(mod_data['y'].values == 1)
# Shuffle them with a fixed seed so the P/U split is reproducible across runs
# (the train/test split in this notebook is seeded as well).
pos_ind = np.random.default_rng(42).permutation(pos_ind)
# Keep only `pos_fraction` of the positives marked (rounded up)
pos_sample_len = int(np.ceil(pos_fraction * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 1323/5289 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [13]:
# New target: 1 for the labeled positives (P), -1 for everything else (U)
mod_data['class_test'] = -1
# `pos_sample` holds positional row indices, so assign by position; the
# label-based `.loc` is only equivalent while the index is exactly 0..n-1.
mod_data.iloc[pos_sample, mod_data.columns.get_loc('class_test')] = 1
print('target variable:\n', mod_data['class_test'].value_counts())

target variable:
 -1    43888
 1     1323
Name: class_test, dtype: int64


In [14]:
# Inspect the frame with the new `class_test` column appended
mod_data.head(10)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,class_test
0,58,2143,5,261,1,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,-1
1,44,29,5,151,1,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,-1
2,33,2,5,76,1,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,-1
3,47,1506,5,92,1,-1,0,0,0,1,...,0,1,0,0,0,0,0,0,1,-1
4,33,1,5,198,1,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,-1
5,35,231,5,139,1,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,-1
6,28,447,5,217,1,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,-1
7,42,2,5,380,1,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,-1
8,58,121,5,50,1,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,-1
9,43,593,5,55,1,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,-1


Remember that the data frame `mod_data` still includes the former target variable `y`, which we keep here just to compare the results.

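`y` is the original class label (positive / negative); `class_test`, the last column, is the new class label (positive / unlabeled). Note that `y` is not the second-to-last column: `get_dummies` leaves it among the numeric columns and appends the dummy columns after it, so it must be selected by name rather than by position.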

In [15]:
# Select columns by name. After get_dummies, 'y' sits among the numeric
# columns rather than second-to-last, so positional slices (`[:, :-2]`,
# `[:, -2]`) would leak the target into X and return a dummy column as the
# "original class".
x_data = mod_data.drop(columns=['y', 'class_test']).values  # just the X
y_labeled = mod_data['class_test'].values  # new class (just the P & U)
y_positive = mod_data['y'].values  # original class

### 1. random negative sampling

In [16]:
# Shuffle the rows. The seed is fixed (like the train/test split) so that the
# sampled negatives, and therefore the reported metrics, are reproducible.
mod_data = mod_data.sample(frac=1, random_state=42)

data_N = mod_data[mod_data['class_test'] == -1]  # unlabeled rows (U)
data_P = mod_data[mod_data['class_test'] == 1]   # labeled positives (P)

# Random negative sampling: treat as many unlabeled rows as there are labeled
# positives as "negatives"; the remaining unlabeled rows form the test set.
n_pos = data_P.shape[0]
neg_sample = data_N.iloc[:n_pos]
sample_test = data_N.iloc[n_pos:]
pos_sample = data_P.copy()

print(neg_sample.shape, pos_sample.shape)
# Balanced training set, shuffled so the two classes are interleaved
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=42)

(1323, 53) (1323, 53)


In [17]:
%%time
model = xgb.XGBClassifier()
# XGBClassifier is trained on 0/1 labels, while our PU labels are -1/1
unlabeled_mask = sample_train['class_test'] == -1
sample_train.loc[unlabeled_mask, 'class_test'] = 0

# Both label columns must be kept out of the feature matrix
non_feature_cols = ['y', 'class_test']
train_features = sample_train.drop(columns=non_feature_cols).values
train_labels = sample_train['class_test'].values
model.fit(train_features, train_labels)

test_features = sample_test.drop(columns=non_feature_cols).values
y_predict = model.predict(test_features)
# Score against the true labels `y` of the held-out unlabeled rows
result.append(['random negative sampling', *evaluate_results(sample_test['y'].values, y_predict)])

Classification results:
f1: 48.31%
roc: 83.33%
recall: 82.40%
precision: 34.18%
CPU times: user 18.7 s, sys: 1.15 s, total: 19.8 s
Wall time: 1.4 s


In [18]:
# Summary table: one row per experiment collected in `result`
pd.DataFrame(result, columns=['method', 'f1', 'roc', 'recall', 'precision'])

Unnamed: 0,method,f1,roc,recall,precision
0,only_xgb,0.556701,0.726943,0.489987,0.644444
1,random negative sampling,0.483132,0.833291,0.824004,0.341756


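### Precision просела (64% → 34%), зато recall заметно вырос (49% → 82%); F1 в итоге снизился (0.56 → 0.48). Метрики посчитаны на разных тестовых выборках (X_test и остаток U), поэтому сравнение приблизительное.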