### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [76]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Load the churn dataset from the CSV one directory up and preview its first three rows.
data = pd.read_csv("../churn_data.csv")
data.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


Посмотрим на соотношение классов

In [77]:
data['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

Разбиваем выборку на тренировочную и тестовую части и обучаем модель (в примере - градиентный бустинг)

In [78]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; 'Exited' is the target and every other
# column stays in X. random_state pins the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (7963 vs 2037) - consider passing stratify=data['Exited'].
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['Exited']), data['Exited'], test_size=0.2, random_state=7)

In [79]:
X_train.shape

(8000, 13)

К полям:
- Geography, Gender, Tenure, HasCrCard, IsActiveMember применим OHE-кодирование
- остальные - оставим пока, как есть

In [80]:
from sklearn.pipeline import FeatureUnion

In [81]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that extracts one column by its key.

    The result of ``X[key]`` is handed on unchanged, so for a pandas
    DataFrame input the next pipeline step receives a one-dimensional Series.
    """
    def __init__(self, key):
        # Label of the column that transform() pulls out.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return whatever is stored in X under ``self.key``."""
        selected = X[self.key]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that extracts one column while keeping it two-dimensional.

    Indexing with a one-element list (``X[[key]]``) means a pandas DataFrame
    input yields a single-column DataFrame rather than a Series.
    Intended for the numeric columns of the data.
    """
    def __init__(self, key):
        # Label of the column that transform() pulls out.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the single-column selection ``X[[self.key]]``."""
        wanted = [self.key]
        return X[wanted]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encode a single column with a column layout frozen at fit time.

    ``fit`` records the dummy columns produced by ``pd.get_dummies`` for the
    training data. ``transform`` always returns exactly those columns, in the
    same order, whatever categories the new data contains.
    """
    def __init__(self, key):
        # Prefix for the generated dummy columns (the source column's name).
        self.key = key
        # Dummy-column names learned in fit(); empty until fit() is called.
        self.columns = []

    def fit(self, X, y=None):
        """Remember the dummy columns generated for the training data."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """
        Encode X and align the result to the columns seen during fit.

        Categories seen in fit but absent from X become all-zero columns
        (previously this case raised KeyError, because the fill loop ran over
        the new data's columns instead of the fitted ones). Categories unseen
        during fit are dropped.
        """
        encoded = pd.get_dummies(X, prefix=self.key)
        return encoded.reindex(columns=self.columns, fill_value=0)


from sklearn.preprocessing import StandardScaler

# Feature lists: columns that get one-hot encoded vs. columns passed through as is
category_feat = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuos_feat = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

# One (name, pipeline) pair per feature: categorical features are selected and
# one-hot encoded, continuous features are only selected.
categorical_steps = [
    (name, Pipeline([
        ('selector', ColumnSelector(key=name)),
        ('ohe', OHEEncoder(key=name)),
    ]))
    for name in category_feat
]
numeric_steps = [
    (name, Pipeline([('selector', NumberSelector(key=name))]))
    for name in continuos_feat
]

# Categorical pairs first, then continuous ones
transformers = categorical_steps + numeric_steps

In [82]:
# Combine the per-feature pipelines into a single feature-building step
feats = FeatureUnion(transformers)

In [83]:
import xgboost as xgb

# Baseline model: feature extraction followed by an XGBoost classifier (seeded for reproducibility)

pipeline = Pipeline([
    ('features', feats),
    ('clf', xgb.XGBClassifier(random_state=7))
])

In [84]:
# Fit the pipeline on the training split
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  ColumnSelector(key='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  ColumnSelector(key='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
                   

In [85]:
# Predictions for the test set: keep the second column of predict_proba
# (presumably the probability of class 1 - verify against the classifier's classes_)
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.7795174 , 0.0100481 , 0.01208574, 0.7892142 , 0.26004347,
       0.00257743, 0.05695235, 0.00336045, 0.00999833, 0.9720158 ],
      dtype=float32)

Проверяем качество

In [86]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

precision, recall, thresholds = precision_recall_curve(y_test, preds)

# F1 at every point of the precision-recall curve. Where precision + recall == 0
# the ratio is 0/0; define F1 as 0 there, otherwise the resulting NaN would be
# picked by np.argmax (it returns the position of a NaN when one is present)
# and a meaningless threshold would be reported.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.3238639831542969, F-Score=0.630, Precision=0.635, Recall=0.625


In [87]:
# Threshold-independent quality check: ROC AUC of the predicted scores against the test labels
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.8548659503674134

In [88]:
# Gather the metrics at the best-F1 threshold into a one-row summary table
metric_names = ['thresh', 'F-Score', 'Precision', 'Recall', 'ROC AUC']
metric_values = [thresholds[ix], fscore[ix], precision[ix], recall[ix], roc_auc]
d = dict(zip(metric_names, metric_values))

pd.DataFrame(data=d, index=[1], columns=metric_names)

Unnamed: 0,thresh,F-Score,Precision,Recall,ROC AUC
1,0.323864,0.629902,0.634568,0.625304,0.854866


### Теперь очередь за PU learning

Представим, что нам неизвестны негативы и часть позитивов

In [89]:
# Build a separate frame from the training features with the true target
# attached as 'Exited'; X_train itself is left untouched.
mod_data = X_train.assign(Exited=y_train)
mod_data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
4989,4990,15602851,Ozioma,629,France,Male,40,9,0.0,1,1,0,106.67,0
4498,4499,15777759,Boucaut,570,France,Male,30,2,131406.56,1,1,1,47952.45,0
8876,8877,15712807,Robertson,556,Spain,Male,46,3,131764.96,1,1,1,108500.66,1
670,671,15576368,Bledsoe,624,Germany,Female,48,3,122388.38,2,0,0,30020.09,0
9552,9553,15772009,Scott,664,France,Female,41,5,0.0,1,1,1,152054.33,0


In [90]:
# get the indices of the positives samples
pos_ind = mod_data[mod_data['Exited'] == 1].sample(frac=1, random_state=42).index

# leave just 25% of the positives marked
perc = 0.25
pos_sample_len = int(np.ceil(perc * len(pos_ind)))

print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 407/1626 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [91]:
# Every row starts as unlabeled (-1); only the sampled positives are marked 1
mod_data['class_test'] = -1
labeled_mask = mod_data.index.isin(pos_sample)
mod_data.loc[labeled_mask, 'class_test'] = 1
# 'class_test' was just added, so it is the last column of the frame
print('target variable:\n', mod_data['class_test'].value_counts())

target variable:
 -1    7593
 1     407
Name: class_test, dtype: int64


* We now have just 407 positive samples labeled as 1 in the 'class_test' col while the rest is unlabeled as -1.

* Recall that the 'Exited' column still holds the actual label

In [92]:
mod_data.head(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,class_test
4989,4990,15602851,Ozioma,629,France,Male,40,9,0.0,1,1,0,106.67,0,-1
4498,4499,15777759,Boucaut,570,France,Male,30,2,131406.56,1,1,1,47952.45,0,-1
8876,8877,15712807,Robertson,556,Spain,Male,46,3,131764.96,1,1,1,108500.66,1,-1
670,671,15576368,Bledsoe,624,Germany,Female,48,3,122388.38,2,0,0,30020.09,0,-1
9552,9553,15772009,Scott,664,France,Female,41,5,0.0,1,1,1,152054.33,0,-1
1757,1758,15704763,Kozlova,523,Germany,Female,39,1,143903.11,1,1,1,118711.75,1,-1
1781,1782,15771636,Marshall,793,Spain,Female,36,0,0.0,1,0,0,148993.47,0,-1
5270,5271,15607230,Michel,588,Germany,Male,33,9,150186.22,2,1,1,65611.01,0,-1
8130,8131,15588928,Maslow,704,France,Male,47,5,0.0,2,1,1,145338.61,0,-1
4426,4427,15749557,Chao,707,France,Female,44,6,0.0,2,1,1,192542.17,0,-1


Remember that this data frame (mod_data) includes the former target variable ('Exited'), which we keep here just to compare the results

`iloc[:, -2]` ('Exited') is the original class label for positive and negative data; `iloc[:, -1]` ('class_test') is the new class for positive and unlabeled data

### 1. random negative sampling

In [93]:
# Shuffle the rows, then split them into unlabeled (-1) and labeled-positive (1) parts
mod_data = mod_data.sample(frac=1, random_state=42)
unlabeled_mask = mod_data['class_test'] == -1
positive_mask = mod_data['class_test'] == 1
data_N = mod_data[unlabeled_mask]
data_P = mod_data[positive_mask]

# Random negative sampling: take as many of the shuffled unlabeled rows as
# there are labeled positives; they will play the role of negatives.
n_positive = data_P.shape[0]
neg_sample = data_N.iloc[:n_positive]
pos_sample = data_P.copy()
print(neg_sample.shape, pos_sample.shape)

# Balanced training set, shuffled once more so the two groups are interleaved
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=42)

(407, 15) (407, 15)


In [94]:
sample_train

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,class_test
6830,6831,15669262,Maslov,765,France,Male,43,9,157960.49,2,0,0,136602.80,0,-1
9044,9045,15653347,Chiu,560,Spain,Male,47,1,0.00,1,0,0,128882.66,1,1
2013,2014,15742238,Dellucci,705,Germany,Male,35,4,136496.12,2,1,0,116672.02,0,-1
9375,9376,15641389,Shen,659,Germany,Male,48,4,123593.22,2,1,0,82469.06,1,-1
7116,7117,15788776,Landor,588,Germany,Male,49,6,132623.76,3,1,0,36292.94,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6399,6400,15738501,Booth,601,Germany,Male,48,9,163630.76,1,0,1,41816.49,1,-1
5064,5065,15671243,Y?,558,France,Female,47,9,0.00,2,1,0,103787.28,0,-1
704,705,15808621,Mordvinova,659,Germany,Male,36,2,76190.48,2,1,1,149066.14,0,-1
8095,8096,15584620,Su,850,Germany,Female,36,6,143644.16,1,1,0,22102.25,1,1


In [95]:
# Recode the unlabeled marker (-1) as class 0 so the target is a regular 0/1 label
sample_train['class_test'] = sample_train['class_test'].replace(-1, 0)

# Same architecture as the first model: feature union + seeded XGBoost classifier.
# NOTE(review): `feats` is the same FeatureUnion object that the earlier pipeline
# was built with; presumably it is refit in place here - confirm this is intended.
pipeline = Pipeline(steps=[
    ('features', feats),
    ('clf', xgb.XGBClassifier(random_state=7)),
])

# Train on the sampled rows only; both label columns are removed from the features
pu_features = sample_train.drop(columns=['class_test', 'Exited'])
pu_target = sample_train['class_test']
pipeline.fit(pu_features, pu_target)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  ColumnSelector(key='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  ColumnSelector(key='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
                   

In [96]:
# наши прогнозы для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.8953136 , 0.756077  , 0.19625962, 0.67003256, 0.3904954 ,
       0.00332656, 0.00158593, 0.04592693, 0.00721614, 0.99585354],
      dtype=float32)

In [97]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# F1 at every point of the precision-recall curve. Where precision + recall == 0
# the ratio is 0/0; define F1 as 0 there, otherwise the resulting NaN would be
# picked by np.argmax (it returns the position of a NaN when one is present)
# and a meaningless threshold would be reported.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.6265799403190613, F-Score=0.532, Precision=0.457, Recall=0.635


In [98]:
# Threshold-independent quality check: ROC AUC of the predicted scores against the test labels
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.7779242633739563

In [99]:
# Gather the metrics at the best-F1 threshold into a one-row summary table
metric_names = ['thresh', 'F-Score', 'Precision', 'Recall', 'ROC AUC']
metric_values = [thresholds[ix], fscore[ix], precision[ix], recall[ix], roc_auc]
d = dict(zip(metric_names, metric_values))

pd.DataFrame(data=d, index=[1], columns=metric_names)

Unnamed: 0,thresh,F-Score,Precision,Recall,ROC AUC
1,0.62658,0.531568,0.457093,0.635036,0.777924
