### LookAlike

1. набор данных для бинарной классификации
2. проведение feature engineering
3. обучение классификатора
4. разделение набора данных на два множества: P (positives) и U (unlabeled). В выборке P будут не все положительные примеры, а их часть
5. random negative sampling для построения классификатора в новых условиях и сравнение с классификацией из пункта 3
6. нахождение лучшего качества модели при уменьшении/увеличении размера Positive-выборки

In [None]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_recall_curve

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier




### Загрузка данных

Датасет содержит анонимные данные о 395 студентах двух средних школ в Португалии.

Таргет: по умолчанию в соревновании на kaggle это успеваемость по математике или португальскому (G1-G3).

У меня: *paid*, дополнительные курсы по предмету (в моем случае только по математике).

In [None]:
data = pd.read_csv('/content/Maths.csv')

In [None]:
info = pd.read_excel('/content/info.xlsx')
info
# G3 = final math grade (numeric: from 0 to 20, output target)

Unnamed: 0,Columns,Description
0,school,student's school (binary: 'GP' - Gabriel Perei...
1,sex,student's sex (binary: 'F' - female or 'M' - m...
2,age,student's age (numeric: from 15 to 22)
3,address,student's home address type (binary: 'U' - urb...
4,famsize,family size (binary: 'LE3' - less or equal to ...
5,Pstatus,parent's cohabitation status (binary: 'T' - li...
6,Medu,"mother's education (numeric: 0 - none, 1 - pri..."
7,Fedu,"father's education (numeric: 0 - none, 1 - pri..."
8,Mjob,"mother's job (nominal: 'teacher', 'health' car..."
9,Fjob,"father's job (nominal: 'teacher', 'health' car..."


In [None]:
target = 'paid'
#target = 'romantic'   # ===> fscore не больше 0.49

data[target] = data[target].replace({'yes': 1, 'no': 0})

data[target].value_counts(normalize=True)

0    0.541772
1    0.458228
Name: paid, dtype: float64

In [None]:
data.drop(target, axis=1).corrwith(data[target])*100

age           -3.593287
Medu          15.970038
Fedu           8.698142
traveltime    -6.642024
studytime     16.721988
failures     -18.803897
famrel         0.045965
freetime      -6.425287
goout          1.049327
Dalc           6.246536
Walc           6.045364
health        -7.813240
absences       0.743517
G1             3.907932
G2            10.519841
G3            10.199624
dtype: float64

In [None]:
cat_features = data.select_dtypes('object').columns.to_list()

num_features = data.drop(target, axis=1).select_dtypes(['int64', 'float64']).columns.to_list()

print(cat_features)

['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'activities', 'nursery', 'higher', 'internet', 'romantic']


### feature engineering

In [None]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in test_columns:
            if col_ not in self.columns:
                X[col_] = 0
        return X[self.columns]


In [None]:
cat_feats = Pipeline(steps=[
                ('selector', ColumnSelector(key=cat_features)),
                ('ohe', OHEEncoder(key=cat_features))
            ])

In [None]:
num_feats = Pipeline(steps=[
                ('selector', ColumnSelector(key=num_features)),
                ('standard', StandardScaler())
            ])

In [None]:
feats = FeatureUnion([('numeric_features', num_feats),
                      ('cat_features', cat_feats)
                      ])

### Обучение классификатора

In [None]:
y = data[target]
X = data.drop(target, axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=197)

In [None]:
pipeline = Pipeline([
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state = 197)),
])

In [None]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('numeric_features',
                                                 Pipeline(steps=[('selector',
                                                                  ColumnSelector(key=['age',
                                                                                      'Medu',
                                                                                      'Fedu',
                                                                                      'traveltime',
                                                                                      'studytime',
                                                                                      'failures',
                                                                                      'famrel',
                                                                                      'freetime',
                                                    

In [None]:
# прогноз
pipe_preds = pipeline.predict_proba(X_test)[:, 1]

In [None]:
pipe_preds[:10]

array([0.78, 0.53, 0.23, 0.57, 0.29, 0.1 , 0.28, 0.53, 0.67, 0.65])

In [None]:
def show_metrics(y_test, preds):

  precision, recall, thresholds = precision_recall_curve(y_test, preds)
  fscore = (2*precision * recall) / (precision + recall)
  ix = np.argmax(fscore)
    
  print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], fscore[ix],
                                                                          precision[ix], recall[ix]))
  

print(show_metrics(y_test, pipe_preds))

Best Threshold=0.420000, F-Score=0.767, Precision=0.708, Recall=0.836
None


Посмотрим, как PU-Learning повлияет на результаты.

### Random Negative Sampling

In [None]:
data[target].value_counts(normalize=True)

0    0.541772
1    0.458228
Name: paid, dtype: float64

In [None]:
def RNS(data, POS_FRAC=0.5):

  print(f'When using {POS_FRAC*100}% of positive samples')

  mod_data = data.copy()

  # индексы положительных объектов
  pos_ind = np.where(mod_data.loc[:, target].values == 1)[0]

  np.random.shuffle(pos_ind)

  pos_sample_len = int(np.ceil(POS_FRAC * len(pos_ind)))

  #print(f'Using only {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
  pos_sample = pos_ind[:pos_sample_len]

  # теперь разделение mod_data

  mod_data['testing_target'] = -1  # U (unlabeled)

  mod_data.loc[pos_sample, 'testing_target'] = 1 # P (positive)

  #print('target variable:\n', mod_data.loc[:, 'testing_target'].value_counts())

  x_data = mod_data.drop([target, 'testing_target'], axis=1).values

  y_labeled = mod_data.loc[:, 'testing_target'].values

  y_positive = mod_data.loc[:, target].values

  positive_samples = len(mod_data[mod_data['testing_target']==1])
  #print(f'Positive samples: {positive_samples}')

  mod_data = mod_data.sample(frac=1)

  # размер выборок должен совпадать
  neg_sample = mod_data[mod_data['testing_target']==-1][:positive_samples]
  sample_test = mod_data[mod_data['testing_target']==-1][positive_samples:]
  pos_sample = mod_data[mod_data['testing_target']==1]

  #print(neg_sample.shape, pos_sample.shape)

  sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

  pipeline.fit(sample_train.drop([target, 'testing_target'], axis=1), sample_train.loc[:, target])

  modified_preds = pipeline.predict(sample_test.drop([target, 'testing_target'], axis=1))

  show_metrics(sample_test.loc[:, target].values, modified_preds)

In [None]:
for frac in np.linspace(0.1,0.9,9):
  RNS(data, POS_FRAC=round(frac, 1))
  print('\n')

When using 10.0% of positive samples
Best Threshold=1.000000, F-Score=0.626, Precision=0.488, Recall=0.873


When using 20.0% of positive samples
Best Threshold=0.000000, F-Score=0.564, Precision=0.393, Recall=1.000


When using 30.0% of positive samples
Best Threshold=1.000000, F-Score=0.559, Precision=0.428, Recall=0.806


When using 40.0% of positive samples
Best Threshold=1.000000, F-Score=0.576, Precision=0.419, Recall=0.920


When using 50.0% of positive samples
Best Threshold=1.000000, F-Score=0.557, Precision=0.427, Recall=0.800


When using 60.0% of positive samples
Best Threshold=1.000000, F-Score=0.545, Precision=0.408, Recall=0.824


When using 70.0% of positive samples
Best Threshold=1.000000, F-Score=0.444, Precision=0.295, Recall=0.897


When using 80.0% of positive samples
Best Threshold=1.000000, F-Score=0.423, Precision=0.278, Recall=0.882


When using 90.0% of positive samples
Best Threshold=1.000000, F-Score=0.312, Precision=0.200, Recall=0.714




In [None]:
RNS(data, POS_FRAC=0.05)

When using 5.0% of positive samples
Best Threshold=1.000000, F-Score=0.661, Precision=0.505, Recall=0.958


Как видно, если поместить часть положительных объектов в выборку Unlabeled, качество теряется. Точность модели ниже, а recall остается высокой. Это был только пример.

На деле, PU-Learning пригождается в кейсах, где большая часть объектов неразмеченна, но в этой большей выборке могут быть потенциально объекты таргетного класса (к примеру, в конверсии на сайте, где клиент, осуществляющий действие, это таргет).