1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
2. сделать feature engineering
3. обучить любой классификатор (какой вам нравится)
4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
5. применить random negative sampling для построения классификатора в новых условиях
6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
7. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

---

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.style.use('ggplot')
plt.rcParams['font.family'] = 'Times New Roman'

In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score

In [4]:
from xgboost import XGBClassifier

----

## **1987 National Indonesia Contraceptive Prevalence Survey**

data: https://archive.ics.uci.edu/ml/datasets/Contraceptive+Method+Choice

### Attribute Information:

N|name|type|decription|
:-|:------------------------------|:---------------------:|:---------------------|
1| Wife's age|                     (numerical)|-|
   2| Wife's education|               (categorical) |     1=low, 2, 3, 4=high|
   3| Husband's education|            (categorical) |     1=low, 2, 3, 4=high|
   4| Number of children ever born|   (numerical)|-|
   5| Wife's religion|                (binary)|           0=Non-Islam, 1=Islam|
   6| Wife's now working?|            (binary)|           0=Yes, 1=No|
   7| Husband's occupation|           (categorical)|      1, 2, 3, 4|
   8| Standard-of-living index|       (categorical)|      1=low, 2, 3, 4=high|
   9| Media exposure|                 (binary)|           0=Good, 1=Not good|
   10| Contraceptive method used|     (class attribute)|  1=No-use, 2=Long-term, 3=Short-term|

In [5]:
data_df = pd.read_csv('data/data.csv', sep=';')
data_df.head(3)

Unnamed: 0,age_wife,education_wife,education_husband,children_num,religion_wife,is_wife_working,occupation_husband,living _index,media_exposure,contraceptive_method
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1


In [6]:
data_df.describe(percentiles=[.5])

Unnamed: 0,age_wife,education_wife,education_husband,children_num,religion_wife,is_wife_working,occupation_husband,living _index,media_exposure,contraceptive_method
count,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0
mean,32.54,2.96,3.43,3.26,0.85,0.75,2.14,3.13,0.07,1.92
std,8.23,1.01,0.82,2.36,0.36,0.43,0.86,0.98,0.26,0.88
min,16.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
50%,32.0,3.0,4.0,3.0,1.0,1.0,2.0,3.0,0.0,2.0
max,49.0,4.0,4.0,16.0,1.0,1.0,4.0,4.0,1.0,3.0


In [7]:
# выбросов, пропусков нет, признаков мало - с фичами особо не разойдешься
class FeatureEng:

    def fit_transform(self, data):

        data['sum_education'] = data['education_wife'] + data['education_wife']
        data['age_children'] = data['children_num'] / data['age_wife']
        data['pos'] = data['is_wife_working'] + data['media_exposure']
        data['social'] = data['sum_education'] + data['living _index']
        data['edu_job'] = data['sum_education'] + data['occupation_husband']
        # можно было бы закодировать категориальные признаки через target, но для catboost это не "must have"
        return data

In [8]:
feat = FeatureEng()
df = feat.fit_transform(data_df)
df.head(3)

Unnamed: 0,age_wife,education_wife,education_husband,children_num,religion_wife,is_wife_working,occupation_husband,living _index,media_exposure,contraceptive_method,sum_education,age_children,pos,social,edu_job
0,24,2,3,3,1,1,2,3,0,1,4,0.12,1,7,6
1,45,1,3,10,1,1,3,4,0,1,2,0.22,1,6,5
2,43,2,3,7,1,1,3,4,0,1,4,0.16,1,8,7


In [9]:
df['contraceptive_method'].value_counts()

1    629
3    511
2    333
Name: contraceptive_method, dtype: int64

Сведем задачу к **двухклассовой классификации**. В качестве положительного класса выбрем класс 1 . Остальных отнесем к классу 0

In [10]:
df['target'] = 0
df.loc[df['contraceptive_method'] == 1, 'target'] = 1
df.drop(columns='contraceptive_method', inplace=True)

In [11]:
cat_features = ['education_wife', 'education_husband',
                'occupation_husband', 'living _index']
df[cat_features] = df[cat_features].astype('category')
df = pd.get_dummies(df)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'), df['target'], test_size=0.2, random_state=13)

In [13]:
model_xgb = XGBClassifier(random_state=13)
model_xgb.fit(X_train, y_train)
y_proba = model_xgb.predict_proba(X_test)[:, 1]
y_pred = model_xgb.predict(X_test)



In [14]:
result = {'metrics': ['roc_auc', 'f1score', 'precision', 'recall']}


def metrics(y_test, y_pred, y_proba):
    return [roc_auc_score(y_test, y_proba),
            f1_score(y_test, y_pred),
            precision_score(y_test, y_pred),
            recall_score(y_test, y_pred)]


result['XGBClassifier_base'] = metrics(y_test, y_pred, y_proba)

----

## PU-learning. (random negative sampling)

In [15]:
for k in [10, 20, 30, 40, 50]:

    pos_ind = df.loc[df.target == 1].index
    n_samples = round(k*len(pos_ind)/100)
    pos_samples = np.random.choice(pos_ind, size=n_samples, replace=False)
    data_rs = df.copy()
    data_rs['PU_labels'] = 0
    data_rs.loc[pos_samples, 'PU_labels'] = 1
    neg_ind = data_rs.loc[data_rs['PU_labels'] == 0].index
    neg_samples = np.random.choice(neg_ind, size=n_samples, replace=False)
    rs_train = pd.concat([data_rs.loc[pos_samples], data_rs.loc[neg_samples]])
    rs_test = data_rs.drop(index=(np.append(pos_samples, neg_samples)))
    # ========================================================================

    model_xgb_rs = XGBClassifier(random_state=13)
    model_xgb_rs.fit(rs_train.drop(
        columns=['target', 'PU_labels']), rs_train['PU_labels'])
    y_proba = model_xgb_rs.predict_proba(
        rs_test.drop(columns=['target', 'PU_labels']))[:, 1]
    y_pred = model_xgb_rs.predict(
        rs_test.drop(columns=['target', 'PU_labels']))
    
    result['XGBClassifier_RS_' +
           str(k)] = metrics(rs_test['target'], y_pred, y_proba)



In [16]:
pd.DataFrame(result).set_index('metrics')

Unnamed: 0_level_0,XGBClassifier_base,XGBClassifier_RS_10,XGBClassifier_RS_20,XGBClassifier_RS_30,XGBClassifier_RS_40,XGBClassifier_RS_50
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
roc_auc,0.74,0.63,0.65,0.68,0.62,0.7
f1score,0.57,0.52,0.53,0.53,0.45,0.5
precision,0.61,0.53,0.5,0.46,0.38,0.39
recall,0.54,0.51,0.58,0.62,0.55,0.67
