In [1442]:
# 1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
# 2. сделать feature engineering
# 3. обучить любой классификатор (какой вам нравится)
# 4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
# 5. применить random negative sampling для построения классификатора в новых условиях
# 6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
# 7. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [1443]:
import pandas as pd
import numpy as np
import re
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix, log_loss
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from datetime import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
import random
import warnings
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
random.seed(42)

In [1444]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML в бизнесе/bank.csv', sep=';')

In [1445]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [1446]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


In [1447]:
X = data.copy().drop(['y'], axis=1)
y = data['y']

In [1448]:
y.value_counts()

no     4000
yes     521
Name: y, dtype: int64

In [1449]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [1450]:
# Функция для разделения категориальных и числовых признаков
def split_features(data):
  cat_feats = []
  num_feats = []

  for i in range(len(data.columns)):
    if type(data[data.columns[i]][0]) == str:
      cat_feats.append(data.columns[i])
    else:
      num_feats.append(data.columns[i])

  return cat_feats, num_feats

In [1451]:
# Функция для формирования списка названий трансформированных признаков
def get_features_transformed(model):

    features_transformed = []

    for i in range(len(model[0].transformer_list)):
        if model[0].transformer_list[i][0] in cat_feats:
            temp = model.named_steps['feats'].transformer_list[i][1].named_steps['ohe'].columns
            [features_transformed.append(i) for i in temp]
        else:
            features_transformed.append(model[0].transformer_list[i][1][0].key)
    return features_transformed

In [1452]:
table_comp = pd.DataFrame({'Model': [],
                           'Threshold': [],
                           'F-Score': [],
                           'Precision': [],
                           'Recall': [],
                           'ROC AUC': [],
                           'TP': [],
                           'FP': [],
                           'TN': [],
                           'FN': []})

In [1453]:
cat_feats, num_feats = split_features(X)

In [1454]:
# Отберем признаки, в которых есть ответы только 'yes' и 'no'
feats_y_n = []
for i in cat_feats:
  if X[i][0] == 'yes' or X[i][0] == 'no':
    feats_y_n.append(i)
feats_y_n

['default', 'housing', 'loan']

In [1455]:
for i in feats_y_n:
  if i in cat_feats:
    cat_feats.remove(i)
cat_feats

['job', 'marital', 'education', 'contact', 'month', 'poutcome']

In [1456]:
# Заменяем значения 'yes' и 'no' на 1 и 0
for y_n_col in feats_y_n:
  X.replace({y_n_col: {'yes': 1, 'no': 0}}, inplace=True)

num_feats = num_feats + feats_y_n

# Заменяем значения 'yes' и 'no' на 1 и 0 в таргете
y.replace({'yes': 1, 'no': 0}, inplace=True)
y.value_counts()

0    4000
1     521
Name: y, dtype: int64

In [1457]:
# Набор трансформеров для признаков
final_transformers = list()

# Делаем dummy признаки из категориальных
for cat_col in cat_feats:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)), 
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))

# Отберем оставшиеся признаки
for col in num_feats:
    non_transformer = Pipeline([
                ('selector', NumberSelector(key=col))
            ])
    final_transformers.append((col, non_transformer))
    

feats_final = FeatureUnion(final_transformers)
feature_processing = Pipeline([('feats', feats_final)])

# Вытащим названия трансформированных признаков
feature_processing.fit(X_train)
feats_trans_names = get_features_transformed(feature_processing)

In [1458]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.3)

In [1459]:
pipeline = Pipeline([
    ('features', feats_final),
    ('classifier', RandomForestClassifier(n_estimators=150, max_depth=None, min_samples_leaf=2, random_state=42)),
])

pipeline = pipeline.fit(X_train, y_train)
preds_rf = pipeline.predict_proba(X_test)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_test, preds_rf)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

cnf_matrix = confusion_matrix(y_test, preds_rf>thresholds[ix])
table_comp.loc[0] = ['RandomForestClassifier', thresholds[ix], fscore[ix], precision[ix], recall[ix], roc_auc_score(y_test, preds_rf), cnf_matrix[1][1], cnf_matrix[0][1], cnf_matrix[0][0], cnf_matrix[1][0]]

Best Threshold=0.248115, F-Score=0.557, Precision=0.507, Recall=0.619


### **Random negative sampling**

In [1460]:
frac = [0.4, 0.45, 0.5, 0.65, 0.7]

for i in range(len(frac)):
  # Сформируем датасет с таргетом
  X_train_with_target = pd.concat([X_train, y_train], axis=1)

  # Отберем только позитивные примеры
  X_train_positive = X_train_with_target[X_train_with_target['y'] == 1]

  # Сформируем выборку из позитивных примеров
  data_positive = X_train_positive.sample(frac=frac[i], replace=True, random_state=42, axis=0)

  # Сформируем выборку unlabeled
  data_unlabeled = X_train_with_target.drop(labels = data_positive.index, axis = 0)

  # Воспользуемся методом Random negative sampling
  data_negative = data_unlabeled.sample(n=data_positive.shape[0], replace=True, random_state=42, axis=0)

  # Сформируем новый датасет
  data_new = pd.concat([data_positive, data_negative])
  X_new = data_new.copy().drop(['y'], axis=1)
  y_new = data_new['y']

  pipeline = Pipeline([
      ('features', feats_final),
      ('classifier', RandomForestClassifier(n_estimators=150, max_depth=None, min_samples_leaf=2, random_state=42)),
  ])

  pipeline_rns = pipeline.fit(X_new, y_new)
  preds_rns = pipeline_rns.predict_proba(X_test)[:, 1]

  precision, recall, thresholds = precision_recall_curve(y_test, preds_rns)
  fscore = (2 * precision * recall) / (precision + recall)
  ix = np.argmax(fscore)

  cnf_matrix = confusion_matrix(y_test, preds_rns>thresholds[ix])
  table_comp.loc[i+1] = [f'Random negative sampling_{int(frac[i]*100)}%', thresholds[ix], fscore[ix], precision[ix], recall[ix], roc_auc_score(y_test, preds_rns), cnf_matrix[1][1], cnf_matrix[0][1], cnf_matrix[0][0], cnf_matrix[1][0]]

In [1461]:
table_comp

Unnamed: 0,Model,Threshold,F-Score,Precision,Recall,ROC AUC,TP,FP,TN,FN
0,RandomForestClassifier,0.248115,0.557214,0.506787,0.618785,0.896393,223.0,218.0,2585.0,139.0
1,Random negative sampling_40%,0.602494,0.426023,0.343486,0.560773,0.820308,202.0,388.0,2415.0,160.0
2,Random negative sampling_45%,0.533456,0.426456,0.331797,0.596685,0.814677,215.0,435.0,2368.0,147.0
3,Random negative sampling_50%,0.558009,0.441341,0.332865,0.654696,0.833055,236.0,475.0,2328.0,126.0
4,Random negative sampling_65%,0.537114,0.462535,0.347705,0.690608,0.851768,249.0,469.0,2334.0,113.0
5,Random negative sampling_70%,0.560012,0.465549,0.351124,0.690608,0.850394,249.0,462.0,2341.0,113.0


**Вывод:** Метод Случайного леса показывает лучшие метрики, чем метод Random negative sampling. При этом если мы увеличиваем выборку позитивных примеров, точность модели повышается, и наоборот, если уменьшаем выборку - точность падает.