Рассмотрим пример на датасете из репозитория UCI

Описание данных (YouTube Spam Collection) - https://archive.ics.uci.edu/ml/datasets/YouTube+Spam+Collection

In [None]:
# Path to the CSV file that the load cell reads.
# NOTE(review): the load cell rebinds `data` to the loaded DataFrame, so this cell
# must be re-run before the load cell can be re-run.
data = "/content/Youtube02-KatyPerry.csv"

In [None]:
import pandas as pd
import numpy as np
# Read the CSV at the path currently held in `data`; from here on `data` is the DataFrame.
data = pd.read_csv(data)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,z12pgdhovmrktzm3i23es5d5junftft3f,lekanaVEVO1,2014-07-22T15:27:50,i love this so much. AND also I Generate Free ...,1
1,z13yx345uxepetggz04ci5rjcxeohzlrtf4,Pyunghee,2014-07-27T01:57:16,http://www.billboard.com/articles/columns/pop-...,1
2,z12lsjvi3wa5x1vwh04cibeaqnzrevxajw00k,Erica Ross,2014-07-27T02:51:43,Hey guys! Please join me in my fight to help a...,1
3,z13jcjuovxbwfr0ge04cev2ipsjdfdurwck,Aviel Haimov,2014-08-01T12:27:48,http://psnboss.com/?ref=2tGgp3pV6L this is the...,1
4,z13qybua2yfydzxzj04cgfpqdt2syfx53ms0k,John Bello,2014-08-01T21:04:03,Hey everyone. Watch this trailer!!!!!!!! http...,1


In [None]:
# (rows, columns) of the loaded frame.
print(data.shape)

(350, 5)


In [None]:
# AUTHOR and DATE are not used by the models below, so remove them from the frame.
columns = ['AUTHOR', 'DATE']
data = data.drop(labels=columns, axis=1)

In [None]:
# Class balance of the target, which is the last column of the frame.
data[data.columns[-1]].value_counts()

1    175
0    175
Name: CLASS, dtype: int64

In [None]:
# Preview the frame again after dropping AUTHOR and DATE.
data.head()

Unnamed: 0,COMMENT_ID,CONTENT,CLASS
0,z12pgdhovmrktzm3i23es5d5junftft3f,i love this so much. AND also I Generate Free ...,1
1,z13yx345uxepetggz04ci5rjcxeohzlrtf4,http://www.billboard.com/articles/columns/pop-...,1
2,z12lsjvi3wa5x1vwh04cibeaqnzrevxajw00k,Hey guys! Please join me in my fight to help a...,1
3,z13jcjuovxbwfr0ge04cev2ipsjdfdurwck,http://psnboss.com/?ref=2tGgp3pV6L this is the...,1
4,z13qybua2yfydzxzj04cgfpqdt2syfx53ms0k,Hey everyone. Watch this trailer!!!!!!!! http...,1


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# CLASS is the target; every other column stays in the feature frame.
y = data['CLASS']
X = data.drop('CLASS', axis=1)

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Stand-alone TF-IDF of the CONTENT column (exploratory only: the pipelines in this
# notebook build their own vectorizer). The result is stored under its own name so the
# feature frame `X` used by the train/test split is not overwritten by a sparse matrix;
# otherwise re-running the split cell would feed the pipeline data it cannot index by
# column name.
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(data['CONTENT'])

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline 
from sklearn.ensemble import GradientBoostingClassifier

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks one column out of its input.

    Parameters
    ----------
    column : the key used to index the input in ``transform`` (``X[column]``).
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``; ``y`` is accepted but ignored."""
        selected = X[self.column]
        return selected

# Baseline model: select the CONTENT column, turn it into TF-IDF features,
# then classify with gradient boosting.
pipeline = Pipeline(steps=[
    ('selector', FeatureSelector(column='CONTENT')),
    ('CONTENT_tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', GradientBoostingClassifier()),
])

In [None]:
# Fit every pipeline step (column selection -> TF-IDF -> gradient boosting) on the training split.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('selector', FeatureSelector(column='CONTENT')),
                ('CONTENT_tfidf', TfidfVectorizer(stop_words='english')),
                ('clf', GradientBoostingClassifier())])

In [None]:
# Predict class labels for the held-out split and show the first ten.
preds = pipeline.predict(X_test)
preds[:10]

array([1, 0, 1, 0, 0, 1, 0, 0, 1, 1])

In [None]:
# Show the full vector of predicted labels for the test split.
print(preds)

[1 0 1 0 0 1 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 1 1 1 1 1 0 1 0 0 1 1 1 0 1 1 0
 1 0 0 0 1 1 1 1 0 0 1 0 1 1 1 0 0 0 1 1 1 0 0 1 1 0 1 0 0 0 1 0 1 0 0 0 1
 0 0 1 1 0 0 0 1 1 0 0 1 0 1 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1]


Проверяем качество

In [None]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    """Print F1, ROC AUC, recall and precision (as percentages) for binary predictions.

    Parameters
    ----------
    y_test : true labels.
    y_predict : predicted values compared against ``y_test``. The ROC AUC is computed
        from these same values, so with hard class predictions it reflects a single
        operating point rather than a ranking over scores.

    Returns
    -------
    None; the metrics are only printed.
    """
    print('Classification results:')
    # Each metric is computed lazily right before its line is printed,
    # in the order f1 -> roc -> recall -> precision.
    report = (
        ("f1: %.2f%%", lambda: f1_score(y_test, y_predict)),
        ("roc: %.2f%%", lambda: roc_auc_score(y_test, y_predict)),
        ("recall: %.2f%%", lambda: recall_score(y_test, y_predict, average='binary')),
        ("precision: %.2f%%", lambda: precision_score(y_test, y_predict, average='binary')),
    )
    for template, compute in report:
        print(template % (compute() * 100.0))

    
# Score the baseline pipeline's predictions against the true test labels.
evaluate_results(y_test, preds)

Classification results:
f1: 94.00%
roc: 94.25%
recall: 90.38%
precision: 97.92%


### Теперь очередь за PU learning

Представим, что нам неизвестны негативы и часть позитивов

In [None]:
# Work on a copy so the original frame keeps its true labels untouched.
mod_data = data.copy()
# Row positions of the true positives (CLASS == 1), selected by column name
# rather than by position.
pos_ind = np.where(mod_data['CLASS'].values == 1)[0]
# Shuffle with a fixed seed so the labeled/unlabeled split is reproducible on Run All.
np.random.default_rng(42).shuffle(pos_ind)
# Keep 50% of the positives labeled (rounded up); the rest will be treated as unlabeled.
pos_sample_len = int(np.ceil(0.5 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
# NOTE: these are row positions; they are later used as index labels, which matches
# only while the frame keeps its default 0..n-1 index.
pos_sample = pos_ind[:pos_sample_len]

Using 88/175 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [None]:
# Every row starts as unlabeled (-1); only the sampled positives are marked 1.
mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', mod_data['class_test'].value_counts())

target variable:
 -1    262
 1     88
Name: class_test, dtype: int64


In [None]:
# Compare the true label (CLASS) with the simulated PU label (class_test) on the first ten rows.
mod_data.head(10)

Unnamed: 0,COMMENT_ID,CONTENT,CLASS,class_test
0,z12pgdhovmrktzm3i23es5d5junftft3f,i love this so much. AND also I Generate Free ...,1,1
1,z13yx345uxepetggz04ci5rjcxeohzlrtf4,http://www.billboard.com/articles/columns/pop-...,1,-1
2,z12lsjvi3wa5x1vwh04cibeaqnzrevxajw00k,Hey guys! Please join me in my fight to help a...,1,1
3,z13jcjuovxbwfr0ge04cev2ipsjdfdurwck,http://psnboss.com/?ref=2tGgp3pV6L this is the...,1,1
4,z13qybua2yfydzxzj04cgfpqdt2syfx53ms0k,Hey everyone. Watch this trailer!!!!!!!! http...,1,1
5,z12rw1o4zvidhdthz04cixxjssq5wzsrlpk0k,check out my rapping hope you guys like it ht...,1,-1
6,z13xizvwrki2hf2ev22txvrp2ovcyf3zq04,"Subscribe pleaaaase to my instagram account , ...",1,-1
7,z12ogvgbmre3eloah04ccjbpsmusxdxbwc0,hey guys!! visit my channel pleaase (i'm searc...,1,-1
8,z125efjyoyaxwhzhz04cgh4oaontcvvdc,Nice! http://www.barnesandnoble.com/s/BDP?csrf...,1,1
9,z12is34ysrzoy3uwl04cctlxmrekjfuhvig,http://www.twitch.tv/daconnormc﻿,1,1


Remember that the data frame (mod_data) still includes the former target variable (CLASS), which we keep only to compare the results.

Column `[:, -2]` is the original class label (positive / negative); column `[:, -1]` is the new label (positive / unlabeled).

In [None]:
x_data = mod_data.iloc[:,:-2].values # features only: every column except the last two (CLASS, class_test)
y_labeled = mod_data.iloc[:,-1].values # PU label (class_test): 1 = labeled positive, -1 = unlabeled
y_positive = mod_data.iloc[:,-2].values # original ground-truth label (CLASS)

### 1. random negative sampling

In [None]:
# Random negative sampling: treat a random subset of the unlabeled rows as negatives.
# Shuffle with a fixed seed so the sampled "negatives" are reproducible on Run All.
mod_data = mod_data.sample(frac=1, random_state=42)
# Split once into unlabeled and labeled-positive rows instead of recomputing the masks.
unlabeled = mod_data[mod_data['class_test'] == -1]
pos_sample = mod_data[mod_data['class_test'] == 1]
n_pos = len(pos_sample)
# As many pseudo-negatives as there are labeled positives; the remaining unlabeled
# rows are held out for evaluation.
neg_sample = unlabeled[:n_pos]
sample_test = unlabeled[n_pos:]
print(neg_sample.shape, pos_sample.shape)
# Mix the two groups so the training rows are not ordered by class.
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=42)

(88, 4) (88, 4)


In [None]:
# Same architecture as the baseline (CONTENT -> TF-IDF -> gradient boosting),
# built as a fresh, unfitted pipeline for the PU experiment.
pipeline_PU = Pipeline(steps=[
    ('selector', FeatureSelector(column='CONTENT')),
    ('CONTENT_tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', GradientBoostingClassifier()),
])

In [None]:
# Strip both label columns so only the feature columns reach the pipeline.
cols = ['class_test', 'CLASS']
x = sample_train.drop(labels=cols, axis=1)
x_test = sample_test.drop(labels=cols, axis=1)

In [None]:
# Train on the PU label, not on the hidden ground truth: iloc[:, -2] is CLASS (the true
# label), and fitting on it would leak the answers the experiment pretends not to know.
# Labeled positives (class_test == 1) form the positive class and the sampled unlabeled
# rows (class_test == -1) are treated as negatives. The target is encoded as 1/0 so the
# predictions stay comparable with CLASS during evaluation.
pipeline_PU.fit(x, (sample_train['class_test'] == 1).astype(int).values)

Pipeline(steps=[('selector', FeatureSelector(column='CONTENT')),
                ('CONTENT_tfidf', TfidfVectorizer(stop_words='english')),
                ('clf', GradientBoostingClassifier())])

In [None]:
# Predict for the held-out unlabeled rows and score against their true CLASS labels.
y_predict = pipeline_PU.predict(x_test)
evaluate_results(sample_test['CLASS'].values, y_predict)

Classification results:
f1: 88.89%
roc: 92.39%
recall: 90.57%
precision: 87.27%


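По метрикам модель, обученная в PU-постановке (random negative sampling), отработала чуть хуже, чем градиентный бустинг, обученный на полной разметке.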