## Урок 6. Задача look-alike
### 1. взять любой набор данных для бинарной классификации

Описание данных - https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+<br/>
Для обучения и тестирования представляются три набора данных. Загружу и просмотрю все три набора:

In [1]:
import pandas as pd
import numpy as np
data_1 = pd.read_csv("datatest.txt")
data_2 = pd.read_csv("datatest2.txt")
data_3 = pd.read_csv("datatraining.txt")
#data = pd.read_csv("data_banknote_authentication.txt", header=None)
data_1.head(3)

Unnamed: 0,num,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,140,2015-02-02 14:19:00,23.7,26.272,585.2,749.2,0.004764,1
1,141,2015-02-02 14:19:59,23.718,26.29,578.4,760.4,0.004773,1
2,142,2015-02-02 14:21:00,23.73,26.23,572.666667,769.666667,0.004765,1


In [2]:
data_2.head(3)

Unnamed: 0,num,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,1,2015-02-11 14:48:00,21.76,31.133333,437.333333,1029.666667,0.005021,1
1,2,2015-02-11 14:49:00,21.79,31.0,437.333333,1000.0,0.005009,1
2,3,2015-02-11 14:50:00,21.7675,31.1225,434.0,1003.75,0.005022,1


In [3]:
data_3.head(3)

Unnamed: 0,num,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,1,2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
1,2,2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
2,3,2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1


In [4]:
data_1.count(), data_2.count(), data_3.count()

(num              2665
 date             2665
 Temperature      2665
 Humidity         2665
 Light            2665
 CO2              2665
 HumidityRatio    2665
 Occupancy        2665
 dtype: int64,
 num              9752
 date             9752
 Temperature      9752
 Humidity         9752
 Light            9752
 CO2              9752
 HumidityRatio    9752
 Occupancy        9752
 dtype: int64,
 num              8143
 date             8143
 Temperature      8143
 Humidity         8143
 Light            8143
 CO2              8143
 HumidityRatio    8143
 Occupancy        8143
 dtype: int64)

### 2. сделать feature engineering

Объединю все три набора и удалю первые три поля: "num", "date" и "Temperature":

In [5]:
data = pd.concat([data_1, data_2, data_3], ignore_index=True)
data.drop(['num', 'date', 'Temperature'], inplace=True, axis = 1)
data.head(3)

Unnamed: 0,Humidity,Light,CO2,HumidityRatio,Occupancy
0,26.272,585.2,749.2,0.004764,1
1,26.29,578.4,760.4,0.004773,1
2,26.23,572.666667,769.666667,0.004765,1


У нас есть 4 признака и 1 целевая переменная (бинарная) - нужно определить занято офисное помещение или нет.

In [6]:
print(f'Всего {len(data.index)} наблюдений')

Всего 20560 наблюдений


Посмотрим на соотношение классов

In [7]:
data.iloc[:, -1].value_counts()

0    15810
1     4750
Name: Occupancy, dtype: int64

### 3. обучить любой классификатор (какой вам нравится)

Разбиваем выборку на тренировочную и тестовую части и обучаем модель (RandomForest)

In [8]:
from sklearn.model_selection import train_test_split

x_data = data.iloc[:,:-1]
y_data = data.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=7)

In [9]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=7)

model.fit(x_train, y_train)
y_predict = model.predict(x_test)

Проверяем качество

In [10]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0))

    
evaluate_results(y_test, y_predict)

evaluate_metrics = {'metrics' : ['f1', 'roc', 'recall', 'precision'],
                    'RandomForestClassifier' : [f1_score(y_test, y_predict), roc_auc_score(y_test, y_predict),
                                                recall_score(y_test, y_predict, average='binary'),
                                                precision_score(y_test, y_predict, average='binary')]}
RandomForest_metrics = pd.DataFrame(evaluate_metrics)

Classification results:
f1: 98.31%
roc: 99.22%
recall: 99.22%
precision: 97.41%


In [11]:
first = RandomForest_metrics.set_index('metrics')
first

Unnamed: 0_level_0,RandomForestClassifier
metrics,Unnamed: 1_level_1
f1,0.983075
roc,0.992201
recall,0.992171
precision,0.974144


### 4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть

Представим, что нам неизвестны негативы и часть позитивов

In [12]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 1188/4750 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [13]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    19372
 1     1188
Name: class_test, dtype: int64


* We now have just 153 positive samples labeled as 1 in the 'class_test' col while the rest is unlabeled as -1.

* Recall that col 4 still holds the actual label

In [14]:
mod_data.head(10)

Unnamed: 0,Humidity,Light,CO2,HumidityRatio,Occupancy,class_test
0,26.272,585.2,749.2,0.004764,1,-1
1,26.29,578.4,760.4,0.004773,1,1
2,26.23,572.666667,769.666667,0.004765,1,-1
3,26.125,493.75,774.75,0.004744,1,-1
4,26.2,488.6,779.0,0.004767,1,-1
5,26.26,568.666667,790.0,0.004779,1,-1
6,26.29,536.333333,798.0,0.004776,1,-1
7,26.29,509.0,797.0,0.004783,1,-1
8,26.35,476.0,803.2,0.004794,1,-1
9,26.39,510.0,809.0,0.004796,1,-1


Remember that this data frame (x_data) includes the former target variable that we keep here just to compare the results

[:-2] is the original class label for positive and negative data [:-1] is the new class for positive and unlabeled data

In [15]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

### 5. применить random negative sampling для построения классификатора в новых условиях

In [16]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(1188, 6) (1188, 6)


In [17]:
model = RandomForestClassifier(random_state=7)

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict)

evaluate_metrics = {'metrics' : ['f1', 'roc', 'recall', 'precision'],
                    'RandomForestClassifier RN sampling' : [f1_score(sample_test.iloc[:,-2].values, y_predict),
                                                roc_auc_score(sample_test.iloc[:,-2].values, y_predict),
                                                recall_score(sample_test.iloc[:,-2].values, y_predict, average='binary'),
                                                precision_score(sample_test.iloc[:,-2].values, y_predict, average='binary')]}
RandomForest_RNS_metrics = pd.DataFrame(evaluate_metrics)

Classification results:
f1: 97.34%
roc: 99.13%
recall: 99.31%
precision: 95.46%


In [18]:
second = RandomForest_RNS_metrics.set_index('metrics')
second

Unnamed: 0_level_0,RandomForestClassifier RN sampling
metrics,Unnamed: 1_level_1
f1,0.973436
roc,0.991256
recall,0.993074
precision,0.954559


### 6. Cравнить качество с решением из пункта 3 (построить отчет - таблицу метрик)

In [19]:
table = round(pd.concat([first,second], axis=1)*100, 2).astype(str) + '%'
table

Unnamed: 0_level_0,RandomForestClassifier,RandomForestClassifier RN sampling
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
f1,98.31%,97.34%
roc,99.22%,99.13%
recall,99.22%,99.31%
precision,97.41%,95.46%


<b>Бонусный вопрос:</b>

Как вы думаете, какой из методов на практике является более предпочтительным: random negative sampling или 2-step approach?

Ваш ответ здесь:

Думаю, 2-step approach более предпочтителен: random negative sampling позволяет изменять только небольшой процент весов.