In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

In [3]:
import matplotlib.pyplot as plt

%matplotlib inline

In [9]:
# Load the banknote-authentication table. header=None tells pandas the file has
# no header row, so the columns get integer labels instead of names.
bank = pd.read_csv("data_banknote_authentication.txt", header=None)
# Preview the first rows to sanity-check the parse.
bank.head()

Unnamed: 0,0,1,2,3,4
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [27]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
# NOTE(review): this cell's execution count is later than the cell that
# overwrites column 1 further down, so the stored output may describe the
# modified data rather than the raw file - confirm with Restart & Run All.
bank.describe()

Unnamed: 0,0,1,2,3,4
count,1372.0,1372.0,1372.0,1372.0,1372.0
mean,0.433735,2.061069,1.397627,-1.191657,0.444606
std,2.842763,5.151982,4.31003,2.101013,0.497103
min,-7.0421,-10.4519,-5.2861,-8.5482,0.0
25%,-1.773,-0.81272,-1.574975,-2.41345,0.0
50%,0.49618,2.31965,0.61663,-0.58665,0.0
75%,2.821475,6.27145,3.17925,0.39481,1.0
max,6.8248,11.1079,17.9274,2.4495,1.0


In [23]:
# Count missing values per column.
bank.isna().sum()

0    0
1    0
2    0
3    0
4    0
dtype: int64

In [24]:
# Flag rows whose column-1 value lies outside the central 95% of that column,
# i.e. strictly below the 2.5th or strictly above the 97.5th percentile.
min_value, max_value = np.quantile(bank[1], q=[0.025, 0.975])
below_range = bank[1] < min_value
above_range = bank[1] > max_value
condition = above_range | below_range
# Display the flagged rows.
bank[condition]

Unnamed: 0,0,1,2,3,4
59,-0.78289,11.3603,-0.37644,-7.04950,0
62,4.25860,11.2962,-4.09430,-4.34570,0
99,-2.74190,11.4038,2.53940,-5.57930,0
126,-2.34300,12.9516,3.32850,-5.94260,0
135,4.16050,11.2196,-3.61360,-4.08190,0
...,...,...,...,...,...
1308,-4.63380,-12.7509,16.71660,-3.21680,1
1314,-3.50600,-12.5667,15.16060,-0.75216,1
1329,-2.96720,-13.2869,13.47270,-2.62710,1
1336,-2.05450,-10.8679,9.49260,-1.41160,1


In [26]:
# Overwrite the flagged column-1 values with the column median. The right-hand
# side is evaluated first, so the median is taken while the flagged values are
# still in the column.
# NOTE(review): this mutates `bank` in place, so re-running the quantile cell
# afterwards produces different thresholds. It also runs before the train/test
# split below, which means the replacement statistic is computed on all rows.
bank.loc[condition, 1] = bank[1].median()

In [29]:
# Features are every column except the last; the last column is the target.
x_data, y_data = bank.iloc[:, :-1], bank.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
split_parts = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=1,
)
x_train, x_test, y_train, y_test = split_parts

In [40]:
from sklearn.linear_model import SGDClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

In [37]:
# Baseline: logistic regression fitted with stochastic gradient descent.
# The original `loss='log'` spelling was deprecated in scikit-learn 1.1 and
# removed in 1.3; `loss='log_loss'` is the equivalent name (needs sklearn >= 1.1).
model = SGDClassifier(loss='log_loss', random_state=1, max_iter=2000)

# Fit on the training split and predict hard class labels for the test split.
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

In [35]:
# Gradient-boosted trees with XGBoost's default hyper-parameters.
model2 = xgb.XGBClassifier()

# fit() returns the fitted estimator, so training and prediction can be chained.
y_predict2 = model2.fit(x_train, y_train).predict(x_test)





In [41]:
# Random forest: 50 trees capped at depth 5, with a fixed seed.
model3 = RandomForestClassifier(
    max_depth=5,
    n_estimators=50,
    random_state=42,
)

# Train on the training split, then predict class labels for the test split.
y_predict3 = model3.fit(x_train, y_train).predict(x_test)

In [32]:
def evaluate_results(y_test, y_predict):
    """Print F1, recall and precision for a set of predictions.

    Each score is computed with the corresponding scikit-learn metric and
    printed as a percentage with two decimals. Nothing is returned.

    Args:
        y_test: True class labels.
        y_predict: Predicted class labels, aligned with ``y_test``.
    """
    print('Classification results:')
    # (output template, metric function, extra keyword arguments)
    scorers = (
        ("f1: %.2f%%", f1_score, {}),
        ("recall: %.2f%%", recall_score, {'average': 'binary'}),
        ("precision: %.2f%%", precision_score, {'average': 'binary'}),
    )
    for template, scorer, options in scorers:
        score = scorer(y_test, y_predict, **options)
        print(template % (score * 100.0))

In [38]:
# Report test-split metrics for the SGD classifier's predictions.
evaluate_results(y_test, y_predict)

Classification results:
f1: 88.43%
recall: 90.68%
precision: 86.29%


In [36]:
# Report test-split metrics for the XGBoost classifier's predictions.
evaluate_results(y_test, y_predict2)

Classification results:
f1: 99.57%
recall: 99.15%
precision: 100.00%


In [42]:
# Report test-split metrics for the random forest's predictions.
evaluate_results(y_test, y_predict3)

Classification results:
f1: 98.33%
recall: 100.00%
precision: 96.72%


In [78]:
# Work on a copy so the PU-learning experiment leaves `bank` untouched.
mod_bank = bank.copy()
# Positions of the rows whose true label (last column) is positive.
# NOTE(review): these positions are later used as index labels, which is only
# equivalent while the frame keeps its default 0..n-1 index - confirm.
pos_ind = np.where(mod_bank.iloc[:, -1].values == 1)[0]
# Shuffle them with a seeded generator so the same positives are picked on
# every run (the previous unseeded global shuffle changed the subset each time).
np.random.default_rng(42).shuffle(pos_ind)
# Keep 40% of the positives labelled; the remainder will be treated as unlabelled.
pos_sample_len = int(np.ceil(0.4 * len(pos_ind)))
print(f'Используем {pos_sample_len}/{len(pos_ind)} как позитивные и делаем неразмеченными остальные')
pos_sample = pos_ind[:pos_sample_len]

Используем 244/610 как позитивные и делаем неразмеченными остальные


In [79]:
# Build the PU label column: every row starts as unlabelled (-1), then the
# selected positive rows are marked with 1.
mod_bank['class_test'] = -1
mod_bank.loc[pos_sample, 'class_test'] = 1
# `class_test` is the last column, so it can be addressed by name here.
print('Целевых переменных:\n', mod_bank['class_test'].value_counts())

Целевых переменных:
 -1    1128
 1     244
Name: class_test, dtype: int64


In [80]:
# Extract NumPy arrays from the frame; its two trailing columns are labels.
y_labeled = mod_bank['class_test'].values   # PU label (positive vs. unlabelled)
y_positive = mod_bank.iloc[:, -2].values    # ground-truth class
x_data2 = mod_bank.iloc[:, :-2].values      # feature columns only

In [81]:
# Shuffle the rows with a fixed seed so the random negative sample, and the
# scores that depend on it, are the same on every run.
mod_bank = mod_bank.sample(frac=1, random_state=42)

# Compute each subset once instead of re-filtering the frame for every slice.
unlabeled = mod_bank[mod_bank['class_test'] == -1]
pos_sample = mod_bank[mod_bank['class_test'] == 1]
n_pos = len(pos_sample)

# The first n_pos unlabelled rows act as pseudo-negatives for training;
# the remaining unlabelled rows are held out for evaluation.
neg_sample = unlabeled[:n_pos]
sample_test = unlabeled[n_pos:]
print(neg_sample.shape, pos_sample.shape)

# Balanced training set (pseudo-negatives + labelled positives), shuffled.
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=42)

(244, 6) (244, 6)


In [82]:
# Same forest configuration as the supervised model, trained on the PU sample.
model_rns = RandomForestClassifier(n_estimators=50, random_state=42, max_depth=5)

# Recode the unlabelled marker (-1) as class 0 so the target is binary 0/1.
unlabeled_rows = sample_train['class_test'] == -1
sample_train.loc[unlabeled_rows, 'class_test'] = 0

# The last two columns are labels; everything before them is a feature.
train_features = sample_train.iloc[:, :-2].values
train_labels = sample_train.iloc[:, -1].values
model_rns.fit(train_features, train_labels)

# Predict on the held-out unlabelled rows and score against the true class,
# which sits in the second-to-last column.
test_features = sample_test.iloc[:, :-2].values
y_predict_rns = model_rns.predict(test_features)
evaluate_results(sample_test.iloc[:, -2].values, y_predict_rns)

Classification results:
f1: 94.45%
recall: 92.63%
precision: 96.35%


In [57]:
# Re-print the supervised random-forest metrics (same call as earlier),
# presumably for comparison with the PU-trained forest above.
evaluate_results(y_test, y_predict3)

Classification results:
f1: 98.33%
recall: 100.00%
precision: 96.72%
