In [1]:
import pandas as pd
import random
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
df_train = pd.read_csv('data.csv', sep=';').iloc[:,1:]

In [3]:
df_train

Unnamed: 0,ip,200_proc,300_proc,page_proc,400_proc,night_proc,number_requests,request_image,type_user
0,54.131.75.205,30.953,21.065,16.964,8.030,1.053,149,20.516,human
1,187.231.153.4,48.324,38.419,23.293,3.588,15.171,26,24.339,human
2,154.145.184.6,47.418,21.994,19.241,7.365,0.319,128,13.913,human
3,44.52.240.50,26.554,49.513,25.753,3.120,19.842,179,8.323,human
4,144.85.254.116,32.210,45.793,12.566,1.535,10.783,174,15.532,human
...,...,...,...,...,...,...,...,...,...
595,157.33.215.28,30.341,33.183,37.255,25.268,27.245,139,8.387,crawler
596,240.162.106.180,18.973,32.929,51.447,29.557,25.113,70,7.778,crawler
597,211.57.124.64,50.522,31.376,31.439,1.624,38.779,153,9.877,human
598,187.60.48.17,34.885,28.095,34.586,38.400,32.009,89,4.918,crawler


In [4]:
df_train, df_test = train_test_split(df_train, test_size=0.25)

In [5]:
df_train['type_user'].value_counts()

human      371
crawler     79
Name: type_user, dtype: int64

In [6]:
df_test['type_user'].value_counts()

human      129
crawler     21
Name: type_user, dtype: int64

In [7]:
df_train = df_train.iloc[:,1:]

In [8]:
ip_test = df_test.iloc[:, 0]
df_test = df_test.iloc[:, 1:]

In [9]:
x_train, y_train = df_train.drop(['type_user'], axis = 1), df_train.loc[:, 'type_user']
x_test, y_test = df_test.drop(['type_user'], axis = 1), df_test.loc[:, 'type_user']

In [10]:
minmax = MinMaxScaler()
b = minmax.fit_transform(x_train)

x_train = b
x_train

array([[0.1538549 , 0.32974032, 0.37347729, ..., 0.16305084, 0.78888889,
        0.14834804],
       [0.03956074, 0.33662853, 0.90670198, ..., 0.97732688, 0.19444444,
        0.06929597],
       [0.93056408, 0.42879788, 0.15265838, ..., 0.06373053, 0.16666667,
        0.68531396],
       ...,
       [0.77652729, 0.40490532, 0.42894683, ..., 0.00323543, 0.54444444,
        0.13661996],
       [0.15724257, 0.03928294, 0.54019149, ..., 0.96967721, 0.98333333,
        0.11160699],
       [0.34306437, 0.52358994, 0.75913628, ..., 0.70113616, 0.76666667,
        0.17813527]])

In [11]:
b = minmax.transform(x_test)

x_test = b
x_test

array([[0.39956347, 0.80283598, 0.05966592, ..., 0.54084422, 0.43333333,
        0.05808684],
       [0.80494737, 0.36150099, 0.20790385, ..., 0.42499561, 0.43333333,
        0.20484345],
       [0.5475752 , 0.2657002 , 0.21324099, ..., 0.8623561 , 0.82222222,
        0.37816987],
       ...,
       [1.00125048, 0.5821829 , 0.31884294, ..., 0.56023175, 0.16666667,
        0.47905207],
       [0.5773367 , 0.28437617, 0.07909961, ..., 0.18610017, 0.53888889,
        0.59889293],
       [0.70659118, 0.34478485, 0.03310246, ..., 0.45002633, 0.15      ,
        0.22760768]])

In [12]:
class PNN:
    
    def __init__(self, X, y):
        
        self.w = X
        self.clss = np.unique(y)
        self.mask = [y == clss 
                     for clss in np.unique(y)]
        
    
    def predict(self, x):
        
        image_layer = np.zeros(self.w.shape[0])
        addition_layer = np.zeros(len(self.mask))
        
        for i in range(self.w.shape[0]):
            for j in range(self.w.shape[1]):
                
                image_layer[i] += np.e ** ( -(self.w[i,j] - x[j])**2 / 0.3**2 )             
        
        for k, mask in enumerate(self.mask):
            addition_layer[k] = np.sum(image_layer * mask) / np.sum(mask)
        
        return self.clss[np.argmax(addition_layer)]

In [13]:
y_test.value_counts()

human      129
crawler     21
Name: type_user, dtype: int64

In [14]:
y_train.value_counts()

human      371
crawler     79
Name: type_user, dtype: int64

In [15]:
pnn = PNN(x_train, y_train)

In [16]:
result_pred = list(map(lambda x: pnn.predict(x), x_test))

In [17]:
df_result = pd.DataFrame(columns=['ip','real_type_user','pred_type_user'])
df_result['ip'] = ip_test.values 
df_result['real_type_user'] = y_test.values
df_result['pred_type_user'] = result_pred
df_result

Unnamed: 0,ip,real_type_user,pred_type_user
0,185.116.252.184,human,human
1,90.119.247.70,human,human
2,209.17.83.195,human,human
3,184.117.252.9,human,crawler
4,24.201.54.13,human,human
...,...,...,...
145,147.51.179.253,human,human
146,109.177.65.72,human,human
147,99.174.13.84,human,human
148,68.119.143.234,human,human


In [18]:
result_error = sum(list(map(lambda x, y: int(x==y),result_pred,y_test.to_numpy())))

In [19]:
err =  round((len(result_pred)-result_error)/len(result_pred)*100,3)
print('accuracy: %s / %s; %s%% - error'%(result_error, len(result_pred),err))

accuracy: 143 / 150; 4.667% - error


In [20]:
df_result[df_result.real_type_user!=df_result.pred_type_user]

Unnamed: 0,ip,real_type_user,pred_type_user
3,184.117.252.9,human,crawler
10,161.182.29.56,human,crawler
35,44.101.99.73,human,crawler
38,176.36.28.36,human,crawler
42,132.129.214.167,human,crawler
44,202.204.63.203,human,crawler
120,86.118.72.104,human,crawler


In [21]:
df_result[df_result.pred_type_user=='crawler'].ip.values

array(['184.117.252.9', '161.182.29.56', '38.110.120.105',
       '158.22.246.105', '124.240.66.182', '215.126.225.178',
       '241.11.36.117', '242.191.68.207', '44.101.99.73', '176.36.28.36',
       '132.129.214.167', '202.204.63.203', '171.142.156.4',
       '46.171.125.244', '161.18.90.147', '42.251.0.243',
       '139.130.236.95', '222.163.89.170', '20.67.80.162',
       '59.67.174.229', '243.227.104.139', '218.104.71.187',
       '90.151.11.30', '74.248.112.208', '251.128.251.123',
       '86.118.72.104', '209.116.196.96', '213.240.166.85'], dtype=object)