# Tree Based Classifier for Netflow Application Classification - Online Learning Version
Andrew Kiruluta, Netography 2022

In [1]:
from river.datasets import synth
from river import evaluate
from river import metrics
from river import tree
from river import compose
from river import compose, preprocessing, metrics, stats
import pandas as pd
import numbers
import pickle
from sklearn import preprocessing as preprocess

In [63]:
gen = synth.Agrawal(classification_function=0, seed=42)
# Take 1000 instances from the infinite data generator
dataset = iter(gen.take(1000))
dataset = pd.read_csv("KaggleImbalanced.csv")

model = tree.ExtremelyFastDecisionTreeClassifier(grace_period=100,delta=1e-5, \
        nominal_attributes=['Source.Port','Destination.Port','Protocol','L7Protocol'],\
        remove_poor_attrs = True,
        max_depth = 10,
        min_samples_reevaluate=100)

In [64]:
dataset.shape

(420502, 84)

In [65]:
set(dataset.ProtocolName)

{'AMAZON',
 'APPLE',
 'APPLE_ICLOUD',
 'APPLE_ITUNES',
 'CITRIX_ONLINE',
 'CLOUDFLARE',
 'CONTENT_FLASH',
 'DEEZER',
 'DNS',
 'DROPBOX',
 'EASYTAXI',
 'EBAY',
 'EDONKEY',
 'FACEBOOK',
 'FTP_CONTROL',
 'FTP_DATA',
 'GMAIL',
 'GOOGLE',
 'GOOGLE_MAPS',
 'HTTP',
 'HTTP_CONNECT',
 'HTTP_DOWNLOAD',
 'HTTP_PROXY',
 'INSTAGRAM',
 'IP_ICMP',
 'MICROSOFT',
 'MQTT',
 'MSN',
 'MSSQL',
 'MS_ONE_DRIVE',
 'NETFLIX',
 'NTP',
 'OFFICE_365',
 'SKYPE',
 'SPOTIFY',
 'SSH',
 'SSL',
 'SSL_NO_CERT',
 'TEAMVIEWER',
 'TELEGRAM',
 'TIMMEU',
 'TOR',
 'TWITCH',
 'TWITTER',
 'UBUNTUONE',
 'UNENCRYPED_JABBER',
 'UPNP',
 'WAZE',
 'WHATSAPP',
 'WIKIPEDIA',
 'WINDOWS_UPDATE',
 'YAHOO',
 'YOUTUBE'}

In [66]:
dataset.sample(5)

Unnamed: 0,Source.IP,Source.Port,Destination.IP,Destination.Port,Protocol,Flow.Duration,Total.Fwd.Packets,Total.Backward.Packets,Total.Length.of.Fwd.Packets,Total.Length.of.Bwd.Packets,...,Active.Mean,Active.Std,Active.Max,Active.Min,Idle.Mean,Idle.Std,Idle.Max,Idle.Min,L7Protocol,ProtocolName
173603,3232242000.0,57424.383306,180881200.0,3128.0,6.0,505.1864,1.741402,1.370701,133.891832,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,HTTP_DOWNLOAD
213441,3232269000.0,57211.0,180881200.0,3128.0,6.0,14095.0,1.0,1.0,6.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,212.0,MICROSOFT
5023,3232238000.0,59842.0,180881200.0,3128.0,6.0,6417565.0,6.0,27.0,829.0,903.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,178.0,AMAZON
39529,180881300.0,49612.485629,386478800.0,443.0,6.0,227.6009,3.678449,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,145.0,APPLE_ITUNES
284950,3232264000.0,56270.0,180881200.0,3128.0,6.0,2589504.0,23.0,25.0,3072.0,3400.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,125.0,SKYPE


# Map Features to Neto Data

In [111]:
l = list(dataset.columns)
l

['Source.IP',
 'Source.Port',
 'Destination.IP',
 'Destination.Port',
 'Protocol',
 'Flow.Duration',
 'Total.Fwd.Packets',
 'Total.Backward.Packets',
 'Total.Length.of.Fwd.Packets',
 'Total.Length.of.Bwd.Packets',
 'Fwd.Packet.Length.Max',
 'Fwd.Packet.Length.Min',
 'Fwd.Packet.Length.Mean',
 'Fwd.Packet.Length.Std',
 'Bwd.Packet.Length.Max',
 'Bwd.Packet.Length.Min',
 'Bwd.Packet.Length.Mean',
 'Bwd.Packet.Length.Std',
 'Flow.Bytes.s',
 'Flow.Packets.s',
 'Flow.IAT.Mean',
 'Flow.IAT.Std',
 'Flow.IAT.Max',
 'Flow.IAT.Min',
 'Fwd.IAT.Total',
 'Fwd.IAT.Mean',
 'Fwd.IAT.Std',
 'Fwd.IAT.Max',
 'Fwd.IAT.Min',
 'Bwd.IAT.Total',
 'Bwd.IAT.Mean',
 'Bwd.IAT.Std',
 'Bwd.IAT.Max',
 'Bwd.IAT.Min',
 'Fwd.PSH.Flags',
 'Bwd.PSH.Flags',
 'Fwd.URG.Flags',
 'Bwd.URG.Flags',
 'Fwd.Header.Length',
 'Bwd.Header.Length',
 'Fwd.Packets.s',
 'Bwd.Packets.s',
 'Min.Packet.Length',
 'Max.Packet.Length',
 'Packet.Length.Mean',
 'Packet.Length.Std',
 'Packet.Length.Variance',
 'FIN.Flag.Count',
 'SYN.Flag.Count',

In [107]:
fico = ['action', 'bits', 'bitsxrate', 'bogondst', 'bogonsrc', 'customer',
       'dstinternal', 'dstip', 'dstport', 'dstvlan', 'duration', 'end',
       'flowbrate', 'flowprate', 'flowrtime', 'flowsrcip', 'flowsrcname',
       'flowtype', 'flowversion', 'input', 'inputalias', 'inputclasses',
       'inputname', 'ipversion', 'nexthop', 'output', 'outputalias',
       'outputclasses', 'outputname', 'packets', 'packetsxrate', 'payload',
       'pbratio', 'protocolint', 'samplerate', 'site', 'srcinternal', 'srcip',
       'srcport', 'srcvlan', 'start', 'tags', 'tcpflagsint', 'timestamp',
       'tos', 'dstas.number', 'dstas.org', 'dstiprep.count',
       'dstiprep.categories', 'dstgeo.continentcode', 'dstgeo.countrycode',
       'dstgeo.location.lat', 'dstgeo.location.lon', 'dstgeo.subdiso',
       'dstowneras.number', 'dstowneras.org', 'icmp.code', 'icmp.type',
       'srcas.number', 'srcas.org', 'srciprep.count', 'srciprep.categories',
       'srcgeo.continentcode', 'srcgeo.countrycode', 'srcgeo.location.lat',
       'srcgeo.location.lon', 'srcgeo.subdiso', 'srcowneras.number',
       'srcowneras.org', 'tcpflags.urg', 'tcpflags.ack', 'tcpflags.psh',
       'tcpflags.rst', 'tcpflags.syn', 'tcpflags.fin', 'tcpflags.ece',
       'tcpflags.cwr', 'tcpflags.ns']
len(fico)

78

In [109]:
sub = 'tcp'
L = []
for text in fico:
    if sub in text.lower():
        #print(text)
        L.append(text)
print(set(L))

{'tcpflags.urg', 'tcpflags.ns', 'tcpflagsint', 'tcpflags.psh', 'tcpflags.fin', 'tcpflags.rst', 'tcpflags.ack', 'tcpflags.syn', 'tcpflags.ece', 'tcpflags.cwr'}


In [114]:
sub = 'udp'
L = []
F = []
for text in l:
    if sub in text.lower():
        #print(text)
        L.append(text)
print(set(L))

set()


'Source.IP': 'srcip'
'Destination.IP': 'dstip'
'Destination.Port': 'dstport'
'Source.Port': 'srcport'
'Bwd.Avg.Bulk.Rate': 'flowbrate'
'Fwd.Avg.Bulk.Rate': 'flowprate'
'Flow.Duration': 'duration'
'Flow.Bytes.s': 'bitsxrate'
'Flow.Packets.s': 'packetsxrate'
'Protocol': 'protocolint'
'Down.Up.Ratio': 'pbratio'




In [80]:
df =dataset.loc[dataset['ProtocolName'] == "APPLE"]
df.sample(5)

Unnamed: 0,Source.IP,Source.Port,Destination.IP,Destination.Port,Protocol,Flow.Duration,Total.Fwd.Packets,Total.Backward.Packets,Total.Length.of.Fwd.Packets,Total.Length.of.Bwd.Packets,...,Active.Mean,Active.Std,Active.Max,Active.Min,Idle.Mean,Idle.Std,Idle.Max,Idle.Min,L7Protocol,ProtocolName
13437,180881400.0,33956.0,1611075000.0,443.0,6.0,11935711.0,21.0,26.0,3768.0,32146.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,APPLE
14482,180881400.0,38432.0,301570600.0,443.0,6.0,30326589.0,10.0,11.0,679.0,3973.0,...,362082.0,0.0,362082.0,362082.0,29964507.0,0.0,29964507.0,29964507.0,140.0,APPLE
13785,180881400.0,34429.0,296444500.0,443.0,6.0,53.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,APPLE
13544,180881300.0,43996.0,1611075000.0,443.0,6.0,85.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,APPLE
17971,3232246000.0,49208.0,180881200.0,3128.0,6.0,1.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,APPLE


In [123]:
sub = 'APPLE'
L = []
for text in list(dataset.ProtocolName):
    if sub in text:
        #print(text)
        L.append(text)
print(set(L))


{'APPLE_ITUNES', 'APPLE_ICLOUD', 'APPLE'}


In [133]:
sub = 'UN'
L = []
for text in list(dataset.ProtocolName):
    if sub in text:
        #print(text)
        L.append(text)
print(set(L))

{'APPLE_ITUNES', 'UNENCRYPED_JABBER', 'UBUNTUONE'}


In [111]:
#X_train = dataset.loc[(dataset['ProtocolName'] == "WINDOWS_UPDATE")]
X1 = dataset[(dataset["ProtocolName"]=="APPLE")] # | (dataset["ProtocolName"]=="WINDOWS_UPDATE")]
X1.replace("APPLE", "apple")
X1.sample(5)

Unnamed: 0,Source.IP,Source.Port,Destination.IP,Destination.Port,Protocol,Flow.Duration,Total.Fwd.Packets,Total.Backward.Packets,Total.Length.of.Fwd.Packets,Total.Length.of.Bwd.Packets,...,Active.Mean,Active.Std,Active.Max,Active.Min,Idle.Mean,Idle.Std,Idle.Max,Idle.Min,L7Protocol,ProtocolName
10241,180881400.0,53400.0,297575900.0,5223.0,6.0,81649.0,2.0,2.0,6.0,53.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,APPLE
18788,3232262000.0,52622.375641,180881200.0,3128.0,6.0,434.33267,3.666534,1.0,31.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,APPLE
13011,180881300.0,38325.0,386479100.0,443.0,6.0,140.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,APPLE
11603,3232244000.0,54732.0,180881200.0,3128.0,6.0,802005.0,16.0,75.0,1916.0,12504.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,APPLE
18319,3232247000.0,49649.836568,180881200.0,3128.0,6.0,52.075271,1.0,1.95133,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,APPLE


In [110]:
X2 = dataset[(dataset["ProtocolName"]=="WINDOWS_UPDATE")]
#X2.loc[:,'ProtocolName'] = "windows"
X2=X2.replace('WINDOWS_UPDATE','windows')
X2.sample(5)

Unnamed: 0,Source.IP,Source.Port,Destination.IP,Destination.Port,Protocol,Flow.Duration,Total.Fwd.Packets,Total.Backward.Packets,Total.Length.of.Fwd.Packets,Total.Length.of.Bwd.Packets,...,Active.Mean,Active.Std,Active.Max,Active.Min,Idle.Mean,Idle.Std,Idle.Max,Idle.Min,L7Protocol,ProtocolName
392875,3232259000.0,53863.0,180881160.0,3128.0,6.0,1671.0,1.0,1.0,6.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,147.0,windows
397470,3232272000.0,55165.0,180881160.0,3128.0,6.0,345965.0,12.0,7.0,7606.0,821.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,147.0,windows
399618,3232259000.0,50460.0,180881160.0,3128.0,6.0,797343.0,17.0,22.0,3826.0,9268.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,147.0,windows
395605,3232282000.0,49971.0,180881156.0,3128.0,6.0,57727.0,4.0,2.0,335.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,147.0,windows
394614,3232259000.0,64265.0,180881159.0,3128.0,6.0,400.0,1.0,1.0,6.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,147.0,windows


In [104]:
X_train = pd.concat([X1, X2], ignore_index=True)
X_train.sample(5)

Unnamed: 0,Source.IP,Source.Port,Destination.IP,Destination.Port,Protocol,Flow.Duration,Total.Fwd.Packets,Total.Backward.Packets,Total.Length.of.Fwd.Packets,Total.Length.of.Bwd.Packets,...,Active.Mean,Active.Std,Active.Max,Active.Min,Idle.Mean,Idle.Std,Idle.Max,Idle.Min,L7Protocol,ProtocolName
10818,3232259000.0,51376.0,180881160.0,3128.0,6.0,440.0,1.0,1.0,6.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,147.0,windows
6370,3232246000.0,49208.0,180881160.0,3128.0,6.0,22.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,apple
5823,3232246000.0,50572.0,180881156.0,3128.0,6.0,5287755.0,12.0,8.0,929.0,652.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,apple
11469,3232259000.0,50529.0,180881159.0,3128.0,6.0,964.0,1.0,1.0,6.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,147.0,windows
14931,3232282000.0,50002.0,180881156.0,3128.0,6.0,56174.0,4.0,2.0,330.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,147.0,windows


In [None]:
le = preprocessing.LabelEncoder()
y = dataset['ProtocolName']
Y = le.fit_transform(y)
#metric = metrics.Accuracy()
metric = metrics.F1()

X = dataset.drop(['ProtocolName','Source.IP','Destination.IP'],axis=1).to_dict('records') #[0]
pred = []
count = 0
for x, y in zip(X,Y):
    y_pred = model.predict_one(x)
    metric = metric.update(y, y_pred)  # update the metric
    model = model.learn_one(x,y)
    pred.append(y_pred)
    if count % 10 == 0:
        print(y, y_pred)
    count +=1
print(f'Accuracy: {metric.get()}')

metric = metrics.F1()
for yt, yp in zip(Y, pred):
    metric = metric.update(yt, yp)
print(metric)

# Understanding FICO Features

In [97]:
import json
import boto3
import argparse
from s3fs import S3FileSystem
from smart_open import open

s3 = boto3.resource('s3')
s3_file = S3FileSystem()
s3_client = boto3.client("s3")
bucket = s3.Bucket('ml-flow-dump')

for obj in bucket.objects.filter(Prefix="flow"):
    source_url = 's3://ml-flow-dump/' + obj.key
    for i,json_line in enumerate(open(source_url, transport_params={"client": s3_client})):
        my_json = json.loads(json_line)
        df = pd.json_normalize(my_json) 
        l = list(df.columns)
        break
    break
print(df.columns)

Index(['action', 'bits', 'bitsxrate', 'bogondst', 'bogonsrc', 'customer',
       'dstinternal', 'dstip', 'dstport', 'dstvlan', 'duration', 'end',
       'flowbrate', 'flowprate', 'flowrtime', 'flowsrcip', 'flowsrcname',
       'flowtype', 'flowversion', 'input', 'inputalias', 'inputclasses',
       'inputname', 'ipversion', 'nexthop', 'output', 'outputalias',
       'outputclasses', 'outputname', 'packets', 'packetsxrate', 'payload',
       'pbratio', 'protocolint', 'samplerate', 'site', 'srcinternal', 'srcip',
       'srcport', 'srcvlan', 'start', 'tags', 'tcpflagsint', 'timestamp',
       'tos', 'dstas.number', 'dstas.org', 'dstiprep.count',
       'dstiprep.categories', 'dstgeo.continentcode', 'dstgeo.countrycode',
       'dstgeo.location.lat', 'dstgeo.location.lon', 'dstgeo.subdiso',
       'dstowneras.number', 'dstowneras.org', 'icmp.code', 'icmp.type',
       'srcas.number', 'srcas.org', 'srciprep.count', 'srciprep.categories',
       'srcgeo.continentcode', 'srcgeo.countrycode',

In [100]:
sub = 'icm'
L = []
for text in l:
    if sub in text.lower():
        #print(text)
        L.append(text)
print(df[L])

   icmp.code  icmp.type
0          0          0


In [105]:
df.nexthop

0    216.51.124.93
Name: nexthop, dtype: object

In [None]:
model = tree.SGTClassifier(feature_quantizer=tree.splitter.StaticQuantizer(n_bins=32, warm_start=10))

le = preprocessing.LabelEncoder()
y = dataset['ProtocolName']
Y = le.fit_transform(y)
#metric = metrics.Accuracy()
metric = metrics.F1()

X = dataset.drop(['ProtocolName','Source.IP','Destination.IP'],axis=1).to_dict('records') #[0]
pred = []
count = 0
for x, y in zip(X,Y):
    y_pred = model.predict_one(x)
    metric = metric.update(y, y_pred)  # update the metric
    model = model.learn_one(x,y)
    pred.append(y_pred)
    if count % 10 == 0:
        print(y, y_pred)
    count +=1
print(f'Accuracy: {metric.get()}')

metric = metrics.F1()
for yt, yp in zip(Y, pred):
    metric = metric.update(yt, yp)
print(metric)

In [None]:
y_pred = model.predict_proba_one(x)
y_pred

In [45]:
metric = metrics.F1()
for yt, yp in zip(Y, pred):
    metric = metric.update(yt, yp)
print(metric)

F1: 97.70%


In [None]:
cm = metrics.ConfusionMatrix()
for yt, yp in zip(Y, pred):
    cm = cm.update(yt, yp)
print(cm)

In [43]:
metric = metrics.MacroPrecision()
for yt, yp in zip(Y, pred):
    precision = metric.update(yt, yp)
print(precision)

MacroPrecision: 24.09%


In [44]:
metric = metrics.MicroRecall()
for yt, yp in zip(Y, pred):
    precision = metric.update(yt, yp)
print(precision)

MicroRecall: 26.76%


# Test Pickled Pipeline Model for Later Inference

In [6]:
def get_pipeline(model):
    #mean = stats.Mean()
    #mode = stats.Mode()
    cat = (
        compose.SelectType(str)
        | preprocessing.StatImputer()
        | preprocessing.OneHotEncoder(sparse=True)
    )
    num = compose.SelectType(numbers.Number) | preprocessing.StatImputer() | preprocessing.StandardScaler(stats.Mean())
    processor = num + cat
    return processor | model

In [34]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing as preprocess
from statistics import mean
dataset = pd.read_csv("./KaggleImbalanced.csv")
dataset = dataset.head(40000)
le = preprocess.LabelEncoder()
dataset['target'] = le.fit_transform(dataset['ProtocolName'])
metric_acc = metrics.Accuracy()
pipeline = get_pipeline(model)
X_train, X_test, y_train, y_test = train_test_split(dataset, dataset.target, test_size=0.1,  random_state=42)
    
# sort data to group target since online training learns only one sample at a time
X_train = X_train.sort_values('target')
y_train = X_train.target
    
# without src & dst ip addresses:
X_train = X_train.drop(['ProtocolName','Source.IP','Destination.IP','target'],axis=1).to_dict('records')
X_test = X_test.drop(['ProtocolName','Source.IP','Destination.IP','target'],axis=1).to_dict('records')

In [35]:
dataset.shape

(40000, 85)

In [None]:
pred = []
tru = []
acc = []
count = 0

for x, y in zip(X_train,y_train):
    y_pred = pipeline.predict_one(x)
    acc.append(metric_acc.update(y, y_pred).get())  # update the metric
    pipeline.learn_one(x,y)
    if count > 0: 
        tru.append(y)
        pred.append(y_pred)
    if count % 100 == 0:
        print(y, y_pred)
        #print(count)
        #for yt, yp in zip(y, pred):
        #    metric = metric.update(yt, yp)
        #print(metric)
    count +=1
print(f'train accuracy: {mean(acc)}')
#print(model.debug_one(x)) # prediction probabilities

metric_f1 = metrics.F1()
f1 = []
for yt, yp in zip(y_train, pred):
    f1.append(metric_f1.update(yt, yp).get())
#print(metric)
print(f'train F1 score: {mean(f1)}')

In [17]:
# load pretrained model pipeline
file = open("model_AMFClassifier_2.pkl",'rb')
pipeline = pickle.load(file)
file.close()

In [20]:
pipeline

In [18]:
# Test on held out data
metric_f1 = metrics.F1()
pred = []
tru = []
f1 = []
acc = []
for x, y in zip(X_test,y_test):
    y_pred = pipeline.predict_one(x)
    tru.append(y)
    pred.append(y_pred)
    f1.append(metric_f1.update(y, y_pred).get())
    acc.append(metric_acc.update(y, y_pred).get())
print(f'test F1 score: {mean(f1)}')
print(f'test accuracy: {mean(acc)}')

test F1 score: 0.0
test accuracy: 0.28142922410118254


In [42]:
dataset = pd.read_csv("./KaggleImbalanced.csv")
le = preprocess.LabelEncoder()
dataset['target'] = le.fit_transform(dataset['ProtocolName'])
dataset.target.value_counts()

0     10000
35    10000
25    10000
1     10000
27    10000
29    10000
30    10000
31    10000
32    10000
33    10000
34    10000
36    10000
23    10000
37    10000
38    10000
41    10000
43    10000
44    10000
48    10000
49    10000
50    10000
51    10000
24    10000
26    10000
22    10000
13    10000
2     10000
3     10000
5     10000
6     10000
8     10000
21    10000
10    10000
11    10000
9     10000
18    10000
17    10000
20    10000
19    10000
52    10000
16    10000
15    10000
12       95
47       79
7        74
45       45
4        38
40       34
46       34
39       33
14       25
42       24
28       21
Name: target, dtype: int64

In [51]:
new_df = dataset.groupby('target').sample(n=20)
new_df.target.value_counts()

0     20
27    20
29    20
30    20
31    20
32    20
33    20
34    20
35    20
36    20
37    20
38    20
39    20
40    20
41    20
42    20
43    20
44    20
45    20
46    20
47    20
48    20
49    20
50    20
51    20
28    20
26    20
1     20
25    20
2     20
3     20
4     20
5     20
6     20
7     20
8     20
9     20
10    20
11    20
12    20
13    20
14    20
15    20
16    20
17    20
18    20
19    20
20    20
21    20
22    20
23    20
24    20
52    20
Name: target, dtype: int64

In [44]:
dataset.shape

(420502, 85)