In [241]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import LabelEncoder
from collections import Counter

In [242]:
df = pd.read_csv("./data/train_dataset.csv")

In [243]:
df.head()

Unnamed: 0,domain,length,is_dga
0,tgynbbsiyv-ecfrdt.ru,20,1
1,bvokgcwqlnjlieemtthw-homqchjsmqkqg.ba,37,1
2,rkcp.sa,7,1
3,flightbasketball.com,20,0
4,lkkogmbqulojf-follow.se,23,1


In [244]:
df.describe()

Unnamed: 0,length,is_dga
count,3000000.0,3000000.0
mean,16.27779,0.666667
std,6.593647,0.4714045
min,4.0,0.0
25%,12.0,0.0
50%,15.0,1.0
75%,19.0,1.0
max,73.0,1.0


In [245]:
df.isna().sum()

domain    0
length    0
is_dga    0
dtype: int64

In [246]:
df['tld'] = df['domain'].map(lambda x: x.split(".")[-1])

In [247]:
df.columns

Index(['domain', 'length', 'is_dga', 'tld'], dtype='object')

In [248]:
df['tld']

0           ru
1           ba
2           sa
3          com
4           se
          ... 
2999995     ru
2999996     pe
2999997     mx
2999998     au
2999999     za
Name: tld, Length: 3000000, dtype: object

In [249]:
df['dash_count'] = df['domain'].map(lambda x: len(x.split('-')) - 1)

In [250]:
df[df['is_dga'] == 0]

Unnamed: 0,domain,length,is_dga,tld,dash_count
3,flightbasketball.com,20,0,com,0
5,actouch.com,11,0,com,0
7,mideastoffers.com,17,0,com,0
12,1bios.net,9,0,net,0
17,ej-technologies.com,19,0,com,1
...,...,...,...,...,...
2999983,31ventures.jp,13,0,jp,0
2999984,hrconnection.com,16,0,com,0
2999986,findyoutube.net,15,0,net,0
2999993,imh.com.sg,10,0,sg,0


In [251]:
df[df['is_dga'] == 1]

Unnamed: 0,domain,length,is_dga,tld,dash_count
0,tgynbbsiyv-ecfrdt.ru,20,1,ru,1
1,bvokgcwqlnjlieemtthw-homqchjsmqkqg.ba,37,1,ba,1
2,rkcp.sa,7,1,sa,0
4,lkkogmbqulojf-follow.se,23,1,se,1
6,oinlgzuwdbeiodnpc-kxej.ad,25,1,ad,1
...,...,...,...,...,...
2999995,mtkdvmusicfde.ru,16,1,ru,0
2999996,rkzanzenncqrn-week.pe,21,1,pe,1
2999997,trafficddijjnhtygjyvsfpqwbghv.mx,32,1,mx,0
2999998,ncxcbltxvhkyiopufvnp-idbyw.au,29,1,au,1


In [252]:
df['tld_len'] = df['tld'].map(lambda x: len(x))

In [253]:
# You can count number of n-grams using next formula: max(0, k - n + 1), where's k is length of string, n is length of ngram

In [254]:
df["2grams"] = df['domain'].map(lambda x: max(0, len(x) - 2 + 1))

In [255]:
df["3grams"] = df['domain'].map(lambda x: max(0, len(x) - 3 + 1))

In [256]:
df

Unnamed: 0,domain,length,is_dga,tld,dash_count,tld_len,2grams,3grams
0,tgynbbsiyv-ecfrdt.ru,20,1,ru,1,2,19,18
1,bvokgcwqlnjlieemtthw-homqchjsmqkqg.ba,37,1,ba,1,2,36,35
2,rkcp.sa,7,1,sa,0,2,6,5
3,flightbasketball.com,20,0,com,0,3,19,18
4,lkkogmbqulojf-follow.se,23,1,se,1,2,22,21
...,...,...,...,...,...,...,...,...
2999995,mtkdvmusicfde.ru,16,1,ru,0,2,15,14
2999996,rkzanzenncqrn-week.pe,21,1,pe,1,2,20,19
2999997,trafficddijjnhtygjyvsfpqwbghv.mx,32,1,mx,0,2,31,30
2999998,ncxcbltxvhkyiopufvnp-idbyw.au,29,1,au,1,2,28,27


In [257]:
X = df.drop(['is_dga', "domain"], axis = 1)

In [258]:
y = df['is_dga']

In [259]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = True)

In [260]:
train_pool = Pool(X_train, y_train, cat_features = ['tld'])
test_pool = Pool(X_test, y_test, cat_features = ['tld'])

In [261]:
model = CatBoostClassifier(task_type='GPU',
                           iterations=50,
                           depth=6,
                           learning_rate=0.1,
                           loss_function='CrossEntropy',
                           verbose=True)

In [262]:
model.fit(train_pool, eval_set = test_pool, use_best_model = True, verbose = True)

0:	learn: 0.6423019	test: 0.6422793	best: 0.6422793 (0)	total: 16ms	remaining: 782ms
1:	learn: 0.6004369	test: 0.6003874	best: 0.6003874 (1)	total: 30.3ms	remaining: 726ms
2:	learn: 0.5658209	test: 0.5657497	best: 0.5657497 (2)	total: 44.5ms	remaining: 698ms
3:	learn: 0.5370489	test: 0.5369549	best: 0.5369549 (3)	total: 58.8ms	remaining: 677ms
4:	learn: 0.4958361	test: 0.4956691	best: 0.4956691 (4)	total: 73.6ms	remaining: 662ms
5:	learn: 0.4609034	test: 0.4606474	best: 0.4606474 (5)	total: 88.8ms	remaining: 651ms
6:	learn: 0.4311792	test: 0.4308490	best: 0.4308490 (6)	total: 104ms	remaining: 639ms
7:	learn: 0.4056459	test: 0.4052704	best: 0.4052704 (7)	total: 120ms	remaining: 630ms
8:	learn: 0.3837065	test: 0.3832405	best: 0.3832405 (8)	total: 135ms	remaining: 616ms
9:	learn: 0.3647274	test: 0.3641686	best: 0.3641686 (9)	total: 150ms	remaining: 601ms
10:	learn: 0.3479827	test: 0.3473356	best: 0.3473356 (10)	total: 165ms	remaining: 586ms
11:	learn: 0.3333255	test: 0.3326367	best: 0.332

<catboost.core.CatBoostClassifier at 0x2d03d4cdfd0>

In [263]:
metrics = model.eval_metrics(test_pool, metrics = ['Precision', 'Recall', 'F1', 'AUC'])
precision = metrics['Precision']
recall = metrics['Recall']
f1 = metrics['F1']
auc = metrics['AUC']

In [264]:
precision[-1]

0.9177530115006546

In [265]:
recall[-1]

0.9492938870219235

In [266]:
f1[-1]

0.9332570320573518

In [267]:
auc[-1]

0.9677897329055695

In [300]:
real_df = pd.read_csv("./data/packets_from_pcap.csv", encoding_errors = 'ignore')

  real_df = pd.read_csv("./data/packets_from_pcap.csv", encoding_errors = 'ignore')


In [301]:
real_df.isna().sum()

No.                  0
Time                 0
Source               0
Destination          0
Length               0
Name           4163241
Name Length    4163241
dtype: int64

In [302]:
real_df = real_df.dropna()

In [303]:
real_df

Unnamed: 0,No.,Time,Source,Destination,Length,Name,Name Length
835,836,1.083720e+03,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
836,837,1.087156e+03,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
837,838,1.091741e+03,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
840,841,1.100282e+03,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
841,842,1.118605e+03,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
...,...,...,...,...,...,...,...
7624591,7624592,5.054352e+07,10.250.18.22,20.2.7.51,115,<Unknown extended label>,24
7624592,7624593,5.054352e+07,47.5.99.35,10.250.33.37,379,<Unknown extended label>,24
7624594,7624595,5.054352e+07,10.250.23.27,93.184.216.34,379,<Unknown extended label>,24
7624596,7624597,5.054352e+07,10.250.33.37,42.2.5.24,379,<Unknown extended label>,24


In [304]:
real_df = real_df.drop(["No.", "Time"], axis = 1)

In [305]:
real_df

Unnamed: 0,Source,Destination,Length,Name,Name Length
835,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
836,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
837,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
840,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
841,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
...,...,...,...,...,...
7624591,10.250.18.22,20.2.7.51,115,<Unknown extended label>,24
7624592,47.5.99.35,10.250.33.37,379,<Unknown extended label>,24
7624594,10.250.23.27,93.184.216.34,379,<Unknown extended label>,24
7624596,10.250.33.37,42.2.5.24,379,<Unknown extended label>,24


In [306]:
real_df = real_df[real_df["Name"] != "<Unknown extended label>"]

In [307]:
real_df

Unnamed: 0,Source,Destination,Length,Name,Name Length
835,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
836,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
837,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
840,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
841,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
...,...,...,...,...,...
7602912,10.10.15.122,224.0.0.251,119,"_674A0243._sub._googlecast._tcp.local,_8E6C866...",373722
7602913,fe80::3028:8aff:fe08:f2dd,ff02::fb,139,"_674A0243._sub._googlecast._tcp.local,_8E6C866...",373722
7605614,fe80::6e30:c1ca:8206:6a95,ff02::fb,105,_microsoft_mcc._tcp.local,25
7607600,10.10.15.31,224.0.0.251,85,_microsoft_mcc._tcp.local,25


In [308]:
real_df = real_df.drop(real_df[real_df["Name Length"].str.count(',') > 0].index)

In [309]:
real_df

Unnamed: 0,Source,Destination,Length,Name,Name Length
835,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
836,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
837,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
840,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
841,192.168.200.90,37.221.193.103,73,grafin.ru,9.0
...,...,...,...,...,...
7572877,10.10.15.53,224.0.0.251,73,w-me-10.local,13
7574410,10.10.15.53,224.0.0.251,67,w-me-10,7
7605614,fe80::6e30:c1ca:8206:6a95,ff02::fb,105,_microsoft_mcc._tcp.local,25
7607600,10.10.15.31,224.0.0.251,85,_microsoft_mcc._tcp.local,25


In [310]:
real_df['tld'] = real_df['Name'].map(lambda x: x.split(".")[-1])

In [311]:
real_df['dash_count'] = real_df['Name'].map(lambda x: len(x.split('-')) - 1)

In [312]:
real_df['tld_len'] = real_df['tld'].map(lambda x: len(x))

In [313]:
real_df["2grams"] = real_df['Name'].map(lambda x: max(0, len(x) - 2 + 1))

In [314]:
real_df["3grams"] = real_df['Name'].map(lambda x: max(0, len(x) - 3 + 1))

In [315]:
real_df

Unnamed: 0,Source,Destination,Length,Name,Name Length,tld,dash_count,tld_len,2grams,3grams
835,192.168.200.90,37.221.193.103,73,grafin.ru,9.0,ru,0,2,8,7
836,192.168.200.90,37.221.193.103,73,grafin.ru,9.0,ru,0,2,8,7
837,192.168.200.90,37.221.193.103,73,grafin.ru,9.0,ru,0,2,8,7
840,192.168.200.90,37.221.193.103,73,grafin.ru,9.0,ru,0,2,8,7
841,192.168.200.90,37.221.193.103,73,grafin.ru,9.0,ru,0,2,8,7
...,...,...,...,...,...,...,...,...,...,...
7572877,10.10.15.53,224.0.0.251,73,w-me-10.local,13,local,2,5,12,11
7574410,10.10.15.53,224.0.0.251,67,w-me-10,7,w-me-10,2,7,6,5
7605614,fe80::6e30:c1ca:8206:6a95,ff02::fb,105,_microsoft_mcc._tcp.local,25,local,0,5,24,23
7607600,10.10.15.31,224.0.0.251,85,_microsoft_mcc._tcp.local,25,local,0,5,24,23


In [316]:
real_df = real_df.rename(columns={"Name Length": "length"})

In [317]:
real_df

Unnamed: 0,Source,Destination,Length,Name,length,tld,dash_count,tld_len,2grams,3grams
835,192.168.200.90,37.221.193.103,73,grafin.ru,9.0,ru,0,2,8,7
836,192.168.200.90,37.221.193.103,73,grafin.ru,9.0,ru,0,2,8,7
837,192.168.200.90,37.221.193.103,73,grafin.ru,9.0,ru,0,2,8,7
840,192.168.200.90,37.221.193.103,73,grafin.ru,9.0,ru,0,2,8,7
841,192.168.200.90,37.221.193.103,73,grafin.ru,9.0,ru,0,2,8,7
...,...,...,...,...,...,...,...,...,...,...
7572877,10.10.15.53,224.0.0.251,73,w-me-10.local,13,local,2,5,12,11
7574410,10.10.15.53,224.0.0.251,67,w-me-10,7,w-me-10,2,7,6,5
7605614,fe80::6e30:c1ca:8206:6a95,ff02::fb,105,_microsoft_mcc._tcp.local,25,local,0,5,24,23
7607600,10.10.15.31,224.0.0.251,85,_microsoft_mcc._tcp.local,25,local,0,5,24,23


In [318]:
model

<catboost.core.CatBoostClassifier at 0x2d03d4cdfd0>

In [319]:
X = real_df.drop(["Source", "Destination", "Length", "Name"], axis = 1)

In [320]:
X

Unnamed: 0,length,tld,dash_count,tld_len,2grams,3grams
835,9.0,ru,0,2,8,7
836,9.0,ru,0,2,8,7
837,9.0,ru,0,2,8,7
840,9.0,ru,0,2,8,7
841,9.0,ru,0,2,8,7
...,...,...,...,...,...,...
7572877,13,local,2,5,12,11
7574410,7,w-me-10,2,7,6,5
7605614,25,local,0,5,24,23
7607600,25,local,0,5,24,23


In [321]:
test_data_ = Pool(X, cat_features = ['tld'])

In [322]:
model.predict(test_data_)

array([0, 0, 0, ..., 1, 1, 1], shape=(3204094,))

In [323]:
real_df["is_dga"] = model.predict(test_data_)

In [326]:
real_df[real_df["is_dga"] == 0]

Unnamed: 0,Source,Destination,Length,Name,length,tld,dash_count,tld_len,2grams,3grams,is_dga
835,192.168.200.90,37.221.193.103,73,grafin.ru,9.0,ru,0,2,8,7,0
836,192.168.200.90,37.221.193.103,73,grafin.ru,9.0,ru,0,2,8,7,0
837,192.168.200.90,37.221.193.103,73,grafin.ru,9.0,ru,0,2,8,7,0
840,192.168.200.90,37.221.193.103,73,grafin.ru,9.0,ru,0,2,8,7,0
841,192.168.200.90,37.221.193.103,73,grafin.ru,9.0,ru,0,2,8,7,0
...,...,...,...,...,...,...,...,...,...,...,...
7506179,10.10.16.1,10.10.15.56,122,dual-spov-0006.spov-msedge.net,30,net,3,3,29,28,0
7506342,10.10.15.56,10.10.16.1,91,my.microsoftpersonalcontent.com,31,com,0,3,30,29,0
7506365,10.10.15.56,10.10.16.1,90,dual-spov-0006.spov-msedge.net,30,net,3,3,29,28,0
7558806,10.10.15.233,224.0.0.251,95,PRO-223T._companion-link._tcp.local,35,local,2,5,34,33,0


In [327]:
df_to_save = real_df.drop(["Length", "length", "tld", "dash_count", "tld_len", "2grams", "3grams"], axis = 1)

In [328]:
df_to_save

Unnamed: 0,Source,Destination,Name,is_dga
835,192.168.200.90,37.221.193.103,grafin.ru,0
836,192.168.200.90,37.221.193.103,grafin.ru,0
837,192.168.200.90,37.221.193.103,grafin.ru,0
840,192.168.200.90,37.221.193.103,grafin.ru,0
841,192.168.200.90,37.221.193.103,grafin.ru,0
...,...,...,...,...
7572877,10.10.15.53,224.0.0.251,w-me-10.local,1
7574410,10.10.15.53,224.0.0.251,w-me-10,1
7605614,fe80::6e30:c1ca:8206:6a95,ff02::fb,_microsoft_mcc._tcp.local,1
7607600,10.10.15.31,224.0.0.251,_microsoft_mcc._tcp.local,1


In [330]:
df_to_save.to_csv("./solution.csv")