In [1]:
import pandas as pd

In [2]:
pd.reset_option('display.max_rows')

# TLDs dropped after a manual review of the EDA clusters ("expensive" TLDs).
EXCLUDED_TLDS = ["com", "pro", "net", "org", "de", "eu", "us", "app", "fr", "au", "nl"]

# The input file has no header row: every line is one domain.
df = pd.read_csv("./input_data/cert_pl_domains.txt", names=["domain"])
# The TLD is whatever follows the last dot of the domain.
df['tld'] = df['domain'].str.split('.').str[-1]

# Remove all excluded TLDs in a single pass instead of one filter per TLD.
df = df[~df["tld"].isin(EXCLUDED_TLDS)]

# Keep a domain when it either
#   1. contains at least one dash, or
#   2. has more than one dot AND is at least 6 characters long.
# `&` binds tighter than `|`, so the parentheses below only spell out the
# grouping the expression always had.
# NOTE(review): an earlier comment described dropping every domain that lacks
# a dash, has at most one dot, or is shorter than 6 characters (i.e. requiring
# all three conditions). The code has never done that - confirm which rule is
# intended before changing the mask, since the row counts below depend on it.
has_dash = df["domain"].str.contains("-", na=False)
has_multiple_dots = df["domain"].str.count(r"\.") > 1
is_long_enough = df["domain"].str.len() >= 6
df = df[has_dash | (has_multiple_dots & is_long_enough)]

df

Unnamed: 0,domain,tld
0,0-l.x-tut.space,space
3,007d761bc3.nxcli.io,io
8,00e56.subskrypcje.homes,homes
10,015298978-xzczx.sbs,sbs
12,025c4.zaplata.hair,hair
...,...,...
146393,zv2s3rg8m1bdkxw.wsparcie.sbs,sbs
146404,zwrot-podatku.dedyn.io,io
146420,zxjmogav.oirpltv.pl,pl
146434,zyciowa-oaza.pl,pl


In [3]:
# Number of remaining domains per TLD, most frequent first.
# Only the reset is needed here: it restores the default row limit, so the
# listing below is abbreviated instead of printed in full.
pd.reset_option('display.max_rows')
# head(2469) caps how many TLDs are returned; the recorded output lists 190
# distinct TLDs, so the cap does not cut anything off for this dataset.
df['tld'].value_counts().head(2469)

tld
icu         10810
cfd          6808
sbs          5357
shop         4043
top          1902
            ...  
im              1
tokyo           1
delivery        1
nz              1
goog            1
Name: count, Length: 190, dtype: int64

In [4]:
# Look at the domains registered under ".top" only.
# NOTE(review): the frame is named `df_com` although it holds ".top" rows;
# the name is kept so anything else referring to it keeps working.
is_top_domain = df["domain"].str.endswith(".top", na=False)
df_com = df.loc[is_top_domain]
df_com

Unnamed: 0,domain,tld
26,06464fea-a94a-4e1f-b0fa-9c091e041315.gxqu.top,top
59,0lx.zam0wlenle73737.top,top
83,0xlo.sn9ast.top,top
201,18727.newtonschools.top,top
203,18996.newtonschools.top,top
...,...,...
145136,zbadac-ln.top,top
145701,zh.gxqu.top,top
145938,zmagania-2025.top,top
146273,zqxjpo6s7t.invias.top,top


In [5]:
# The helper `tld` column is not part of the exported data, so drop it.
# errors="ignore" keeps this cell re-runnable: `df` is overwritten in place,
# so on a second run the column is already gone and a plain drop would raise
# KeyError.
df = df.drop(columns=["tld"], errors="ignore")
df

Unnamed: 0,domain
0,0-l.x-tut.space
3,007d761bc3.nxcli.io
8,00e56.subskrypcje.homes
10,015298978-xzczx.sbs
12,025c4.zaplata.hair
...,...
146393,zv2s3rg8m1bdkxw.wsparcie.sbs
146404,zwrot-podatku.dedyn.io
146420,zxjmogav.oirpltv.pl
146434,zyciowa-oaza.pl


In [6]:
# Shuffle the rows: sampling 100% of the frame returns every row in a random
# order, and the fixed random_state makes that order repeatable.
# Note that `df` is reassigned from itself, so running this cell again
# shuffles the already shuffled frame.
shuffled = df.sample(frac=1, random_state=42)
# Replace the old index labels with a fresh 0..n-1 range.
df = shuffled.reset_index(drop=True)
df

Unnamed: 0,domain
0,alebilet.pl-oferta9381922.cfd
1,stolica-skokow.pl
2,web.telesafe.club
3,pl-279104.icu
4,puvemu-nubaho.pics
...,...
39214,allegro.oferty0292172.sbs
39215,allegrolokalnie.ioferty-0150292.sbs
39216,www.lightweighthotoptimizedsafe.autos
39217,alebilet.oferta-2347012.icu


In [15]:
# Hold out the tail of the shuffled frame for later testing.
TRAIN_FRACTION = 0.9

# Position of the first held-out row (fractional part truncated by int()).
n = int(len(df) * TRAIN_FRACTION)
# First 90% of the rows -> training set, remaining 10% -> test set.
df_train, df_test = df.iloc[:n], df.iloc[n:]


In [16]:
# Write both splits to CSV without the index column.
for frame, path in (
    (df_train, "./output_data/filtered_mal_domains.csv"),
    (df_test, "./output_data/new_mal_domains.csv"),
):
    frame.to_csv(path, index=False)
