In [1]:
# Widen the notebook's `.container` element to 95% of the browser width.
# `IPython.display` is the public import location for these helpers; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML, Image
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
from collections import Counter
import json
from itertools import chain
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer, TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases
from itertools import chain
from gensim.corpora import Dictionary
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

In [5]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as SKLEARN_STOPWORDS
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
from nltk.corpus import stopwords

# Copy the scikit-learn stopwords into a plain `set` so it can drive the union below.
SKLEARN_STOPWORDS = set(SKLEARN_STOPWORDS)

# Make sure the NLTK stopword corpus is available locally before reading it.
nltk.download('stopwords')
NLTK_STOPWORDS = set(stopwords.words('english'))

# Report the size of each individual source.
print(f'Loaded {len(NLTK_STOPWORDS)} stopwords from NLTK')
print(f'Loaded {len(SPACY_STOPWORDS)} stopwords from SPACY')
print(f'Loaded {len(SKLEARN_STOPWORDS)} stopwords from SKLEARN')

# Merge the three sources; words shared between sources collapse in the union.
# The result is exposed as a list (element order is whatever the set yields).
combined_stopwords = SKLEARN_STOPWORDS.union(SPACY_STOPWORDS, NLTK_STOPWORDS)
stop_words = list(combined_stopwords)
print('----------------------------------')
print(f'{len(stop_words)} combined stopwords')

Loaded 179 stopwords from NLTK
Loaded 326 stopwords from SPACY
Loaded 318 stopwords from SKLEARN
----------------------------------
409 combined stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
%%time
# Load the base dataset from parquet; the tokenisation cell below reads its
# 'body' column. `%%time` reports how long the load takes.
df = pd.read_parquet('../data/base.parquet')

CPU times: user 11.6 s, sys: 3.44 s, total: 15.1 s
Wall time: 10.8 s


In [7]:
# NLTK TweetTokenizer with default settings; used below to split each
# lower-cased document body into tokens.
tokenizer = TweetTokenizer()

In [8]:
%%time
corpus = df['body'].values
corpus = [doc.lower() for doc in corpus]
corpus = [tokenizer.tokenize(doc) for doc in corpus]
corpus = [[token for token in doc if (not token.isnumeric() and len(token) > 1)] for doc in corpus]
corpus= [[token for token in doc if (not token in stop_words)] for doc in corpus]
bigram = Phrases(corpus, min_count=1)
trigram = Phrases(bigram[corpus], min_count=1)
fourgram = Phrases(trigram[corpus], min_count=1)
bigrams = []
trigrams = []
fourgrams = []
for doc in corpus:
    b = [b for b in bigram[doc] if b.count('_') == 1]
    t = [t for t in trigram[bigram[doc]] if t.count('_') == 2]
    f = [f for f in fourgram[trigram[bigram[doc]]] if f.count('_') == 3]
    bigrams.extend(b)
    trigrams.extend(t)
    fourgrams.extend(f)

CPU times: user 1h 56min 2s, sys: 14.7 s, total: 1h 56min 16s
Wall time: 1h 56min 17s


In [9]:
# Tally how often each collected phrase occurs, one Counter per n-gram order.
bigram_counter, trigram_counter, fourgram_counter = map(
    Counter, (bigrams, trigrams, fourgrams)
)

In [18]:
# Display the 20 most frequent bigrams as (phrase, count) pairs.
bigram_counter.most_common(20)

[('fullz_info', 672389),
 ('visa_master', 517715),
 ('western_union', 420247),
 ('gold_platinum', 314109),
 ('bin_dob', 276858),
 ('visa_classic', 229860),
 ('amex_dis', 190681),
 ('amex_discover', 184850),
 ('credit_card', 169005),
 ('balance_gbp', 153908),
 ('bank_login', 146700),
 ('corporate_signature', 143236),
 ('paypal_veritified', 139010),
 ('mtcn_sender', 125489),
 ('track_pin', 107227),
 ('contact_yahoo', 104594),
 ('accept_payment', 99325),
 ('ship_laptop', 94285),
 ('ship_iphone', 93259),
 ('acc_paypal', 90141)]

In [17]:
# Display the 20 most frequent trigrams as (phrase, count) pairs.
trigram_counter.most_common(20)

[('sell_paypal_veritified', 126008),
 ('bin_dob_fullz', 116384),
 ('visa_gold_platinum', 110676),
 ('visa_classic_visa', 77455),
 ('sell_paypal_pass', 74596),
 ('gold_platinum_business', 72967),
 ('corporate_signature_business', 65393),
 ('cc_fresh_hight', 53526),
 ('good_customers_long-term', 52721),
 ('http://abraxasdegupusel.onion/register/ot9av77ebg_http://abraxasdegupusel.onion/register/ot9av77ebg_http://abraxasdegupusel.onion/register/ot9av77ebg',
  45901),
 ('information_western_union', 45222),
 ('good_dont_work', 42105),
 ('business_mastercard_gold', 40441),
 ('balance_software_bug', 38891),
 ('info_payment_fee', 37756),
 ('dumps_track_1/2', 37707),
 ('balance_sample_track', 37200),
 ('usa_visa_classic', 36596),
 ('buy_spam_hate', 36201),
 ('accept_payment_money', 33665)]

In [16]:
# Display the 20 most frequent fourgrams as (phrase, count) pairs.
fourgram_counter.most_common(20)

[('___', 164669),
 ('visa_master_amex_discover', 149491),
 ('gold_platinum_corporate_signature', 77836),
 ('bin_dob_fullz_info', 71859),
 ('visa_master_amex_dis', 70066),
 ('balance_gbp_balance_gbp', 63399),
 ('mtcn_sender_country_sender', 53070),
 ('dumps_tracks_dumps_tracks', 46240),
 ('referral_link_pleasure_http://abraxasdegupusel.onion/register/ot9av77ebg',
  46052),
 ('signature_purchase_corporate_world', 40434),
 ('spam_ripper_want_spam', 35456),
 ('countries_mastercard_visa_classic', 33879),
 ('visa_classic_mastercard_standart', 33148),
 ('asia_australia_exotic_mastercard', 32890),
 ('random_bin_vbv_dob', 30156),
 ('1acc_selling_acc_paypal', 30084),
 ('mastercard_gold_platinum_american', 30070),
 ('express_sid_discover_canada', 29873),
 ('thread_hope_better_customer', 28320),
 ('ireland_new_zeland_switherland', 27657)]

In [12]:
# Persist the bigram frequencies as a JSON object mapping phrase -> count.
with open('../data/bigrams.json', 'w') as bigram_file:
    bigram_counts = dict(bigram_counter)
    json.dump(obj=bigram_counts, fp=bigram_file)

In [13]:
# Persist the trigram frequencies as a JSON object mapping phrase -> count.
with open('../data/trigrams.json', 'w') as trigram_file:
    trigram_counts = dict(trigram_counter)
    json.dump(obj=trigram_counts, fp=trigram_file)

In [14]:
# Persist the fourgram frequencies as a JSON object mapping phrase -> count.
with open('../data/fourgrams.json', 'w') as fourgram_file:
    fourgram_counts = dict(fourgram_counter)
    json.dump(obj=fourgram_counts, fp=fourgram_file)