In [41]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from regularize import regularize_urls, regularize_numbers
from tqdm import tqdm_notebook as tqdm
import regex as re
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
## Load Data
path = 'Data/Cleaned/xrp_cleaned_date.pkl'

data_df = pd.read_pickle(path)

In [3]:
data_df.head()

Unnamed: 0,ID,datetime,is_reply,is_retweet,nbr_favorite,nbr_reply,nbr_retweet,text,url,user_id,usernameTweet,has_media,medias,rounded_dateTime
0,9.885302e+17,2018-04-23 17:29:08,0.0,0.0,0.0,0.0,0.0,New post (Could Ripple XRP Really Reach $1...,/ExcelVisibility/status/988530235493748736,815322320,ExcelVisibility,,,2018-04-23 18:00:00
1,9.847927e+17,2018-04-13 09:57:41,0.0,0.0,0.0,0.0,0.0,"Did you know that you can buy, sell, store, co...",/coindirectcom/status/984792745666580481,915453520924545025,coindirectcom,1.0,[https://t.co/h1CTJHGwPs],2018-04-13 10:00:00
2,9.8638e+17,2018-04-17 19:05:02,0.0,0.0,0.0,0.0,0.0,1 Ripple = 0.6589 USD. Ripple has changed ...,/RippleMarket/status/986380040517701633,848989770923601922,RippleMarket,,,2018-04-17 20:00:00
3,9.85341e+17,2018-04-14 22:16:19,0.0,0.0,0.0,0.0,1.0,# Batteries #ITO #ICO #Token #TokenSale #...,/nsdelpitiya/status/985341014339608576,880003387672088576,nsdelpitiya,,,2018-04-14 23:00:00
4,9.866047e+17,2018-04-18 09:57:42,0.0,0.0,0.0,0.0,0.0,How Is #Ripple Different From All Other #Cryp...,/Matthix191/status/986604686856933376,916641996823031808,Matthix191,1.0,[https://t.co/xF5l49lz8P],2018-04-18 10:00:00


In [4]:
# URLS
data_df['text'] = data_df['text'].apply(lambda t : re.sub(r'^https?:\/\/.*[\r\n]*', 'URL', t))

#Mentions
data_df['text'] = data_df['text'].apply(lambda t : re.sub(r'@[A-Za-z0-9]+','@',t))

# Hashtags
# data_df['text'] = data_df['text'].apply(lambda t : re.sub("#[^a-zA-Z]", "#", t))
data_df['text'] = data_df['text'].apply(lambda t : t.replace("#", " "))

#lowercase
data_df['text'] = data_df['text'].str.lower()

In [5]:
data_df.head()

Unnamed: 0,ID,datetime,is_reply,is_retweet,nbr_favorite,nbr_reply,nbr_retweet,text,url,user_id,usernameTweet,has_media,medias,rounded_dateTime
0,9.885302e+17,2018-04-23 17:29:08,0.0,0.0,0.0,0.0,0.0,new post (could ripple xrp really reach $1...,/ExcelVisibility/status/988530235493748736,815322320,ExcelVisibility,,,2018-04-23 18:00:00
1,9.847927e+17,2018-04-13 09:57:41,0.0,0.0,0.0,0.0,0.0,"did you know that you can buy, sell, store, co...",/coindirectcom/status/984792745666580481,915453520924545025,coindirectcom,1.0,[https://t.co/h1CTJHGwPs],2018-04-13 10:00:00
2,9.8638e+17,2018-04-17 19:05:02,0.0,0.0,0.0,0.0,0.0,1 ripple = 0.6589 usd. ripple has changed ...,/RippleMarket/status/986380040517701633,848989770923601922,RippleMarket,,,2018-04-17 20:00:00
3,9.85341e+17,2018-04-14 22:16:19,0.0,0.0,0.0,0.0,1.0,batteries ito ico token tokensale ...,/nsdelpitiya/status/985341014339608576,880003387672088576,nsdelpitiya,,,2018-04-14 23:00:00
4,9.866047e+17,2018-04-18 09:57:42,0.0,0.0,0.0,0.0,0.0,how is ripple different from all other cryp...,/Matthix191/status/986604686856933376,916641996823031808,Matthix191,1.0,[https://t.co/xF5l49lz8P],2018-04-18 10:00:00


In [6]:
## Find most common words

def top_kwords(data_df, vec, k):
    X = vec.fit_transform(data_df['text'].values)
    labels = vec.get_feature_names()
    
    return pd.DataFrame(columns = labels, data = X.toarray()).sum().sort_values(ascending = False)[:k]

In [7]:
vec = CountVectorizer(decode_error = 'ignore', stop_words = 'english')
X = vec.fit_transform(data_df['text'].values)

In [8]:
top_words = top_kwords(data_df, vec, 40)

top_words

xrp               31442
ripple            29917
com               16686
https             12397
btc               10323
bitcoin            9770
eth                9360
twitter            8240
crypto             8140
price              7120
trx                6543
binance            6398
ethereum           6315
pic                6234
airdrop            5743
blockchain         5624
cryptocurrency     5274
http               4935
tron               4479
ltc                4442
freetoken          4368
usd                3601
krw                3126
litecoin           3125
bounty             3112
token              2954
04                 2920
news               2636
xlm                2621
xvg                2596
2018               2586
neo                2373
eos                2283
ada                2256
altcoin            2157
www                2105
ico                2087
airdrops           2062
bch                1992
pump               1882
dtype: int64

In [9]:
top_words.head()

xrp       31442
ripple    29917
com       16686
https     12397
btc       10323
dtype: int64

In [10]:
data_df[data_df['text'].str.contains('freetoken')].head()

Unnamed: 0,ID,datetime,is_reply,is_retweet,nbr_favorite,nbr_reply,nbr_retweet,text,url,user_id,usernameTweet,has_media,medias,rounded_dateTime
27,9.883367e+17,2018-04-23 04:40:13,0.0,0.0,1.0,0.0,0.0,thugcoin thugday airdrop freetoken b...,/PRASHAN52758622/status/988336731253456896,955332600561786880,PRASHAN52758622,,,2018-04-23 05:00:00
69,9.85552e+17,2018-04-15 12:14:36,0.0,0.0,2.0,1.0,1.0,https: //docs.google.com/forms/d/e/1faipqlsepd...,/Galihpurnama252/status/985551978490032128,973942382679896065,Galihpurnama252,,,2018-04-15 13:00:00
70,9.879789e+17,2018-04-22 04:58:26,0.0,0.0,0.0,0.0,0.0,stockchain (scc) final airdrop 500 scc bonus d...,/gean_star/status/987978925685948416,122683379,gean_star,1.0,[],2018-04-22 05:00:00
76,9.884479e+17,2018-04-23 12:01:49,0.0,0.0,0.0,0.0,0.0,streamycoin (500 sc) airdrop token (250 sc tok...,/cic8ino/status/988447860323741697,2421155851,cic8ino,1.0,[https://t.co/fivOmlvu1l],2018-04-23 13:00:00
93,9.854561e+17,2018-04-15 05:53:28,1.0,0.0,0.0,0.0,0.0,swapchange (500 sc) airdrop token (250 sc toke...,/jooodidz1/status/985456062114533377,977277680218460162,jooodidz1,1.0,[],2018-04-15 06:00:00


In [11]:
## Create Training List

spamWords = ['ico', 'freetoken', 'token', 'airdrop', 'airdrops', 'bigpumpgroup', 'bounty']
spamWordsCon = ("|").join(spamWords)
posSpamWords = ['pump', 'binance']

In [12]:
## Label Spam
data_df['Spam'] = data_df['text'].str.contains(spamWordsCon)

In [13]:
data_df.head(25)

Unnamed: 0,ID,datetime,is_reply,is_retweet,nbr_favorite,nbr_reply,nbr_retweet,text,url,user_id,usernameTweet,has_media,medias,rounded_dateTime,Spam
0,9.885302e+17,2018-04-23 17:29:08,0.0,0.0,0.0,0.0,0.0,new post (could ripple xrp really reach $1...,/ExcelVisibility/status/988530235493748736,815322320,ExcelVisibility,,,2018-04-23 18:00:00,False
1,9.847927e+17,2018-04-13 09:57:41,0.0,0.0,0.0,0.0,0.0,"did you know that you can buy, sell, store, co...",/coindirectcom/status/984792745666580481,915453520924545025,coindirectcom,1.0,[https://t.co/h1CTJHGwPs],2018-04-13 10:00:00,False
2,9.8638e+17,2018-04-17 19:05:02,0.0,0.0,0.0,0.0,0.0,1 ripple = 0.6589 usd. ripple has changed ...,/RippleMarket/status/986380040517701633,848989770923601922,RippleMarket,,,2018-04-17 20:00:00,False
3,9.85341e+17,2018-04-14 22:16:19,0.0,0.0,0.0,0.0,1.0,batteries ito ico token tokensale ...,/nsdelpitiya/status/985341014339608576,880003387672088576,nsdelpitiya,,,2018-04-14 23:00:00,True
4,9.866047e+17,2018-04-18 09:57:42,0.0,0.0,0.0,0.0,0.0,how is ripple different from all other cryp...,/Matthix191/status/986604686856933376,916641996823031808,Matthix191,1.0,[https://t.co/xF5l49lz8P],2018-04-18 10:00:00,False
5,9.852602e+17,2018-04-14 16:55:20,0.0,0.0,0.0,0.0,0.0,walmart and moneygram partnership could pump ...,/vandecrypto/status/985260236238934018,943489871758405632,vandecrypto,1.0,[https://t.co/LxwR2chVOy],2018-04-14 17:00:00,False
6,9.860333e+17,2018-04-16 20:07:13,1.0,0.0,0.0,1.0,0.0,this is not fake news. look at our references....,/sniperstube/status/986033300136316928,921966736303362048,sniperstube,,,2018-04-16 21:00:00,False
7,9.865902e+17,2018-04-18 09:00:00,0.0,0.0,1.0,0.0,0.0,top 5 cryptocurrencies - current prices\n\n ...,/CryptoGulp/status/986590166381887488,945716745192574977,CryptoGulp,,,2018-04-18 10:00:00,False
8,9.873616e+17,2018-04-20 12:05:20,0.0,0.0,1.0,0.0,0.0,ripple price alert. the last ask price for ...,/ripplebot_cs/status/987361585122480128,944577421008875521,ripplebot_cs,,,2018-04-20 13:00:00,False
9,9.851089e+17,2018-04-14 06:54:01,0.0,0.0,1.0,0.0,0.0,xrp &amp; ripple: the world’s financial infra...,/btc_current/status/985108911312093184,952620879443320832,btc_current,1.0,[https://t.co/dKr58pbYhj],2018-04-14 07:00:00,False


In [14]:
# Generate Training Set
training_df = pd.DataFrame(columns = ['text', 'label'])
training_df['text'] = data_df['text']
training_df['label'] = data_df['Spam']

In [15]:
data_df['Spam'][0]

False

In [32]:
## preprocess data

def preProcess(X, Y, method = 'BBoW'):
    
    print('Pre-processing files...')
    
    # create lemmatizer
    wnl = WordNetLemmatizer()
    
    # create stopWords
    stopWords = set(stopwords.words('english'))
    
    # initialize output variables
    vocab = {'1'}
    vocab.remove('1')
    data_x = []
    data_y = []
    N = 0 #number of documents
    doc_count = {}
    
    # for each file..
    for i in range(len(X)):
        
        label = Y[i]
        data = X[i]
            
        hold = re.compile(r'\b\w{1,}\b', re.IGNORECASE)
        words = hold.findall(data)
        # tokenize words with punctuation that would not be found with the above regex search
        hold2 = re.compile(r'\b\w{1,}[.!?]', re.IGNORECASE)
        words2 = hold2.findall(data)
        words.extend(words2)
                
        ## treat data
        doc_dict = {}
        N = N + 1
        for word in words:
            
            # lemmatize
            word = wnl.lemmatize(word)
                
            if word not in stopWords:
                vocab.add(word)

                if method == 'BBoW':
                    doc_dict[word] = 1

                if (method == 'CCoW') or (method == 'TF-IDF'):
                    if word in doc_dict:
                        doc_dict[word] = doc_dict[word] + 1
                    else:
                        doc_dict[word] = 1
                        if method == 'TF-IDF':
                            if word in doc_count:
                                doc_count[word] = doc_count[word] + 1
                            else:
                                doc_count[word] = 1
            
        #if doc_dict:
        data_x.append(doc_dict)
        data_y.append(label)
    
    # generate idfs for each wordss
    if method == 'TF-IDF':
        for i in range(len(data_x)):
            for key in data_x[i]:
                idf = np.log10(N / doc_count[key])
                data_x[i][key] = data_x[i][key] * idf
            
    print('Done')
    return data_x, data_y, vocab

In [17]:
def genPofY(labels):
    # count the number of labels present to generate PofY distribution
    
    PofY = {}
    N = len(labels)
    for label in labels:
        if label in PofY:
            PofY[label] = PofY[label] + 1
        else:
            PofY[label] = 1
            
    for key in PofY:
        PofY[key] = PofY[key]/N
    
    return PofY

In [18]:
def train(label_list, data_x, data_y, k, vocab):
    # train model

    prob_dist = {}
    denom = {}
    
    # initialize prob_dist & denom for each label
    for label in label_list:
        prob_dist[label] = {}
        denom[label] = 0
     
    # create running sum of words present in each doc per label
    for doc in range(len(data_x)):
        for word in data_x[doc]:
            label = data_y[doc]
            val = data_x[doc][word]
            if word in prob_dist[label]:
                prob_dist[label][word] += val + k
            else:
                prob_dist[label][word] = val + k
            denom[label] += val
    
    # divide each word in each label by the number of words present
    for label in prob_dist:
        denom[label] += len(vocab)*k
        for word in vocab:
            if word in prob_dist[label]:
                prob_dist[label][word] = np.log10(prob_dist[label][word] / denom[label])
            else:
                # if word in vocab is not present in the label use laplace smoothing to create an arbitrarily low 
                # probability
                prob_dist[label][word] = np.log10(k / denom[label])
        
    return prob_dist

In [19]:
def test(data_x_test, data_y_test, label_list, prob_dist):
    # test predictions against actual labels and return accuracy
    
    correct = 0.0;
    cnt = 0
    for doc in range(len(data_x_test)):
        label_prob = []
        for label in label_list:
            hold = 0
            for word in data_x_test[doc]:
                if word in prob_dist[label]:
                    hold += prob_dist[label][word]
            hold += np.log10(PofY_train[label])
            label_prob.append(hold) 
        label_guess = label_list[np.argmax(label_prob)]
        if label_guess == data_y_test[doc]:
            correct += 1.0
        cnt += 1
        
    acc = correct / cnt
    return acc

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(np.asarray(data_df['text']), np.asarray(data_df['Spam']), test_size = 0.2)

In [21]:
X_train[1]

'because we are already mainstreem !  the year when financial institutions comes in ! 2018 is  xrp  year ! mark my world ! we all know that ! is evident that   xrp  as is fast ,secure big network  ripplenet  over 100+ banks , @ ! + @  @  so on ...'

In [22]:
## Verify Naive Bayes

# pre-process data with each word representation scheme, train, and test 
data_x_train, data_y_train, vocab_train = preProcess(X_train, Y_train, method = 'TF-IDF')

data_x_test, data_y_test, vocab_test = preProcess(X_test, Y_test, method = 'TF-IDF')

# generate P of Y
PofY_train = genPofY(data_y_train)

# make label list
label_list = []
for label in PofY_train:
    label_list.append(label)

# laplace smoothing constant
k = .1

# train 
prob_dist = train(label_list, data_x_train, data_y_train, k, vocab_train)

# Test Accuracy
test_acc = test(data_x_test, data_y_test, label_list, prob_dist)

# Train Accuracy
train_acc = test(data_x_train, data_y_train, label_list, prob_dist)
    
print('Test Accuracy: ')
print(test_acc)
print('Train Accuracy: ')
print(train_acc)

Pre-processing files...
Done
Pre-processing files...
Done
Test Accuracy: 
0.9620601533320228
Train Accuracy: 
0.981274880817811


In [23]:
# add probability that a tweet is spam

def predict(model, UX):
    # predict UY for a given model and UX

    # Prediction confidences
    PY = []
    UY = []
    
    # for each doc generate label probabilites and select the label with the highest probability
    for doc in range(len(UX)):
        label_prob = []
        for label in label_list:
            hold = 0
            for word in UX[doc]:
                if word in model[label]:
                    hold += model[label][word]
            hold += np.log10(PofY_train[label])
            label_prob.append(hold) 
            
        # append Confidence and prediction sets
        PY.append(np.max(label_prob))
        UY.append(label_list[np.argmax(label_prob)])
        
    return UY, PY

In [34]:
len(data_df['Spam'])

25435

In [35]:
len(data_df['text'])

25435

In [37]:
len(data_x)

25435

In [33]:
# pre-process data with each word representation scheme, train, and test 
data_x, data_y, vocab = preProcess(np.asarray(data_df['text']), np.asarray(data_df['Spam']), method = 'TF-IDF')

# generate P of Y
PofY_train = genPofY(data_y)

# make label list
label_list = []
for label in PofY_train:
    label_list.append(label)

# laplace smoothing constant
k = .1

# train 
prob_dist = train(label_list, data_x, data_y, k, vocab)

UY, PY = predict(prob_dist, data_x)

Pre-processing files...
Done


In [29]:
len(UY)

25434

In [27]:
data_df['Spam'].head()

0    False
1    False
2    False
3     True
4    False
Name: Spam, dtype: bool

In [38]:
## View false positives
c = 0;
for i in range(len(data_df['Spam'])):
    if (UY[i] == True) and (data_df['Spam'][i] == False):
        c += 1
        # print(data_df['text'][i])
        # print('/n')
        
c

338

In [40]:
print(classification_report(data_df['Spam'], UY))

             precision    recall  f1-score   support

      False       0.99      0.98      0.99     20193
       True       0.94      0.97      0.95      5242

avg / total       0.98      0.98      0.98     25435



In [42]:
confusion_matrix(data_df['Spam'], UY)

array([[19855,   338],
       [  147,  5095]], dtype=int64)