In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm_notebook as tqdm
import regex as re
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
import itertools
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
import pdb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from nltk import bigrams

In [2]:
def cleanup(data_df):
    
    # remove nans
    data_df['has_media'].fillna(0, inplace=True)
    
    # remove urls
    data_df['regtext'] = data_df['text'].apply(lambda t: re.sub(r"http.?://[^\s]+[\s]?", '', t))
    
    # remove numbers
    data_df['regtext'] = data_df['regtext'].apply(lambda t: re.sub(r"\s?[0-9]+\.?[0-9]*", '', t))
    
    # remove special characters
    data_df['regtext'] = data_df['regtext'].apply(lambda t: re.sub(r"[^a-zA-Z0-9]+", ' ', t))
    
    # make lowercase
    data_df['regtext'] = data_df['regtext'].str.lower()
    
    # add user_id to text
    data_df['text'] = data_df.apply(lambda row : row['text'] + ' ' + row['usernameTweet'], axis=1)

In [3]:
def tokenStem(data_df):
    
    stemmer = nltk.PorterStemmer()
    tokenizer = nltk.word_tokenize
    stop_words = set(stopwords.words('english'))

    
    #tokenize
    data_df['regtext'] = data_df['regtext'].apply(lambda t: tokenizer(t))
    
    #stem
    data_df['regtext'] = data_df['regtext'].apply(lambda t: [stemmer.stem(word) for word in t if word not in stop_words])

In [4]:
def genExtrafeatures(data_df):    
    
    def count_occurences(character, word_array):
        counter = 0
        for j, word in enumerate(word_array):
            for char in word:
                if char == character:
                    counter += 1
        return counter
    
    # split text
    data_df['split'] = data_df['text'].apply(lambda t: t.split(" "))
    
    # number of $
    data_df['nbr_dol'] = data_df['split'].apply(lambda t: count_occurences('$', t))
    
    # number of :
    data_df['nbr_col'] = data_df['split'].apply(lambda t: count_occurences(':', t))
    
    # number of -
    data_df['nbr_hy'] = data_df['split'].apply(lambda t: count_occurences('-', t))
    
    # number of %
    data_df['nbr_per'] = data_df['split'].apply(lambda t: count_occurences('%', t))
    
    # number of characters
    data_df['nbr_chars'] = data_df['text'].apply(lambda t: len(t))
    
    # number of words in tweet
    data_df['nbr_words'] = data_df['split'].apply(lambda t: len(t))
    
    # number of tokens
    data_df['nbr_tokens'] = data_df['regtext'].apply(lambda t: len(t))
    
    # tok - words
    data_df['wordlesstok'] = data_df['nbr_words'] - data_df['nbr_tokens']
    
    # number of hashtags
    data_df['nbr_tags'] = data_df['split'].apply(lambda t: count_occurences('#', t))
    
    # number of !
    data_df['nbr_ex'] = data_df['split'].apply(lambda t: count_occurences('!', t))
    
    # number of ?
    data_df['nbr_q'] = data_df['split'].apply(lambda t: count_occurences('', t))
    
    # number of mentions
    data_df['nbr_mentions'] = data_df['split'].apply(lambda t: count_occurences('@', t))
    
    # number of urls
    data_df['has_url'] = data_df['text'].str.contains('http')


In [5]:
## Load Data

path = 'Data/Cleaned/xrp_cleaned_date.pkl'
randata_df = pd.read_pickle(path)

path = 'Data/Raw/goodUserCrypto.pkl'
gooddata_df = pd.read_pickle(path)

cleanup(gooddata_df)
cleanup(randata_df)
tokenStem(gooddata_df)
tokenStem(randata_df)
genExtrafeatures(gooddata_df)
genExtrafeatures(randata_df)

In [6]:
## Create Spam List

spamWords = ['ico', 'freetoken', 'token', 'airdrop', 'airdrops', 'rippleprice_', 'bigpumpgroup', 'bounty', 'usd', 'korea price', 'binance', 'current price', \
            'cryptopricexrp', 'cryptobot', 'coinstats', 'coinpricenow', 'cryptogulp', 'ripplebot1h', 'bigdata', 'RippleMarket','CryptoGulp', 'ripplebot_cs', \
            'cryptopricebot','VirtualMoneyBot','coinstats','JustFactsNL', 'free', 'trx', 'XrpTicker', 'aWebAnalysis', 'scotlarock727', '1bitcoinkaclira', \
            'pureinvestments']
spamWordsCon = ("|").join(spamWords)

## Label Spam
randata_df['Spam'] = randata_df['text'].str.contains(spamWordsCon)
gooddata_df['Spam'] = False

In [7]:
randata_df['Spam'].sum()

12014

In [8]:
## Make training list

training_df = gooddata_df
training_df = training_df.append(randata_df[randata_df['Spam'] == True])

In [9]:
training_df.head()

Unnamed: 0,ID,Spam,datetime,has_media,has_url,is_reply,is_retweet,medias,nbr_chars,nbr_col,...,nbr_tokens,nbr_words,regtext,rounded_dateTime,split,text,url,user_id,usernameTweet,wordlesstok
0,8.794407e+17,False,2017-06-26 16:46:13,0.0,False,1.0,0.0,,147,0,...,10,28,"[lost, one, nick, think, term, creation, troub...",NaT,"[Lost, me, on, that, one,, Nick, , \nThink, it...","Lost me on that one, Nick \nThink it's the te...",/PJSanderson/status/879440684624871424,168649848,PJSanderson,18
1,9.830839e+17,False,2018-04-08 16:47:12,0.0,False,1.0,0.0,,215,0,...,12,35,"[freedom, speech, protect, infring, govern, in...",NaT,"[Not, only, that,, but, freedom, of, speech, i...","Not only that, but freedom of speech is only p...",/SeamusWalsh2/status/983083861763473408,343861762,SeamusWalsh2,23
2,9.895482e+17,False,2018-04-26 12:54:16,1.0,True,0.0,0.0,[https://t.co/ra4K4ln6yx],194,1,...,20,28,"[fantast, video, archiv, coinscrum, minicon, w...",NaT,"[A, Fantastic, Video, , From, , The, Archives,...",A Fantastic Video From The Archives - Coinsc...,/coinscrum/status/989548224171167744,1196956998,coinscrum,8
3,9.626423e+17,False,2018-02-11 05:59:37,0.0,True,0.0,0.0,,345,2,...,28,55,"[everi, morn, read, post, elit, trader, list, ...",NaT,"[Every, morning, I, read, through, the, posts,...",Every morning I read through the posts from ...,/whatbitcoindid/status/962642274596335616,893818632089763840,whatbitcoindid,27
4,9.915655e+17,False,2018-05-02 02:30:13,0.0,False,0.0,0.0,,230,0,...,19,39,"[ask, idl, suffer, eat, sand, crow, vultur, ho...",NaT,"[""Ask, them, why, they, idle, there\nWhile, we...","""Ask them why they idle there\nWhile we suffer...",/poetrypotion/status/991565507051577344,62152913,poetrypotion,20


In [10]:
def reg2dict(row):
    out = {}
    
    for w in row['regtext']:
        out[w] = 1
        
#     for b in bigrams(row['regtext']):
#         out[b[0]+b[1]] = 1
        
    out['has_media'] = row['has_media']
    out['has_url'] = row['has_url']
    out['is_reply'] = row['is_reply']
    out['wordlesstok'] = row['wordlesstok']
    out['is_retweet'] = row['is_retweet']
    out['nbr_chars'] = row['nbr_chars']
    out['nbr_ex'] = row['nbr_ex']
    out['nbr_favorite'] = row['nbr_favorite']
    out['nbr_mentions'] = row['nbr_mentions']
    out['nbr_q'] = row['nbr_q']
    out['nbr_reply'] = row['nbr_reply']
    out['nbr_retweet'] = row['nbr_retweet']
    out['nbr_tags'] = row['nbr_tags']
    out['nbr_tokens'] = row['nbr_tokens']
    out['nbr_words'] = row['nbr_words']    
    out['nbr_dol'] = row['nbr_dol']
    out['nbr_per'] = row['nbr_per']
    out['nbr_hy'] = row['nbr_hy']
    out['nbr_col'] = row['nbr_col']
    
    return out

In [11]:
X_data = []

for index, row in training_df.iterrows():
    X_data.append(reg2dict(row))

In [12]:
## dictVectorize

vec = DictVectorizer()

X_data_v = vec.fit_transform(X_data).toarray()

In [13]:
Y_data = training_df['Spam'].tolist()

fig = plt.figure()

labels = ['Spam', 'Not Spam']
i = 0
for v in [True, False]:
    subset = plot_df[plot_df['Pred'] == v]
    sns.distplot(subset['Pval'], label = str(v))
    i += 1
    
plt.legend()
fig.savefig('ExpandedFeatureSpamClassifier')

In [18]:
RFC = RandomForestClassifier(max_depth=7)
RFC.fit(X_data_v, Y_data)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [15]:
## Load Data

path = 'Data/Cleaned/processed_data.pkl'
tweets_df = pd.read_pickle(path)

cleanup(tweets_df)
tokenStem(tweets_df)
genExtrafeatures(tweets_df)

X_tweets = []

for index, row in tweets_df.iterrows():
    X_tweets.append(reg2dict(row))
    
X_tweets_v = vec.transform(X_tweets)

In [19]:
Y_pred = RFC.predict(X_tweets_v)
Pval = RFC.predict_proba(X_tweets_v)[:,0]

In [20]:
sum(Y_pred)

28957

In [21]:
tweets_df['Ypred'] = Y_pred
tweets_df['trustworthiness'] = Pval.tolist()

In [22]:
tweets_df.head()

Unnamed: 0,ID,datetime,is_reply,is_retweet,nbr_favorite,nbr_reply,nbr_retweet,text,url,user_id,...,nbr_words,nbr_tokens,wordlesstok,nbr_tags,nbr_ex,nbr_q,nbr_mentions,has_url,Ypred,trustworthiness
0,9.885302e+17,2018-04-23 17:29:08,0.0,0.0,0.0,0.0,0.0,New post (Could Ripple XRP Really Reach $1...,/ExcelVisibility/status/988530235493748736,815322320,...,27,17,10,0,0,0,0,True,False,0.50669
1,9.866292e+17,2018-04-18 11:35:08,0.0,0.0,0.0,0.0,0.0,Long/Short Bitcoin swings with up to 100x Leve...,/KerriHermanx/status/986629205856223233,956303409065414656,...,95,38,57,0,1,0,0,True,True,0.20111
2,9.847927e+17,2018-04-13 09:57:41,0.0,0.0,0.0,0.0,0.0,"Did you know that you can buy, sell, store, co...",/coindirectcom/status/984792745666580481,915453520924545025,...,53,26,27,2,0,0,1,True,False,0.605244
3,9.8638e+17,2018-04-17 19:05:02,0.0,0.0,0.0,0.0,0.0,1 Ripple = 0.6589 USD. Ripple has changed ...,/RippleMarket/status/986380040517701633,848989770923601922,...,31,14,17,3,0,0,0,True,False,0.546682
4,9.866047e+17,2018-04-18 09:57:42,0.0,0.0,0.0,0.0,0.0,How Is #Ripple Different From All Other #Cryp...,/Matthix191/status/986604686856933376,916641996823031808,...,91,35,56,8,0,0,0,True,True,0.14576


In [23]:
tweets_df.to_pickle('Data/Cleaned/Filtered_tweets_df.pkl')

In [25]:
## View spam some tweets

for i in range(100):
    if Y_pred[i] == True:
        print(tweets_df['text'][i])
        print('')
        print('')

Long/Short Bitcoin swings with up to 100x Leverage at BitMEX!

10% Fee Discount using the link below  

→  http:// bitmex.com/register/m9MNxm   

 $ LSK   $ BCH   $ DGB   $ AION   $ EMC   $ RDD   $ SALT   $ OMG   $ ARK   $ EOS   $ MAID   $ WTC   $ QTUM   $ XRP   $ LTC   $ ELF   $ ZEC   $ BAT pic.twitter.com/IFj1YxJk0O KerriHermanx


How Is #Ripple  Different From All Other #Cryptocurrencies ? An Ultimate Guide  https://www. reddit.com/r/Ripple/comme nts/8d58hr/how_is_ripple_different_from_all_other/   … 
#ripple  #tron   $ trx   $ xrp   $ eth   $ btc   $ xmr   $ ltc  #litecoin   $ etc   $ eos   $ neo   $ xlm   $ ada  #cardano   $ nem   $ iota   $ lsk   $ icx  #cryptonews  #redbux Matthix191


immense Paid group now free for week  https:// goo.gl/PwS8QF   

  $ CGE   $ HEAT   $ SLS  17.23   $ POP   $ POE   $ BYC   $ SNGLS   $ NEX   $ XRP   $ XEM   $ SAFEX   $ PLBT   $ CLAM   $ TKN   $ SC   $ WAVES   $ TKY   $ CLUB   $ NMR   $ NAV   $ NXS  
   5F4bFhK926z3Tsnh4tn3k6Fz HyhhDx1ubatwX52


#

In [26]:
## View good some tweets

for i in range(100):
    if Y_pred[i] == False:
        print(tweets_df['text'][i])
        print('')
        print('')

New post (Could  Ripple   XRP  Really Reach $10.00 In 39 Days?) has been published on  -  https:// masscryptocurrency.com/?p=2664   pic.twitter.com/6Dlog2TNQE ExcelVisibility


Did you know that you can buy, sell, store, convert and transfer @Ripple  #XRP  on  http:// Coindirect.com       in the UK, EU countries, Australia, Kenya, Nigeria and South Africa. You can use fiat currency or convert other coins and exchange them for #Ripple  . coindirectcom


1  Ripple  = 0.6589 USD.  Ripple  has changed by -0.0026 USD in 30 mins. Live price:  https:// is.gd/hI9OcA    #ripple  #xrp  #cryptocurrency RippleMarket


Drinking a Revive by @UplandBrewCo @HopCat - Broad  Ripple  —  http:// untp.beer/s/c585116066   Baileybones32


this man fighting for his durag that’s not even doing its job , i dont see one  ripple   https:// twitter.com/ajplus/status/ 986157817919815680   … heyyitscallie


Walmart And Moneygram Partnership Could Pump  Ripple  ( XRP ) @EtherWorldNews https:// ethereumworldnews.com/w