In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm_notebook as tqdm
import regex as re
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
import itertools
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
import pdb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

In [25]:
def cleanup(data_df):
    
    # remove nans
    data_df['has_media'].fillna(0, inplace=True)
    
    # remove urls
    data_df['regtext'] = data_df['text'].apply(lambda t: re.sub(r"http.?://[^\s]+[\s]?", '', t))
    
    # remove numbers
    data_df['regtext'] = data_df['regtext'].apply(lambda t: re.sub(r"\s?[0-9]+\.?[0-9]*", '', t))
    
    # remove special characters
    data_df['regtext'] = data_df['regtext'].apply(lambda t: re.sub(r"[^a-zA-Z0-9]+", ' ', t))
    
    # make lowercase
    data_df['regtext'] = data_df['regtext'].str.lower()
    
    # add user_id to text
    data_df['text'] = data_df.apply(lambda row : row['text'] + ' ' + row['usernameTweet'], axis=1)

In [6]:
def tokenStem(data_df):
    
    stemmer = nltk.PorterStemmer()
    tokenizer = nltk.word_tokenize
    stop_words = set(stopwords.words('english'))

    
    #tokenize
    data_df['regtext'] = data_df['regtext'].apply(lambda t: tokenizer(t))
    
    #stem
    data_df['regtext'] = data_df['regtext'].apply(lambda t: [stemmer.stem(word) for word in t if word not in stop_words])

In [7]:
def genExtrafeatures(data_df):    
    
    def count_occurences(character, word_array):
        counter = 0
        for j, word in enumerate(word_array):
            for char in word:
                if char == character:
                    counter += 1
        return counter
    
    # split text
    data_df['split'] = data_df['text'].apply(lambda t: t.split(" "))
    
    # number of characters
    data_df['nbr_chars'] = data_df['text'].apply(lambda t: len(t))
    
    # number of words in tweet
    data_df['nbr_words'] = data_df['split'].apply(lambda t: len(t))
    
    # number of tokens
    data_df['nbr_tokens'] = data_df['regtext'].apply(lambda t: len(t))
    
    # tok - words
    data_df['wordlesstok'] = data_df['nbr_words'] - data_df['nbr_tokens']
    
    # number of hashtags
    data_df['nbr_tags'] = data_df['split'].apply(lambda t: count_occurences('#', t))
    
    # number of !
    data_df['nbr_ex'] = data_df['split'].apply(lambda t: count_occurences('!', t))
    
    # number of ?
    data_df['nbr_q'] = data_df['split'].apply(lambda t: count_occurences('', t))
    
    # number of mentions
    data_df['nbr_mentions'] = data_df['split'].apply(lambda t: count_occurences('@', t))
    
    # number of urls
    data_df['has_url'] = data_df['text'].str.contains('http')
    


In [26]:
## Load Data

path = 'Data/Cleaned/xrp_cleaned_date.pkl'
randata_df = pd.read_pickle(path)

path = 'Data/Raw/goodUserCrypto.pkl'
gooddata_df = pd.read_pickle(path)

cleanup(gooddata_df)
cleanup(randata_df)
tokenStem(gooddata_df)
tokenStem(randata_df)
genExtrafeatures(gooddata_df)
genExtrafeatures(randata_df)

In [27]:
## Create Spam List

spamWords = ['ico', 'freetoken', 'token', 'airdrop', 'airdrops', 'rippleprice_', 'bigpumpgroup', 'bounty', 'usd', 'korea price', 'binance', 'current price', \
            'cryptopricexrp', 'cryptobot', 'coinstats', 'coinpricenow', 'cryptogulp', 'ripplebot1h', 'bigdata', 'RippleMarket','CryptoGulp', 'ripplebot_cs', \
            'cryptopricebot','VirtualMoneyBot','coinstats','JustFactsNL', 'free', 'trx', 'XrpTicker', 'aWebAnalysis', 'scotlarock727', '1bitcoinkaclira', \
            'pureinvestments']
spamWordsCon = ("|").join(spamWords)

## Label Spam
randata_df['Spam'] = randata_df['text'].str.contains(spamWordsCon)
gooddata_df['Spam'] = False

In [28]:
randata_df['Spam'].sum()

12014

In [29]:
## Make training list

training_df = gooddata_df
training_df = training_df.append(randata_df[randata_df['Spam'] == True])

In [30]:
training_df.head()

Unnamed: 0,ID,Spam,datetime,has_media,has_url,is_reply,is_retweet,medias,nbr_chars,nbr_ex,...,nbr_tokens,nbr_words,regtext,rounded_dateTime,split,text,url,user_id,usernameTweet,wordlesstok
0,8.794407e+17,False,2017-06-26 16:46:13,0.0,False,1.0,0.0,,147,0,...,10,28,"[lost, one, nick, think, term, creation, troub...",NaT,"[Lost, me, on, that, one,, Nick, , \nThink, it...","Lost me on that one, Nick \nThink it's the te...",/PJSanderson/status/879440684624871424,168649848,PJSanderson,18
1,9.830839e+17,False,2018-04-08 16:47:12,0.0,False,1.0,0.0,,215,0,...,12,35,"[freedom, speech, protect, infring, govern, in...",NaT,"[Not, only, that,, but, freedom, of, speech, i...","Not only that, but freedom of speech is only p...",/SeamusWalsh2/status/983083861763473408,343861762,SeamusWalsh2,23
2,9.895482e+17,False,2018-04-26 12:54:16,1.0,True,0.0,0.0,[https://t.co/ra4K4ln6yx],194,0,...,20,28,"[fantast, video, archiv, coinscrum, minicon, w...",NaT,"[A, Fantastic, Video, , From, , The, Archives,...",A Fantastic Video From The Archives - Coinsc...,/coinscrum/status/989548224171167744,1196956998,coinscrum,8
3,9.626423e+17,False,2018-02-11 05:59:37,0.0,True,0.0,0.0,,345,0,...,28,55,"[everi, morn, read, post, elit, trader, list, ...",NaT,"[Every, morning, I, read, through, the, posts,...",Every morning I read through the posts from ...,/whatbitcoindid/status/962642274596335616,893818632089763840,whatbitcoindid,27
4,9.915655e+17,False,2018-05-02 02:30:13,0.0,False,0.0,0.0,,230,0,...,19,39,"[ask, idl, suffer, eat, sand, crow, vultur, ho...",NaT,"[""Ask, them, why, they, idle, there\nWhile, we...","""Ask them why they idle there\nWhile we suffer...",/poetrypotion/status/991565507051577344,62152913,poetrypotion,20


In [14]:
def reg2dict(row):
    out = {}
    
    for w in row['regtext']:
        out[w] = 1
        
    out['has_media'] = row['has_media']
    out['has_url'] = row['has_url']
    out['is_reply'] = row['is_reply']
    out['wordlesstok'] = row['wordlesstok']
    out['is_retweet'] = row['is_retweet']
    out['nbr_chars'] = row['nbr_chars']
    out['nbr_ex'] = row['nbr_ex']
    out['nbr_favorite'] = row['nbr_favorite']
    out['nbr_mentions'] = row['nbr_mentions']
    out['nbr_q'] = row['nbr_q']
    out['nbr_reply'] = row['nbr_reply']
    out['nbr_retweet'] = row['nbr_retweet']
    out['nbr_tags'] = row['nbr_tags']
    out['nbr_tokens'] = row['nbr_tokens']
    out['nbr_words'] = row['nbr_words']    
    
    return out

In [31]:
X_data = []

for index, row in training_df.iterrows():
    X_data.append(reg2dict(row))

In [32]:
## dictVectorize

vec = DictVectorizer()

X_data_v = vec.fit_transform(X_data).toarray()

In [33]:
Y_data = training_df['Spam'].tolist()

fig = plt.figure()

labels = ['Spam', 'Not Spam']
i = 0
for v in [True, False]:
    subset = plot_df[plot_df['Pred'] == v]
    sns.distplot(subset['Pval'], label = str(v))
    i += 1
    
plt.legend()
fig.savefig('ExpandedFeatureSpamClassifier')

In [34]:
RFC = RandomForestClassifier(max_depth=7)
RFC.fit(X_data_v, Y_data)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [37]:
## Load Data

path = 'goodUserCrypto.pkl'
tweets_df = pd.read_pickle(path)

cleanup(tweets_df)
tokenStem(tweets_df)
genExtrafeatures(tweets_df)

X_tweets = []

for index, row in tweets_df.iterrows():
    X_tweets.append(reg2dict(row))
    
X_tweets_v = vec.transform(X_tweets)

In [38]:
Y_pred = RFC.predict(X_tweets_v)
Pval = RFC.predict_log_proba(X_tweets_v)

In [48]:
tweets_df['Ypred'] = Y_pred
tweets_df['Pval'] = Pval.tolist()

In [54]:
tweets_df.to_pickle('Filtered_tweets_df')

In [56]:
## View some tweets

for i in range(1000):
    if Y_pred[i] == True:
        print(tweets_df['text'][i])

immese Pid group ow free for week  hps: goo.glPwS8QF   

  $ CGE   $ HEAT   $ SLS  17.23   $ POP   $ POE   $ BYC   $ SNGLS   $ NEX   $ XRP   $ XEM   $ SAFEX   $ PLBT   $ CLAM   $ TKN   $ SC   $ WAVES   $ TKY   $ CLUB   $ NMR   $ NAV   $ NXS  
   5F4bFhK926z3Tsh43k6Fz HyhhDx1ubatwX52
«1 club». Now we opeed our doors for 1 people   hps: goo.glLTMrN   

  $ NAV   $ SAFEX   $ XRP   $ POE   $ CLAM   $ HEAT  17.23   $ CGE   $ SLS   $ BYC   $ TKN   $ SC   $ CLUB   $ NEX   $ SNGLS   $ TKY   $ POP   $ NXS   $ WAVES   $ PLBT   $ XEM   $ NMR  
   kQSsDzZrFd8Ei22y7SZB w26WcNEE9kRF80B
Crypos s of 9.53m GMT+1:

(Bifie)

#Bicoi  (BTCUSD) +3.74%
#Ehereum  (ETHUSD) +6.96%
#Liecoi  (LTCUSD) +5.7%
#Moero  (XMRUSD) +3.29%
#Ripple  ( XRP USD) +4.2%

#Crypocurrecies IC_Markets
Lookig for hoes d profible rdig clls, recommedios d dvice, joi:

 hp: .meCrypoHesig ls   … 

 $ BTC   $ ETH   $ ETC   $ BCH   $ LTC   $ XRP   $ DASH   $ XLM   $ XMR   $ ZEC   $ ADA   $ SYS   $ NEO   $ LUX   $ OMG   $ POWR   $ VTC   $ 

 $ BTC   $ NPXS   $ AUC   $ EVE   $ MAN   $ TFD   $ TOMO   $ SHIP   $ TAU   $ BAX   $ BEE   $ AION   $ VEE   $ AURA   $ TRAC   $ FSN   $ TRX   $ EOS   $ ETH   $ XVG   $ STORM   $ ADA   $ BCC   $ XLM   $ ICX   $ NEO   $ IOTA   $ SEN   $ XDCE   $ XRP   $ XML   $ HPB   $ ARDR   $ SYS fornit4
Lookig for hoes d profible rdig clls, recommedios d dvice, joi:

 hp: .meCrypoHesig ls   … 

 $ BTC   $ ETH   $ ETC   $ BCH   $ LTC   $ XRP   $ DASH   $ XLM   $ XMR   $ ZEC   $ ADA   $ SYS   $ NEO   $ LUX   $ OMG   $ POWR   $ VTC   $ XEM   $ LSK   $ DGB   $ DOGE   $ XVG   $ VEN   $ ICX   $ ZCL   $ DRGN   $ WTC   $ IC   $ TRX   $ QTUM   $ LSK   $ ZRX  65331 ChrisJordanTwit
Lookig for hoes d profible rdig clls, recommedios d dvice, joi:

 hp: .meCrypoHesig ls   … 

 $ BTC   $ ETH   $ ETC   $ BCH   $ LTC   $ XRP   $ DASH   $ XLM   $ XMR   $ ZEC   $ ADA   $ SYS   $ NEO   $ LUX   $ OMG   $ POWR   $ VTC   $ XEM   $ LSK   $ DGB   $ DOGE   $ XVG   $ VEN   $ ICX   $ ZCL   $ DRGN   $ WTC   $ IC   $ TRX   $ QTUM

 $ UGAZ   $ UWT   $ SVXY   $ USLV   $ JNUG   $ LABU   $ SOXL   $ MJ   $ BLOK   $ YINN   $ BRZU   $ VIX   $ XIV   $ F   $ GE   $ SQQQ   $ FB   $ X   $ C   $ DGAZ   $ BA   $ XBI   $ DWT   $ UVXY   $ SPY   $ JDST   $ LABD   $ BTC   $ TSLA   $ AAPL   $ ETH   $ LTC   $ XRP   $ GBTC  #HODL  #BTFD   $ TVIX   $ GLD   $ GDX   $ UNG   $ USO   $ VXX   $ IBB   $ TLT   $ SPX pic.wier.com44rJChbqT mark_hruska
73  Hi guys! Look  my sigl cel!
  hp: .mecryposigl   
 $ MAN   $ SC   $ POE   $ GUP   $ DNT   $ WAX   $ BTC   $ OCT   $ RCN   $ BNB   $ VTC   $ VIBE   $ ARDR   $ ZCL   $ ETH   $ ETC   $ BCH   $ LTC   $ XRP   $ DASH   $ BTG   $ XLM   $ XMR   $ ZEC   $ ADA   $ NEO   $ LUX   $ MED   $ POWR   $ LUX   $ XEM   $ LSK   $ DGB   $ DOGE   $ XVG   $ GRS   $ SC StepBeen
1 sizble coi which will give you 5% check here —>  hps: goo.gl181L68   

  $ NXS  17.23   $ SC   $ NEX   $ XRP   $ SLS   $ CLUB   $ NAV   $ POE   $ CLAM   $ NMR   $ WAVES   $ XEM   $ SAFEX   $ HEAT   $ POP   $ TKY   $ CGE   $ SNGLS   $ TKN 

   ADFZG2Qi4Tf325yfiy69frr lpww0kjEyUWs05M
There would eed o be $1M USD equivle vlue peso sell o he XRP-Peso sell order o he books of he echge.  If o he he purchse would drive up he vlue of he peso relive o  XRP .  I m jus ryig o use logic. infcabuffalo
Free chrome eesio o hide wees h over-shill crypo csh gs & promoe groups
  hp:www. shillkill.com   

No more csh g spm like show below:

 $ BTC   $ ETH   $ XRP   $ BCH   $ EOS   $ LTC   $ ADA   $ XLM   $ MIOTA   $ NEO   $ XMR   $ DASH   $ TRX   $ XEM   $ USDT  

711 pic.wier.comfZmSTmUo shillkill_fan
Upcoi Echge will go live Mody! mke sure o grb your erly ivie, ge up o $5 i rdig fees d ge redy o sr rdig!!
 hp:www. Upcoi.Rocks   

 $ BTC   $ ETH   $ XRP   $ BCH   $ LTC   $ NEO   $ XLM   $ ADA   $ XMR   $ EOS   $ DASH   $ IOTA   $ NEM   $ TRX   $ ETC   $ VEN   $ LISK   $ QTUM   $ BTG   $ NANO   $ OMG   $ ZEC pic.wier.comFOO8dwv ClevelandCrypto
Lookig for hoes d profible rdig clls, recommedios d dvice, joi:

 hp: .meCrypoHesig ls   … 

 $ B

 $ BTC   $ ETH   $ ETC   $ BCH   $ LTC   $ XRP   $ DASH   $ BTG   $ XLM   $ XMR   $ ZEC   $ ADA   $ SNT   $ NEO   $ NXT   $ OMG   $ POWR   $ VTC   $ VOX   $ XEM   $ LSK   $ DGB   $ DOGE   $ XVG   $ GRS   $ AMP   $ ZCL   $ DRGN   $ DCR  4179 johnsongreytwe1
Lookig for hoes d profible rdig clls, recommedios d dvice, joi:

 hp: .meCrypoHesig ls   … 

 $ BTC   $ ETH   $ ETC   $ BCH   $ LTC   $ XRP   $ DASH   $ XLM   $ XMR   $ ZEC   $ ADA   $ SYS   $ NEO   $ LUX   $ OMG   $ POWR   $ VTC   $ XEM   $ LSK   $ DGB   $ DOGE   $ XVG   $ VEN   $ ICX   $ ZCL   $ DRGN   $ WTC   $ IC   $ TRX   $ QTUM   $ LSK   $ ZRX  9865 cryptomike77
All  $ CLAM  Trges Achieved ->  hps: bi.ly2qsNs   

  $ WTC   $ SNGLS   $ ICOS   $ PLBT   $ CLAM   $ SAFEX    $ BLZ   $ NMR   $ OMNI   $ HEAT   $ POP   $ UFO   $ APPC   $ AMM   $ NEX   $ BIFI   $ TKY   $ BCH   $ XRP   $ TKN  
  h6b8AkfBE65NBAseGsG39DGE levcho2
Crypo echge Bilish dds suppor for  Ripple  ( XRP ) d NEM (XEM)  hp: bime.sieposs75158   grachaorfnaydy1
his che

#ICOs  | #XVG  | #XRP   pic.wier.com3mAdizQY8H CameronLyman3
Top12 from big cois: #XRP  4.27%, #XEM  1.67%, #EOS  1.65%, #ADA  1.62%, #XLM  1.57%, #LTC  1.19%, #BCH  1.8%, #ETH  .99%, #NEO  .88%, #BTC  .48%, #MIOTA  .32%, #XMR  .4% pic.wier.com6XLYdNC5m CoinoMonitor
You c mke 5 i 1 moh, jus look  hese gre chel ->  hps: goo.glrLhWFb   

  $ TKY   $ TKN   $ CGE   $ SAFEX   $ BYC   $ SLS   $ SC   $ PLBT   $ HEAT   $ NMR   $ CLUB   $ WAVES   $ NAV   $ XEM   $ XRP   $ SNGLS  17.23   $ NXS   $ NEX   $ POE   $ POP   $ CLAM  
   2QbsDzND7d8DZ6SN9Asde lpww0kjEyUWs05M
his chel clled  $ DGB  before i mde 6. hey oly shre high rewrd, low risk sigls. joi:

 hp: .meCrypoHesig ls   … 

 $ BTC   $ ETH   $ ETC   $ BCH   $ LTC   $ XRP   $ DASH   $ BTG   $ XLM   $ XMR   $ ZEC   $ ADA   $ SNT   $ NEO   $ NXT   $ OMG   $ POWR   $ VTC   $ LUX   $ XEM   $ LSK   $ DGB   $ DOGE   $ XVG   $ GRS   $ AMP   $ ZCL   $ DRGN   $ DCR  7646 jasonclarktwit
his chel clled  $ DGB  before i mde 6. hey oly shre high rewrd, l

 $ BTC   $ ETH   $ ETC   $ BCH   $ LTC   $ XRP   $ DASH   $ XLM   $ XMR   $ ZEC   $ ADA   $ SYS   $ NEO   $ LUX   $ OMG   $ POWR   $ VTC   $ XEM   $ LSK   $ DGB   $ DOGE   $ XVG   $ VEN   $ ICX   $ ZCL   $ DRGN   $ WTC   $ IC   $ TRX   $ QTUM   $ LSK   $ ZRX  4694 ChrisJordanTwit
This guy give immese sigls for free.  $ PRL  did 26% i oe dy. check —>  hps: goo.glPX5H5y   

  $ WAVES   $ TKY   $ XEM   $ SLS   $ CGE   $ BYC   $ XRP   $ TKN   $ HEAT   $ POE   $ CLAM   $ SAFEX   $ NAV   $ PLBT  17.23   $ POP   $ CLUB   $ SC   $ SNGLS   $ NMR   $ NXS   $ NEX  
   5ZYSTe8bS9Z3yz4fsDQeQ3 HyhhDx1ubatwX52
his chel clled  $ DGB  before i mde 6. hey oly shre high rewrd, low risk sigls. joi:

 hp: .meCrypoHesig ls   … 

 $ BTC   $ ETH   $ ETC   $ BCH   $ LTC   $ XRP   $ DASH   $ BTG   $ XLM   $ XMR   $ ZEC   $ ADA   $ SNT   $ NEO   $ MED   $ OMG   $ POWR   $ VTC   $ SC   $ XEM   $ LSK   $ DGB   $ DOGE   $ XVG   $ GRS   $ AMP   $ ZCL   $ DRGN   $ DCR  52383 irisblacktweets
Our  $ CLUB  sigl brod did

 $ BTC   $ ETH   $ ETC   $ BCH   $ LTC   $ XRP   $ DASH   $ XLM   $ XMR   $ ZEC   $ ADA   $ SYS   $ NEO   $ LUX   $ OMG   $ POWR   $ VTC   $ XEM   $ LSK   $ DGB   $ DOGE   $ XVG   $ VEN   $ ICX   $ ZCL   $ DRGN   $ WTC   $ IC   $ TRX   $ QTUM   $ LSK   $ ZRX  941 cryptomike77
5 for mid-erm,  $ WGR   sigls ->  hps: bi.ly2ENLTX2   

  $ CGE   $ INFX   $ NAV   $ TKS   $ SLS   $ ECC   $ MEME   $ WAVES   $ NXS   $ XRP   $ BYC   $ CLUB   $ HSR   $ POE   $ EDG   $ XEM   $ ARDR   $ WGR    $ AMB   $ SC   $ BTC   $ PKB  
  SZ47ThNf5Zs3Zf4Nzkee9S9E AnzhelaChashch2
I gve  $ NAV  mouious sigl  d i did 8% i hours. check here —>  hps: goo.glCBA5T   

  $ TKN   $ SAFEX   $ NEX   $ XRP   $ HEAT   $ NMR   $ CLAM   $ NXS   $ SC   $ BYC   $ POP   $ SLS  17.23   $ WAVES   $ NAV   $ PLBT   $ TKY   $ POE   $ CGE   $ CLUB   $ XEM   $ SNGLS  
   rT6rREE727K2hDDb2kN7Y ZQIEk1dDPDKGyO3
Th's wh I sid - bu o April 12h ;)

 XRP  he = .55 USD - ow .96 USD

EOS he = 8.9 USD - ow 15.5 USD

Sill go he December feelig...