In [66]:
import pandas as pd
import json
import functools
import re
import spacy
from spacy.tokens import Doc
import itertools as it
from nltk.tokenize import TweetTokenizer
from spacy.lang.en import STOP_WORDS
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaModel

import pyLDAvis
import pyLDAvis.gensim
import pickle
import codecs
from spacy.pipeline import Pipe
import warnings
warnings.filterwarnings('ignore')

In [67]:
'''
Source: https://stackoverflow.com/questions/21708192/how-do-i-use-the-json-module-to-read-in-one-json-object-at-a-time/21709058
Author: unutbu
Purpose: Read in 1 JSON object at a time into a list
'''
def json_parse(fileobj, decoder=json.JSONDecoder(), buffersize=2048, 
               delimiters=None):
    remainder = ''
    for chunk in iter(functools.partial(fileobj.read, buffersize), ''):
        remainder += chunk
        while remainder:
            try:
                stripped = remainder.strip(delimiters)
                result, index = decoder.raw_decode(stripped)
                yield result
                remainder = stripped[index:]
            except ValueError:
                # Not enough data to decode, read more
                break

In [68]:
with open('../data/tweets.txt', 'r') as rf:
    json_list = list(json_parse(rf))

In [69]:
len(json_list)

2000

In [70]:
for i in range(30, 35):
    json = json_list[i]
    try:
        text = json["retweeted_status"]["extended_tweet"]["full_text"]
    except KeyError:
        text = json["text"]
    print(text)
    print()

12652.916 USD  https://t.co/3T5IP5YsEn | #btc #bitcoin #blockchain #cryptocurrency https://t.co/eB0Ji7u78N

Something to consider if you suffer from #fomo over bitcoin: you might make or lose money by investing now, but you won’t lose anything by NOT investing. 

Few great quotes in this article: https://t.co/S910SsTFDX

12660.50 USD  https://t.co/vmknxeCRiL | #btc #bitcoin #blockchain #cryptocurrency https://t.co/a186DDXLAU

Lightning Network May Not Solve Bitcoin's Scaling 'Trilemma' https://t.co/umtLQyur31 #bitcoin #blockchain Via… https://t.co/tCpYmvbDJL

MAKE MONEY WITH bitcoin!!!

✅KUCOIN is 100% OPEN!!!

CLICK ▶️ https://t.co/lPgWGdMR6i

❌Binance 
❌Poloniex 
❌Bittre… https://t.co/YjUgq2T3mC



In [71]:
nlp = spacy.load('en')

In [72]:
ttkzr = TweetTokenizer(strip_handles=True, reduce_len=True)

#### The first step in our preprocessing is to remove urls and tokenize our tweets

In [73]:
docs = []
for json in json_list:
    try:
        text = json["retweeted_status"]["extended_tweet"]["full_text"]
    except KeyError:
        text = json["text"]
    text_nourl = re.sub(r"http\S+", "", text)
    tokens = ttkzr.tokenize(text_nourl)
    doc = Doc(nlp.vocab, words=tokens)
    docs.append(doc)

In [74]:
docs[90]

JOIN THE LARGEST #BITCOIN #CRYPTO NETWORK ON EARTH ! EMAIL : cryptopennystock@gmail.com #CryptoCurrencyGod #TEAMBILLIONAIRE #PENNYSTOCKS #STOCKALERT #BTC , #ETH , #ADA #XRP #TRX #LTC #BLOCKCHAIN 

In [75]:
docs_filepath = '../data/docs.txt'

In [76]:
with open(docs_filepath, 'w', encoding='utf_8') as wf:
    for doc in docs:
        wf.write(doc.text)
        wf.write("\n")

#### Now we lemmatize our tokens and remove punctuation

In [77]:
'''
Source: Modern NLP in Python
Author: Patrick Harrison
Purpose: Remove punctuations and lemmatize our tokens.
'''
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [78]:
unigram_sentences_filepath = '../data/unigram_sentences.txt'

In [79]:
with open(unigram_sentences_filepath, 'w', encoding='utf-8') as f:
    for sentence in lemmatized_sentence_corpus(docs_filepath):
        f.write(sentence + '\n')

In [80]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [81]:
for sentence in it.islice(unigram_sentences, 230, 240):
    print(sentence)

['join', '-PRON-', 'ico', 'now']
['here', 'be', 'a', 'link', 'and', '-PRON-', 'know', 'what', 'to', 'do', 'next', '😉']
['become', 'a', 'part', 'of', 'first', 'ecocryptomin', 'in', 'the', 'world']
['invest', 'investment', 'innovate', 'innovation', 'mining', 'miningfarm', 'crypto', 'profit', 'bitcoin', 'ethereum', 'cryptocurrency']
['st']
['james', 'place', 'condos', 'in', 'columbia', 'heights', 'accept', 'bitcoin', 'as', 'payment', 'washington', 'business', 'journal']
['rt']
['-PRON-', 'wish', 'man', 'would', 'put', 'as', 'much', 'confidence', 'in', 'woman', 'as', '-PRON-', 'do', 'in', 'bitcoin']
['there', 'be', 'a', 'direct', 'correlation', 'between', 'the', 'growth', 'in', 'the', 'number', 'of', 'successful', 'and', 'reliable', 'project', 'and', 'the', 'arrival', 'of', 'new', 'investor', 'on', 'the', 'market', 'whose', 'interest', 'will', 'increase', 'significantly']
['rr', 'revizorcoin', 'cryptocurrency', 'ico', 'blockchain']


#### As people sometimes use the ticker symbol for a coin instead of its commonly used name, we need to go through our tweets and replace all instances of the ticker symbol with its commonly used name

In [82]:
crypto_names = pd.read_csv('../data/crypto-markets.csv')

In [83]:
crypto_names = crypto_names[['slug', 'symbol']]
crypto_names.drop_duplicates(inplace=True)

In [84]:
crypto_names['symbol'] = crypto_names['symbol'].str.lower()
crypto_names['slug'] = crypto_names['slug'].str.replace('-', '_')
crypto_names.head()

Unnamed: 0,slug,symbol
0,bitcoin,btc
1719,ethereum,eth
2607,ripple,xrp
4228,bitcoin_cash,bch
4400,cardano,ada


#### We use a dictionary to perform quick lookups and swaps

In [85]:
crypto_dict = dict(zip(crypto_names['symbol'].values, crypto_names['slug'].values))

In [87]:
unigram_sentences_replaced = []
for i, sentence in enumerate(unigram_sentences):
    for j, word in enumerate(sentence):
        key = word
        if key in crypto_dict:
            sentence[j] = crypto_dict[key]
    unigram_sentences_replaced.append(sentence)

In [88]:
unigram_sentences = unigram_sentences_replaced

#### Now we remove stop words from our tweets

In [89]:
sentences_nostop = []
STOP_WORDS.add("rt")
for i, sentence in enumerate(unigram_sentences):
    sentence_nostop = [token for token in sentence if token not in STOP_WORDS]
    sentences_nostop.append(sentence_nostop)

In [90]:
unigram_sentences = sentences_nostop

In [91]:
for sentence in it.islice(unigram_sentences, 230, 240):
    print(sentence)

['join', '-PRON-', 'ico']
['chainlink', '-PRON-', 'know', '😉']
['particl', 'ecocryptomin', 'world']
['invest', 'investment', 'innovate', 'innovation', 'mining', 'miningfarm', 'crypto', 'profit', 'bitcoin', 'ethereum', 'cryptocurrency']
['st']
['james', 'place', 'condos', 'columbia', 'heights', 'accept', 'bitcoin', 'payment', 'washington', 'business', 'journal']
[]
['-PRON-', 'mywish', 'man', 'putincoin', 'confidence', 'woman', '-PRON-', 'bitcoin']
['direct', 'correlation', 'growth', 'number', 'successful', 'reliable', 'project', 'arrival', 'new', 'investor', 'market', 'interest', 'increase', 'significantly']
['rr', 'revizorcoin', 'cryptocurrency', 'ico', 'blockchain']


#### Using gensim's Phrases model, we can form bigrams from our raw tokens. For example, ["big", "foot"] would become "big_foot"

In [92]:
bigram_sentences_filepath = '../data/bigram_sentences.txt'

In [93]:
# Trains the detector
bigram_model = Phrases(unigram_sentences)

In [94]:
with open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
    for unigram_sentence in unigram_sentences:
        bigram_sentence = bigram_model[unigram_sentence]
        for word in bigram_sentence:
            f.write(word)
            f.write(" ")
        f.write("\n")

In [95]:
bigram_sentences = LineSentence(bigram_sentences_filepath)

##### Finally we prepare a bag of words (BOW) corpus for LDA topic modelling

In [108]:
for sentence in it.islice(bigram_sentences, 20, 30):
    print(sentence)

['otc', 'bitcoin', 'bitcoin', 'crypto', 'blockchain']
['-PRON-', 'new', 'ico']
['check', 'procedure', 'participate', 'trakinvest', 'ico', 'sale']
['participate', '-PRON-', 'presearch_ico', 'sale', 'avail', 'bonus', 'upto', '18', 'cryptocurrency', 'asianico']
['trakinvestico', 'ethereum', 'tokensale', 'nextgenaitools']
['-PRON-', 'need', 'leave', '-PRON-', 'warm', 'bed', 'late', 'cryptocurrency', 'news']
['join', '-PRON-', 'telegram_channel']
['jamie', 'dimon', 'buy', 'bitcoin', '2018']
['shoe', 'satoshi', 'wear']
['find', 'answer', 'bad', 'joke', 'cointelegraph']


In [97]:
dictionary_filepath = '../data/dictionary.dict'

In [98]:
dictionary = Dictionary(bigram_sentences)

In [99]:
dictionary.filter_extremes(no_below=5, no_above=0.5)
dictionary.compactify()
dictionary.save(dictionary_filepath)

In [100]:
def bow_generator(filepath):    
    for tweet in LineSentence(filepath):
        yield dictionary.doc2bow(tweet)

In [101]:
bow_filepath = "../data/bow.mm"

In [102]:
MmCorpus.serialize(bow_filepath, bow_generator(bigram_sentences_filepath))
bow_corpus = MmCorpus(bow_filepath)

In [103]:
bow_corpus[43]

[(8, 1.0), (23, 1.0), (41, 3.0), (55, 2.0), (101, 1.0)]