# Resources

Algorithm
* [PCA with Text](https://github.com/silvernine209/nyc19_ds20/blob/master/curriculum/project-04/svd-pca/PCA_with_text_ex.ipynb)

Scraping
* [Scrape Reddit Using API](https://towardsdatascience.com/scraping-reddit-data-1c0af3040768)  
* [Google Search Operators](https://ahrefs.com/blog/google-advanced-search-operators/)

API
* [CryptoCompare](https://www.cryptocompare.com)

MongoDB & AWS
* [Allow Python to connect to MongoDB on AWS](https://github.com/silvernine209/nyc19_ds20/blob/master/curriculum/project-04/mongodb-prep/python_to_aws_mongo_setup.md) 
* [MongoDB Exercises](https://github.com/silvernine209/nyc19_ds20/tree/master/curriculum/project-04/mongodb-lab)

Cryptocurrency
* [Bitcoin Transaction Time](https://themoneymongers.com/bitcoin-transaction-time/) ~ 10 min  
* [Crypto Sentiment Analysis Guide #1](https://hackernoon.com/sentiment-analysis-in-cryptocurrency-9abb40005d15)
* [Bitcoin Graph](https://www.coindesk.com/price/bitcoin)

Additional Support
* [NLP Resources](https://github.com/stepthom/text_mining_resources)  
* [Practitioner's NLP Guide](https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72)
* [Metis Project 4 Folder](https://github.com/silvernine209/nyc19_ds20/tree/master/curriculum/project-04)  
* [Udacity Project Customer Segmentation](https://github.com/silvernine209/Udacity-Projects/blob/master/Segment%20Customers/customer_segments.ipynb)

Same as PCA : LSA, SVD, 

# Import Libraries

In [181]:
# Web Scraping
from bs4 import BeautifulSoup
from lxml import html
import requests


# NLP
from contractions import CONTRACTION_MAP
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import spacy
import en_core_web_md
# !spacy download en_core_web_md
# !python -m spacy link /Users/matthewlee/Desktop/Metis/bitcoin_trader/en_core_web_md-2.0.0/en_core_web_md en_core
#nlp = spacy.load('en_core_web_md', parse=True, tag=True, entity=True)
nlp = en_core_web_md.load(parse=True, tag=True, entity=True)
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

# Tools
import string
from datetime import date,timedelta,datetime
import unicodedata
import time
import re
import os
import pandas as pd
import numpy as np
import pickle
import random
from tqdm import tqdm, trange

%matplotlib inline  


# Functions

#### Helper Functions

In [234]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
# Load all scraped pickle files and combine it to one dataframe
def combine_pickle_files(folder_name): 
    # Location of git folder
    git_folder_location = os.path.abspath(os.path.dirname('bitcoin_trader'))

    # list of pickled files
    pickle_list = os.listdir(git_folder_location+'/'+folder_name+'/')
    if '.DS_Store' in pickle_list:
        pickle_list.remove('.DS_Store')

    # Create a DataFrame to dump all individual DataFrames from scraped data
    with open(folder_name+'/'+pickle_list[0], 'rb') as picklefile: 
        df = pickle.load(picklefile)    
    df_merged = pd.DataFrame(columns=df.keys())

    for file in pickle_list:
        with open(folder_name+'/'+file, 'rb') as picklefile: 
            df = pickle.load(picklefile)
        df_merged = pd.concat([df_merged,df],ignore_index=True,axis=0)
    return df_merged

# HTML tags
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

# example) do not -> don't. I would -> I'd
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

# standardized into ASCII characters. example) converting é to e
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text 

# Special characters and symbols
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

# JUMPS, JUMPED, and JUMPING -> JUMP
# Multiple Stemmers : PorterStemmer, LancasterStemmer, SnowballStemmer WordNetLemmatizer
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text


def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

# # Remove punctuation
# re.sub('[%s]' % re.escape(string.punctuation), ' ', my_text)

# # Lower Case
# clean_text = clean_text.lower()

# # Removes all words containing digits
# clean_text = re.sub('\w*\d\w*', ' ', clean_text)

# # Stop words
# from nltk.corpus import stopwords
# set(stopwords.words('english'))

# # Speech Tagging
# from nltk.tag import pos_tag
# my_text = "James Smith lives in the United States."
# tokens = pos_tag(word_tokenize(my_text))
# print(tokens)
# nltk.help.upenn_tagset()

# # Named Entity
# from nltk.chunk import ne_chunk
# my_text = "James Smith lives in the United States."
# tokens = pos_tag(word_tokenize(my_text)) # this labels each word as a part of speech
# entities = ne_chunk(tokens) # this extracts entities from the list of words
# entities.draw()

# # Compoun Term Extraction
# from nltk.tokenize import MWETokenizer # multi-word expression
# my_text = "You all are the greatest students of all time."
# mwe_tokenizer = MWETokenizer([('You','all'), ('of', 'all', 'time')])
# mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(my_text))
# mwe_tokens







def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions    
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = doc
            #doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

#### BeautifulSoup Scraping

In [32]:
# Load webpage's url and load it into soup
def load_soup(url):
    user_agent_list = [
       #Chrome
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
         ]
    

    
    headers = {"User-Agent":random.choice(user_agent_list)}
    response = requests.get(url,headers=headers)
    page = response.text
    soup = BeautifulSoup(page, "lxml")
    return soup,response.status_code

# Scrape bitcoin news data
def scrape_bitcoin_news_google(start_date,end_date,num_per_page):
    # Create "news_data" folder if it's not there
    git_folder_location = os.path.abspath(os.path.dirname('bitcoin_trader'))
    if 'news_data' not in os.listdir(git_folder_location):
        !mkdir 'news_data'

    # already scraped (list from what's already saved in the folder)
    done_list = os.listdir(git_folder_location+'/news_data/')
    if '.DS_Store' in done_list:
        done_list.remove('.DS_Store')
    
    # Using trange to have a progress bar to gauge scraping time
    for i in trange((end_date-start_date).days):    
        # Month, day, and year to be embedded into the url
        M=start_date.month
        D=start_date.day
        Y=start_date.year 
        
        # File name to save pickle file and not to scrape if already scraped
        filename = 'google_news_{}_{}_{}.pkl'.format(M,D,Y)
        
        # if file isn't scraped, go ahead and scrape
        if filename not in done_list:
            # Load data
            base_url = 'https://www.google.com/search?q=cryptocurrency+or+bitcoin&num=40&rlz=1C5CHFA_enUS849US849&biw=573&bih=717&source=lnt&tbs=cdr%3A1%2Ccd_min%3A{}%2F{}%2F{}%2Ccd_max%3A{}%2F{}%2F{}&tbm=nws&num={}'
            url= base_url.format(M,D,Y,M,D,Y,num_per_page)
            soup,response_code = load_soup(url)
            if response_code !=200:
                print("Blacklisted...?")
                print(start_date)
                break
            
            # Empty lists for DataFrame
            publishers = []
            titles = []
            intros = []
            dates = []  

            # Append data to list
            publishers += [publisher.text for publisher in soup.find_all('div', attrs = {'class': 'pDavDe RGRr8e'})]
            titles += [title.text for title in soup.find_all('div', attrs = {'class': 'phYMDf nDgy9d'})]
            intros += [intro.text for intro in soup.find_all('div', attrs = {'class': 'eYN3rb'})]
            dates += [start_date]*len(publishers)
            
            # Turn data into DataFrame
            df = pd.DataFrame({'date':dates,'publisher':publishers,'title':titles,'intro':intros})
            if len(df)<1:
                print("Empty df")
                break
            # Pickle scraped data
            with open('news_data/'+filename, 'wb') as picklefile:
                pickle.dump(df, picklefile)
            # Move onto next day
            start_date+=timedelta(days=1)
            
#             # Randomly sleep any time between 10 and 12 seconds
#             time.sleep(random.randint(10,12))
        # If file has been scraped, continue
        else:
            # Move onto next day
            print("{} has already been scraped.".format(filename))
            start_date+=timedelta(days=1)



# Scraping

#### Google News Data

In [66]:
# # Scrape Data by passing in start_date, end_date, num_articles_per_day
# # Pages before Google blacklists an IP : 73,65,116,97,71,94,80,79,85,88,84,66,79,76,96,92.
# scrape_bitcoin_news_google(date(2019, 6, 8),date(2019, 8, 13),40) #date.today()

# # Combine individual day pickle files & pickle it
# df_news_raw = combine_pickle_files('news_data')
# df_news_raw.sort_values(by=['date'],inplace=True)
# df_news_raw.reset_index(drop=True,inplace=True)
# with open('df_news_raw.pkl','wb') as picklefile:
#     pickle.dump(df_news_raw,picklefile)

In [98]:
# Load df_news_raw
with open('df_news_raw.pkl', 'rb') as picklefile: 
    df_news_raw = pickle.load(picklefile) 

In [99]:
# Combine title & intro
df_news_raw['title_intro'] = df_news_raw['title']+'. '+df_news_raw['intro']

# Remove '\n' from corpus
df_news_raw['title_intro'] = df_news_raw['title_intro'].apply(lambda x : x.replace('\n',''))

# Cleaned 'title_intro'
df_news_raw['title_intro_clean']=normalize_corpus(df_news_raw['title_intro'])

#### CountVectorizer & NMF

In [241]:
# Bag of Word Model
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

# Create vectorized matrix with stopword
vectorizer = CountVectorizer(stop_words=stopword_list)
doc_word = vectorizer.fit_transform(df_news_raw['title_intro_clean'])
print("Doc Shape : ",doc_word.shape)
#pd.DataFrame(doc_word.toarray(), index=df_news_raw['title_intro_clean'], columns=vectorizer.get_feature_names()).head()

# Define NMF model
nmf_model = NMF(n_components=25, init='random', random_state=0)
doc_topics = nmf_model.fit_transform(doc_word)

# Check cluster distribution
doc_cluster = doc_topics.argmax(axis = 1)
pd.Series(doc_cluster).value_counts()

Doc Shape :  (24823, 34871)


0     3400
22    2609
6     2444
2     1887
10    1662
16    1304
11    1243
24    1127
5      995
20     924
9      817
19     779
23     625
15     577
13     561
8      556
14     480
18     451
21     414
7      398
12     380
17     328
3      321
1      316
4      225
dtype: int64

In [237]:
t = model.components_.argsort(axis=1)[:,-1:-8:-1]
topic_words = [[list(vectorizer.vocabulary_.keys())[e-1] for e in l] for l in t]
#topic_words = [[vectorizer.get_feature_names()[e-1] for e in l] for l in t]
topic_words


[['positions',
  'telaviv',
  'poets',
  'libertarians',
  'shy',
  'socalled',
  'procent'],
 ['weatherproof',
  'drogba',
  'rizzo',
  'segwitx',
  'eretailer',
  'cased',
  'advising'],
 ['cashed',
  'recruiting',
  'individually',
  'studios',
  'cryptoonly',
  'riflettono',
  'inexpensive'],
 ['substantially', 'pada', 'glad', 'insolvency', 'moby', 'boyart', 'balks'],
 ['sights', 'enjoys', 'rizzo', 'toda', 'positions', 'stirs', 'master'],
 ['hollywood',
  'segwitx',
  'bursa',
  'autotrigger',
  'episerver',
  'backward',
  'understand'],
 ['celect',
  'episerver',
  'flies',
  'seleccionar',
  'trapheavy',
  'sink',
  'segwitx'],
 ['perishable',
  'flies',
  'cryptoonly',
  'sanctionshit',
  'posted',
  'disapproval',
  'autotrigger'],
 ['punishments',
  'spooked',
  'pascal',
  'telaviv',
  'beleggingsonderneming',
  'cpy',
  'bigshot'],
 ['conduit',
  'corruption',
  'spooked',
  'cryptoonly',
  'merged',
  'disposal',
  'inbrengen'],
 ['wading',
  'redundant',
  'switched',
  '

#### Tf-idf & NMF

In [243]:
from sklearn.feature_extraction.text import TfidfVectorizer

# new TF-IDF Vectorizer
cv_tfidf = TfidfVectorizer(stop_words=stopword_list)
X_tfidf = cv_tfidf.fit_transform(df_news_raw['title_intro_clean'])
#pd.DataFrame(X_tfidf, columns=cv_tfidf.get_feature_names())

# Define NMF model
nmf_model = NMF(n_components=25, init='random', random_state=0)
doc_topics = nmf_model.fit_transform(X_tfidf)

# Check cluster distribution
doc_cluster = doc_topics.argmax(axis = 1)
pd.Series(doc_cluster).value_counts()

0     2541
22    1646
16    1388
5     1323
11    1309
2     1188
20    1176
23    1021
12    1007
10     974
18     967
4      957
15     949
9      932
6      898
19     834
13     818
3      781
1      724
21     652
8      648
14     603
17     603
7      538
24     346
dtype: int64

In [244]:
t = model.components_.argsort(axis=1)[:,-1:-8:-1]
topic_words = [[list(cv_tfidf.vocabulary_.keys())[e-1] for e in l] for l in t]
#topic_words = [[vectorizer.get_feature_names()[e-1] for e in l] for l in t]
topic_words

[['positions',
  'telaviv',
  'poets',
  'libertarians',
  'shy',
  'socalled',
  'procent'],
 ['weatherproof',
  'drogba',
  'rizzo',
  'segwitx',
  'eretailer',
  'cased',
  'advising'],
 ['cashed',
  'recruiting',
  'individually',
  'studios',
  'cryptoonly',
  'riflettono',
  'inexpensive'],
 ['substantially', 'pada', 'glad', 'insolvency', 'moby', 'boyart', 'balks'],
 ['sights', 'enjoys', 'rizzo', 'toda', 'positions', 'stirs', 'master'],
 ['hollywood',
  'segwitx',
  'bursa',
  'autotrigger',
  'episerver',
  'backward',
  'understand'],
 ['celect',
  'episerver',
  'flies',
  'seleccionar',
  'trapheavy',
  'sink',
  'segwitx'],
 ['perishable',
  'flies',
  'cryptoonly',
  'sanctionshit',
  'posted',
  'disapproval',
  'autotrigger'],
 ['punishments',
  'spooked',
  'pascal',
  'telaviv',
  'beleggingsonderneming',
  'cpy',
  'bigshot'],
 ['conduit',
  'corruption',
  'spooked',
  'cryptoonly',
  'merged',
  'disposal',
  'inbrengen'],
 ['wading',
  'redundant',
  'switched',
  '