In [1]:
#encoding=utf-8
import re
import nltk
import json
import jieba
# import talib
import pymongo
import requests
import datetime
import operator
import itertools
import numpy as np
import pandas as pd
import networkx as nx
from bs4 import BeautifulSoup
from nltk import FreqDist
from pymongo import MongoClient
from dateutil.relativedelta import relativedelta

# %matplotlib inline
# %pylab inline

In [3]:
# Link to DB
client = MongoClient('mongodb://localhost:27017/')
price_db = client.bitcoinprice_db
price_collection = price_db.CoindeskDailyCollection
news_db = client.newsdata
news_collection = news_db.news

# Text Dictionary
jieba.set_dictionary('data/dict.txt') # 繁體字詞庫
jieba.load_userdict("data/bitcoin.txt") # 自訂字詞庫

# 更改：半形全形轉 punctuation
r1 = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`，。{|}~「」＜＞〈〉《》（）？：、！+*＊]'
r2 = '\n'
r3 = '\xa0'
r4 = ' ' 

Building prefix dict from /home/alex/most-bitcoin/text-mining/data/dict.txt ...
Dumping model to file cache /tmp/jieba.u02c398774850b7cbce1d584aa26674dd.cache
Loading model cost 1.407 seconds.
Prefix dict has been built succesfully.


In [4]:
class PriceDataModify:
    """ 
        class info:
            Get price data from connected price_collection, and then modify the data.
            There should be 1 input to call this class, price_collection.

        init:
            self.price_collection: 
                type: pymongo.collection.Collection
                data: dict with ['Date', 'Open', 'High', 'Low', 'Close', '_id']
            self.price_array:
                type: list
                data: dict with ['Date', 'Open', 'High', 'Low', 'Close', '_id']
            self.price_df:
                type: pandas.core.frame.DataFrame
                data: Date as index, with ['Open', 'High', 'Low', 'Close', 'Datee']
            sefl.price_tag:
                type: pandas.core.frame.DataFrame
                data: Date as index, with 
                    ['Open', 'High', 'Low', 'Close', 'Datee', 'C_back1', 'C_forward1','Tag', 'Diff']
        functions:
            get_price(self):
                use: Get data from self.price_collection, and save into self.price_array
            modify_price(self):
                use: Transfer self.price_array into self.price_df, with data modified.
            mark_price(self):
                use: Mark data with set method.
            get_marked_price(self):
                use: Export tagged data into self.price_tag.
    """
    def __init__(self, price_collection):
        self.price_collection = price_collection

    def get_price(self):
        price_array = []
        for n in self.price_collection.find():
            price_array.append(n)
        self.price_array = price_array
    
    def modify_price(self):
        # from array to df
        price_df =  pd.DataFrame(self.price_array)
        # Re-order
        price_df = price_df[['Date', 'Close']]
        # make a copy col
        price_df['Datee'] = price_df['Date']
        # set datetime as index
        price_df.set_index('Date',inplace=True)
        self.price_df = price_df

    def mark_price(self):
        price_df = self.price_df
        price_df['C_back1'] = price_df['Close'].shift(-1)  # 前一天
        price_df['C_forward1'] = price_df['Close'].shift(1) #後一天價格
        price_df['Tag'] = 0 # 標記漲跌用 
        for index, row in price_df.iterrows():
            diff = row['C_forward1']/row['Close']
            price_df.set_value(index, 'Diff', diff)
            if diff >= 1.2:
                price_df.set_value(index, 'Tag', 1)
                #print('大漲', index, diff)
            elif diff <= 0.8:
                price_df.set_value(index, 'Tag', 2)
                #print('大跌', index,diff)
            else:
                price_df.set_value(index, 'Tag', 0)
        self.price_df = price_df
        
    def get_marked_price(self):
        price_tag = self.price_df[(self.price_df['Tag'] == 1) | (self.price_df['Tag'] == 2)] #1漲 2跌
        self.price_tag = price_tag          

In [7]:
class NewsGet:
    """
    
    """
    def __init__(self, price_tag, news_collection, start_date_timedelta, end_date_timedelta):
        self.price_tag = price_tag
        self.news_collection = news_collection
        self.start_date_timedelta = start_date_timedelta
        self.end_date_timedelta = end_date_timedelta

    def get_news(self):
        price_tag = self.price_tag
        # Array for collecting data
        news_collect = []
        news_tag_collect = []
        news_markdate_collect = [] 
        
        # Get data from DB with selecting date-query
        i = 1 
        for index, row in price_tag.iterrows():
            if row['Tag'] == 1: # 在同一個'時間點'前後，分類編號Tag歸類一樣。
                news_tag = str(i) + 'U'
            else:
                news_tag = str(i) + 'D'
            # tag, start, today ,end
            price_tag.set_value(index, 'Newstag', news_tag) #未來比較、視覺化時，可以使用
            start = row['Datee'] + datetime.timedelta(days=self.start_date_timedelta)
            end = row['Datee'] + datetime.timedelta(days=self.end_date_timedelta)
            today = row['Datee']
            
            # 進入db取得資料，彙整成
            for item in self.news_collection.find({'Date':{'$gte': start, '$lte': end}}):
                news_collect.append(item)
                news_tag_collect.append(news_tag)
                news_markdate_collect.append(today)
            i += 1
        # news_df: create 
        news_df = pd.DataFrame(news_collect)
        news_df['Newstag'] = news_tag_collect
        news_df['Markday'] = news_markdate_collect
        # news_df: delete _id
#         del news_df['_id']
        # news_df: 發生前or發生後
        news_df['BorA'] = 0
        for index, row in news_df.iterrows():
            if row['Date'] >= row['Markday']:
                news_df.set_value(index, ['BorA'], 'after')
            else:
                news_df.set_value(index, ['BorA'], 'before')
        self.news_df = news_df
        self.price_tag = price_tag
        
    def content_cleaner(self, text_string):
        text_string = re.sub(r1,'',text_string)
        text_string = re.sub(r2,'',text_string)
        text_string = re.sub(r3,'',text_string)
        text_string = re.sub(r4,'',text_string)
        return text_string        
    
    def news_seperate_to_UD_AB(self):
        news_df = self.news_df
        news_U = news_df[news_df['Newstag'].str.contains('U')].drop_duplicates(['Link'], keep='first')
        news_D = news_df[news_df['Newstag'].str.contains('D')].drop_duplicates(['Link'], keep='first')
        self.news_U_B = news_U[news_U['BorA'].str.contains('before')]
        self.news_U_A = news_U[news_U['BorA'].str.contains('after')]
        self.news_D_B = news_D[news_D['BorA'].str.contains('before')]
        self.news_D_A = news_D[news_D['BorA'].str.contains('after')]

    def news_X_X_count_printer(self):
        print('漲跌-價格配對，新聞數量：')
        print('大漲前',self.start_date_timedelta, '天：', len(self.news_U_B),'\n',
          '大漲後', self.end_date_timedelta, '天：', len(self.news_U_A))
        print('大跌前',self.start_date_timedelta, '天：', len(self.news_D_B),'\n',
          '大跌後', self.end_date_timedelta, '天：', len(self.news_D_A))

    def news_get_mixed_title_content(self, news_X_X): # NOT SAVING TO SELF
        news_title = ''
        news_content = ''
        for i in news_X_X['Title']:
            news_title += i
        for i in news_X_X['Content']:
            news_content += i
        return self.content_cleaner(news_title), self.content_cleaner(news_content)  # NOT SAVING TO SELF

In [8]:
class StringAnalyzer:
    def __init__(self, input_string):
        self.input_string = input_string
        
    def content_cleaner(self, text_string):
        text_string = re.sub(r1,'',text_string)
        text_string = re.sub(r2,'',text_string)
        text_string = re.sub(r3,'',text_string)
        text_string = re.sub(r4,'',text_string)
        return text_string  
    
    def string_cut(self, high_freq_choice = 1, show_count = 5, texts_count = 2):
        input_string = self.content_cleaner(self.input_string)
        # 0 seg_word_list: 總斷詞列表。
        seg_word_list = []
        seg_words = jieba.cut(input_string, cut_all=False)
        for word in seg_words:
            seg_word_list.append(word)

        # 1 資訊
        words_used_count = [('total_words', len(seg_word_list)), ('used_words', len(set(seg_word_list)))]

        # 2 sorted_seg_word_fdist: 以次數由高至低排列列表。
        sorted_seg_word_fdist = sorted(FreqDist(seg_word_list).items(), key = operator.itemgetter(1), reverse=True)

        # 3 content_high_freq_words_tuple: 高次數詞使用列表+次數。
        high_freq_words_tuple = []
        high_freq_words = []
        if high_freq_choice == 1:         # 3.1 以次數>show_count，字組字數>texts_count，為選擇
            for item_tuple in sorted_seg_word_fdist:
                if item_tuple[1] >= show_count and len(item_tuple[0]) >= texts_count:
                    high_freq_words_tuple.append(item_tuple)
        elif high_freq_choice == 2:     # 3.2 以次數前10名，字組字數>texts_count，為選擇
            temp_list = []
            for item_tuple in sorted_seg_word_fdist:
                if len(item_tuple[0]) >= texts_count:
                    temp_list.append(item_tuple)                
            high_freq_words_tuple = temp_list[:10]
            
        # 4 high_freq_words: 高次數詞列表。
        for item in high_freq_words_tuple:
            high_freq_words.append(item[0])

        self.seg_word_list = seg_word_list
        self.words_used_count = words_used_count
        self.sorted_seg_word_fdist = sorted_seg_word_fdist
        self.high_freq_words_tuple = high_freq_words_tuple
        self.high_freq_words = high_freq_words  

In [9]:
class EveryNewsAnalyzer:
    """
        由新聞分類別後(ex: news_U_B, type=df)，由此func進行蒐集各新聞內的high_freq_words。
        並進一步輸出成一個 word_list(node_list),一個由node配對而成的edge_list(key:value), 還有一個edge_list_key為key
    
    """
    
    def __init__(self, news_X_X):
        self.news_X_X = news_X_X

    def high_freq_words(self, 
                        high_freq_choice=2, 
                        show_count = 5, 
                        texts_count = 2):
        word_list = []
        for news in self.news_X_X['Content']:
            s = StringAnalyzer(news)
            s.string_cut(
                high_freq_choice = high_freq_choice, 
                show_count = show_count, 
                texts_count = texts_count)
            for word in s.high_freq_words:
                word_list.append(word)
        word_list = list(set(word_list))
        self.news_X_X_node_list = word_list
    
        news_X_X_edge_list = []
        for item in list(itertools.combinations(self.news_X_X_node_list, 2)):
            news_X_X_edge_list.append({item:0})
        news_X_X_edge_key_list = []
        for i in range(len(news_X_X_edge_list)):
            news_X_X_edge_key_list.append(list(news_X_X_edge_list[i])[0])
        self.news_X_X_edge_list = news_X_X_edge_list
        self.news_X_X_edge_key_list = news_X_X_edge_key_list

    def check_item_index(self, item, item_list):
        if item in item_list:
            return item_list.index(item)
        else:
            item = (item[1], item[0])  # change order
            return item_list.index(item)

    def coshow_count(self, 
                     high_freq_choice=2, 
                     show_count = 5,
                     texts_count = 2):
        for news in self.news_X_X['Content']:
            word_list = []
            s = StringAnalyzer(news)
            s.string_cut(
                high_freq_choice = high_freq_choice, 
                show_count = show_count, 
                texts_count = texts_count)
            for word in s.high_freq_words:
                word_list.append(word)
            temp_pairs = list(itertools.combinations(word_list, 2))
            for item in temp_pairs:
                dic = self.news_X_X_edge_list[self.check_item_index(item, self.news_X_X_edge_key_list)]
                dic.update({list(dic.keys())[0]: list(dic.values())[0] + 1})
                self.news_X_X_edge_list[self.check_item_index(item, self.news_X_X_edge_key_list)] = dic
            temp_dict = dict()
            temp_list = []
            for index, element in enumerate(self.news_X_X_edge_list):
                temp_dict[index] = list(element.values())[0]
            sorted_data = sorted(temp_dict.items(), key=operator.itemgetter(1), reverse=True)
            for item in sorted_data:
                temp_list.append(self.news_X_X_edge_list[item[0]])
            self.news_X_X_edge_list = temp_list 

In [15]:
#### Get price and modify it into price_df
p = PriceDataModify(price_collection)
p.get_price()
p.modify_price()
p.mark_price()
p.get_marked_price()

# Get news from news_collection with price_tag's info
n = NewsGet(p.price_tag, news_collection, -5, 5)
n.get_news()

# Split n.self.news_df into 4 parts// check the n.news_X_X
n.news_seperate_to_UD_AB()

# All news mixed analysis
news_U_B_Title_mixed, news_U_B_Content_mixed = n.news_get_mixed_title_content(n.news_U_B) #string
news_U_A_Title_mixed, news_U_A_Content_mixed = n.news_get_mixed_title_content(n.news_U_A) #string
news_D_B_Title_mixed, news_D_B_Content_mixed = n.news_get_mixed_title_content(n.news_D_B) #string 
news_D_A_Title_mixed, news_D_A_Content_mixed = n.news_get_mixed_title_content(n.news_D_A) #string

#StringAnalyze
n_U_B_T_m = StringAnalyzer(news_U_B_Title_mixed)
n_U_A_T_m = StringAnalyzer(news_U_A_Title_mixed)
n_U_B_T_m.string_cut(high_freq_choice=2)
n_U_A_T_m.string_cut(high_freq_choice=2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


KeyError: '_id'

In [16]:
p = PriceDataModify(price_collection)
p.get_price()
p.modify_price()
p.mark_price()
p.get_marked_price()

In [19]:
p.price_tag

Unnamed: 0_level_0,Close,Datee,C_back1,C_forward1,Tag,Diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-07-22,0.05,2010-07-22,0.06,0.08,1,1.6
2010-07-24,0.05,2010-07-24,0.05,0.06,1,1.2
2010-10-08,0.09,2010-10-08,0.09,0.07,2,0.777778
2010-10-27,0.19,2010-10-27,0.17,0.15,2,0.789474
2010-11-06,0.39,2010-11-06,0.34,0.26,2,0.666667
2010-11-08,0.24,2010-11-08,0.21,0.34,1,1.416667
2010-11-16,0.22,2010-11-16,0.23,0.27,1,1.227273
2010-12-04,0.2,2010-12-04,0.19,0.25,1,1.25
2010-12-09,0.2,2010-12-09,0.2,0.24,1,1.2
2011-01-14,0.4,2011-01-14,0.39,0.32,2,0.8


In [47]:
# EveryNewsAnalyzer
e = EveryNewsAnalyzer(n.news_U_B)
e.high_freq_words()
e.coshow_count()

In [None]:
class NetworkGraph:
    def __init__(self, node_list, edge_list):
        self.node_list = node_list
        self.edge_list = edge_list
        
    def set_graph(self):
        # INITIAL
        G = nx.MultiGraph()
        # ADD NODE DATA
        for item in self.node_list:
            G.add_node(item)
        # ADD EDGE DATA
        for i in range(len(self.edge_list)):
            if list(self.edge_list[i].values())[0]>0:
                G.add_edges_from([list(self.edge_list[i].keys())[0]],
                                 weight = list(self.edge_list[i].values())[0])
        self.G = G
        
    def draw_graph(self):
        nx.draw(self.G)
    def check_graph_info(self):
        return self.G.nodes(), self.G.edges()

In [None]:
# NetworkGraph
ng = NetworkGraph(e.news_X_X_node_list, e.news_X_X_edge_list)
ng.set_graph()
ng.draw_graph()

In [None]:
ng.edge_list