In [1]:
from google.cloud import storage
from google.cloud.exceptions import NotFound

class GSProcessor:
    """Small convenience wrapper around one Google Cloud Storage bucket.

    Attributes:
        bucket_name: Name of the bucket every operation targets.
        destination_blob_name: Prefix prepended to file names by
            ``upload_blob`` and ``delete_file``.
        storage_client: The ``storage.Client`` used for all requests.
        bucket: Bucket handle created from ``bucket_name``.
    """

    def __init__(self, bucket_name='tbrain-tsmc', destination_blob_name='data/'):
        """Create the storage client and the bucket handle.

        Args:
            bucket_name: Bucket to operate on.
            destination_blob_name: Object-name prefix used for uploads and
                deletions.
        """
        self.bucket_name = bucket_name
        self.destination_blob_name = destination_blob_name

        self.storage_client = storage.Client()
        try:
            # Probe the bucket up front so a missing bucket / missing
            # permission is reported at construction time.
            self.storage_client.get_bucket(self.bucket_name)
        except NotFound:
            # Best effort: only warn, the handle below is still created.
            print("請聯繫Sponsor開通權限!")

        self.bucket = self.storage_client.bucket(self.bucket_name)

    def upload_blob(self, source_file_name, destination_file_name):
        """Upload a local file to ``destination_blob_name + destination_file_name``.

        Args:
            source_file_name: Path of the local file to upload.
            destination_file_name: Object name, relative to the prefix.
        """
        destination_path = self.destination_blob_name + destination_file_name
        blob = self.bucket.blob(destination_path)
        blob.upload_from_filename(source_file_name)
        # Report the full object path, not just the prefix.
        print("File {} uploaded to {}.".format(source_file_name, destination_path))

    def delete_file(self, source_file_name):
        """Delete ``destination_blob_name + source_file_name`` from the bucket.

        A missing object is reported on stdout instead of raising.

        Args:
            source_file_name: Object name, relative to the prefix.
        """
        d_file = self.destination_blob_name + source_file_name
        try:
            self.bucket.delete_blob(d_file)
            print("File {} is deleted successfully.".format(d_file))
        except NotFound:
            print("File {} doesn't exist.".format(d_file))

    def show_files(self, max_results_=10, prefix_='data/YUSUN.zip/YUSUN/textFiles/'):
        """Print the list of blobs whose names start with ``prefix_``.

        Args:
            max_results_: Maximum number of blobs to list.
            prefix_: Object-name prefix to filter on.
        """
        all_blobs = list(self.get_blobs(max_results_=max_results_, prefix_=prefix_))
        print(all_blobs)

    def get_blobs(self, max_results_=10, prefix_='data/YUSUN.zip/YUSUN/textFiles/'):
        """Return the blob listing for ``prefix_`` as produced by ``list_blobs``.

        Args:
            max_results_: Maximum number of blobs to list.
            prefix_: Object-name prefix to filter on.

        Returns:
            Whatever ``storage_client.list_blobs`` returns (an iterable of blobs).
        """
        return self.storage_client.list_blobs(
            self.bucket, max_results=max_results_, prefix=prefix_)

    def get_bobs(self, max_results_=10, prefix_='data/YUSUN.zip/YUSUN/textFiles/'):
        """Misspelled legacy name of ``get_blobs``; kept so existing callers work."""
        return self.get_blobs(max_results_=max_results_, prefix_=prefix_)

    def get_blob(self, source_file_name):
        """Download one object and return its content.

        Args:
            source_file_name: Full object name inside the bucket (the
                ``destination_blob_name`` prefix is NOT added here).

        Returns:
            The downloaded content, or ``None`` when the object does not exist.
        """
        try:
            blob = storage.Blob(source_file_name, self.bucket)
            return blob.download_as_string()
        except NotFound:
            return None

#### Result format:
```python
[{'news_ID': 1,
  'hyperlink': 'https://news.cnyes.com/news/id/4352432',
  'content': {'title': '量化交易追求絕對報酬 有效對抗牛熊市',
   'context': '近年來投資市場波動越來越明顯，追求低波動、絕對報酬的量化交易備受注目。專家表示，採用量化交易策略投資台股，不管是處於多頭或是空頭市場，績效及波動度均可領跑大盤，甚至比國內投資台股的股票型基金及 ETF 的波動率還低，表現也更為穩定。\n大數據時代來臨，風行歐美 50 年的量化交易儼然成為顯學，台灣亦開始重視此一趨勢發展，也因此，中華機率統計學會及台北科技大學管理學院攜手主辦，並由元大期貨、...'},
  'name': '[]'},
  {...},
  {...}
  ]
```

In [None]:
import pandas as pd
import json
import re

class DataProcessor(GSProcessor):
    """Builds and reads the combined title/context JSON for the news data set.

    The combined file is a JSON list of records; the readers below access
    ``record['content']['title']``, ``record['content']['context']`` and
    ``record['name']``.
    """

    # Bucket prefix under which the per-article text files are looked up.
    TEXT_FILES_PREFIX = 'data/YUSUN.zip/YUSUN/textFiles/'

    def __init__(self, bucket_name='tbrain-tsmc', destination_blob_name='data/'):
        """Initialise the underlying ``GSProcessor``.

        Args:
            bucket_name: Bucket to operate on.
            destination_blob_name: Object-name prefix for uploads/deletions.
        """
        # The original defined this as ``init`` (never invoked by Python) and
        # passed ``self`` twice to the parent initialiser.
        super().__init__(bucket_name, destination_blob_name)

    def init(self, bucket_name='tbrain-tsmc', destination_blob_name='data/'):
        """Legacy spelling kept for backward compatibility; re-runs ``__init__``."""
        self.__init__(bucket_name, destination_blob_name)

    def combine_all_file(self, tbrainfile='assert/tbrain_train_final_0610.csv', file='assert/tbrain_train_title_context.txt'):
        """Merge the CSV rows with their downloaded article text and save as JSON.

        Args:
            tbrainfile: Path of the CSV file; it must contain a ``news_ID`` column.
            file: Path of the JSON file to write.
        """
        df = pd.read_csv(tbrainfile)
        # Round-trip through JSON so every value is a plain, serialisable type.
        records = json.loads(df.to_json(orient='records', force_ascii=False))

        for record in records:
            # The text file name is news_ID - 1 (presumably the files are
            # numbered from 0 while news_ID starts at 1 — TODO confirm).
            blob_path = self.TEXT_FILES_PREFIX + str(record['news_ID'] - 1) + '.txt'
            content = self.get_blob(blob_path)
            if content is not None:
                record['content'] = json.loads(content)
            else:
                # Missing article: keep the record with empty text.
                record['content'] = {'title': '', 'context': ''}

        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
        # must be pinned instead of relying on the platform default.
        with open(file, 'w', encoding='utf-8') as output:
            json.dump(records, output, ensure_ascii=False)

    def _load_records(self, file):
        """Download ``file`` from the bucket and parse it as UTF-8 JSON.

        Raises:
            FileNotFoundError: If ``get_blob`` returns ``None`` for ``file``.
        """
        raw = self.get_blob(file)
        if raw is None:
            raise FileNotFoundError("Blob {} could not be found.".format(file))
        return json.loads(raw.decode('utf-8'))

    def get_all_context(self, file='training_data/tbrain_train_title_context.txt'):
        """Return the ``context`` text of every record in ``file``.

        Raises:
            FileNotFoundError: If the blob does not exist.
        """
        return [row['content']['context'] for row in self._load_records(file)]

    def get_all_title(self, file='training_data/tbrain_train_title_context.txt'):
        """Return the ``title`` of every record in ``file``.

        Raises:
            FileNotFoundError: If the blob does not exist.
        """
        return [row['content']['title'] for row in self._load_records(file)]

    def get_all_name(self, file='training_data/tbrain_train_title_context.txt'):
        """Return the ``name`` field of every record in ``file``.

        Raises:
            FileNotFoundError: If the blob does not exist.
        """
        return [row['name'] for row in self._load_records(file)]

    def get_all(self, file='training_data/tbrain_train_title_context.txt', preprocessed=True):
        """Return titles, names and contexts of every record in ``file``.

        Args:
            file: Object name of the combined JSON file in the bucket.
            preprocessed: When true, titles and contexts are passed through
                ``preprocessing`` first (names are returned untouched).

        Returns:
            A ``(titles, names, contexts)`` tuple of three parallel lists.

        Raises:
            FileNotFoundError: If the blob does not exist.
        """
        titles = []
        names = []
        contexts = []

        for row in self._load_records(file):
            title = row['content']['title']
            context = row['content']['context']

            if preprocessed:
                title = self.preprocessing(title)
                context = self.preprocessing(context)

            names.append(row['name'])
            titles.append(title)
            contexts.append(context)

        return titles, names, contexts

    def preprocessing(self, s):
        """Re-decode Latin-1 text as UTF-8 where possible and strip line breaks.

        Args:
            s: Text to clean.

        Returns:
            ``s`` re-decoded from ISO-8859-1 bytes as UTF-8 (undecodable bytes
            dropped) when it is representable in ISO-8859-1, otherwise ``s``
            unchanged; in both cases with ``\\n`` and ``\\r`` removed.
        """
        # Disabled option: remove every non-Chinese character.
        #         pattern = re.compile(r'[^\u4e00-\u9fa5]')
        #         s = re.sub(pattern, '', s)
        try:
            s = s.encode('ISO-8859-1').decode('utf-8', 'ignore')
        except UnicodeEncodeError:
            # Not representable in ISO-8859-1, so keep the string as it is.
            pass

        return s.replace('\n', '').replace('\r', '')