In [1]:
import numpy as np
import os
import pandas as pd

from abeja.datalake import Client as DatalakeClient

# set datalake channel_id
channel_id = 'XXXXXX'

In [2]:
#データレイクに存在するファイル名とURLをもって来る関数
def load_latest_file_from_datalake(channel_id):
    datalake_client = DatalakeClient()
    channel = datalake_client.get_channel(channel_id)
    dic_url = {}

    # load latest file path
    for f in channel.list_files(sort='-uploaded_at'):
        dic_url = {f.metadata['filename']:f.download_url for f in channel.list_files() if 'filename' in f.metadata}

    return dic_url

In [3]:
dic_url = load_latest_file_from_datalake(channel_id)
print(dic_url)

{'dokujo-tsushin.txt': 'https://abeja-datalake-production.s3.amazonaws.com/4a28-1788540696783/20190618/105138-9613a864-2f66-403b-a2fc-01ec22718d3f?AWSAccessKeyId=AKIAIFE7IPCVDCYAQWZQ&Signature=l%2BYWS8LItcXOY0j2liIAg5J6joA%3D&Expires=1561002330', 'it-life-hack.txt': 'https://abeja-datalake-production.s3.amazonaws.com/4a28-1788540696783/20190618/105351-b5ff74e8-6979-49d1-b6f2-3254d6947ce3?AWSAccessKeyId=AKIAIFE7IPCVDCYAQWZQ&Signature=%2Fd8i4%2FM42DtaopGUdTTLnLwZj8U%3D&Expires=1561002330', 'kaden-channel.txt': 'https://abeja-datalake-production.s3.amazonaws.com/4a28-1788540696783/20190618/105558-cedeca28-333f-4e1a-b5d9-397fca427bea?AWSAccessKeyId=AKIAIFE7IPCVDCYAQWZQ&Signature=Po3dg66O7prPZ3LvLuhjjR929pQ%3D&Expires=1561002330', 'livedoor-homme.txt': 'https://abeja-datalake-production.s3.amazonaws.com/4a28-1788540696783/20190618/105726-29d8e4ed-c560-4ffe-a8de-0359a5530970?AWSAccessKeyId=AKIAIFE7IPCVDCYAQWZQ&Signature=72jQTg67nUlbX0HEkqqAnDCZyEg%3D&Expires=1561002330', 'movie-enter.txt': '

In [4]:
#前処理、名詞のみ対象として解析を実施する
import urllib
tag_list = []
word_list = {}
doc_list = []

for key, value in dic_url.items():
    key = key.replace(".txt","")
    tag_list.append(key) 
    word_list[key] = [] 
    
    with urllib.request.urlopen(value) as f:
        text_data = f.read().decode('utf-8')
        text_data_list = text_data.split("\n")
    
        for text in text_data_list:
            if text == 'EOS' or text =="":
                word_list[key].append(doc_list)
                doc_list = []
            else:
                text = text.split("\t")
                word = text[0]
                word_meta = text[1]
                
                word_detail = word_meta.split(",")
                if word_detail[0] == '名詞':
                    doc_list.append(word)

In [5]:
tag_list

['dokujo-tsushin',
 'it-life-hack',
 'kaden-channel',
 'livedoor-homme',
 'movie-enter',
 'peachy',
 'smax',
 'sports-watch',
 'topic-news']

In [6]:
#gensimのinstall
!pip3 install gensim

Looking in indexes: https://pypi.org/simple, https://packagecloud.io/abeja/platform-public/pypi/simple
[33mYou are using pip version 19.0.3, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [7]:
all_data = []
all_label = []

#全てのデータを結合
for index, tag in enumerate(tag_list):    
    number = int(len(word_list[tag]))
    label = [index] 
    all_data = all_data + word_list[tag]
    all_label = all_label + label *  number

In [8]:
#出現ワードの辞書を作成
from gensim import corpora

dic = corpora.Dictionary(all_data)

# 「出現頻度が20未満の単語」と「30%以上の文書で出現する単語」を排除
dic.filter_extremes(no_below = 20, no_above = 0.3)
bow_corpus = [dic.doc2bow(d) for d in all_data]

#辞書の保存
dic.save_as_text('../model/livedoordic.txt')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [9]:
#TFIDFモデルの作成と適用
from gensim import models

tfidf_model = models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf_model[bow_corpus]

#tf-idfモデルの保存
tfidf_model.save('../model/tfidf_model.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [10]:
#LSIモデルの作成と次元圧縮(100次元)
from gensim import models

dime = 100
lsi_model = models.LsiModel(tfidf_corpus, id2word = dic, num_topics = dime)
lsi_corpus = lsi_model[tfidf_corpus]

#次元圧縮後のモデルを保存
lsi_model.save('../model/lsi_model.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [11]:
#gensimコーパスからdenseへ
from gensim import matutils

all_dense = list(matutils.corpus2dense(lsi_corpus, num_terms=dime, num_docs=len(all_data)).T)

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

#トレーニング・テストデータの設定
train_data, test_data, train_label, test_label = train_test_split(all_dense, all_label, test_size=0.4, random_state=1)

#データの標準化
sc = StandardScaler()
sc.fit(all_dense)
train_std = sc.transform(train_data)
test_std = sc.transform(test_data)

In [13]:
#学習モデル(SVM)の作成
clf = SVC(C = 1, kernel = 'rbf')
clf.fit(train_std, train_label)



SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [14]:
#スコアの表示
score = clf.score(test_std, test_label)
print("{:.3g}".format(score))

0.913


In [15]:
#SVCモデルの保存
import pickle
filename = '../model/SVC_model.sav'
pickle.dump(clf, open(filename, 'wb'))

In [16]:
#xgboostのinstall
!pip3 install xgboost

Looking in indexes: https://pypi.org/simple, https://packagecloud.io/abeja/platform-public/pypi/simple
[33mYou are using pip version 19.0.3, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [17]:
import xgboost as xgb

# 学習データからXGBoost用のデータを生成
# モデルのインスタンス作成
XG_model = xgb.XGBClassifier()
XG_model.fit(train_std, train_label)

#スコアの表示
y_test_pred = XG_model.predict(test_std)
sum(XG_model.predict(test_std) == test_label) / len(test_label)

0.8973229413758048

In [18]:
#SVCモデルの保存
filename = '../model/Xgboost_model.sav'
pickle.dump(XG_model, open(filename, "wb"))

In [19]:
#LightGBMのinstall
!pip install lightgbm

Looking in indexes: https://pypi.org/simple, https://packagecloud.io/abeja/platform-public/pypi/simple
[33mYou are using pip version 19.0.3, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [20]:
import lightgbm as lgb
from sklearn import datasets

# 学習データからLightGBM用のデータを生成
# モデルのインスタンス作成

lgbm_params = {
        # 多値分類問題
        'objective': 'multiclass',
        # クラス数は 9
        'num_class': 9,
    }

lgb_train = lgb.Dataset(train_std, train_label)
lgb_eval = lgb.Dataset(test_std, test_label, reference=lgb_train)

lgbm_model = lgb.train(lgbm_params, lgb_train, valid_sets=lgb_eval)

[1]	valid_0's multi_logloss: 1.87991
[2]	valid_0's multi_logloss: 1.67013
[3]	valid_0's multi_logloss: 1.5109
[4]	valid_0's multi_logloss: 1.38019
[5]	valid_0's multi_logloss: 1.27073
[6]	valid_0's multi_logloss: 1.17732
[7]	valid_0's multi_logloss: 1.09745
[8]	valid_0's multi_logloss: 1.02543
[9]	valid_0's multi_logloss: 0.963025
[10]	valid_0's multi_logloss: 0.906399
[11]	valid_0's multi_logloss: 0.856282
[12]	valid_0's multi_logloss: 0.810944
[13]	valid_0's multi_logloss: 0.76976
[14]	valid_0's multi_logloss: 0.733119
[15]	valid_0's multi_logloss: 0.699309
[16]	valid_0's multi_logloss: 0.669019
[17]	valid_0's multi_logloss: 0.641399
[18]	valid_0's multi_logloss: 0.616398
[19]	valid_0's multi_logloss: 0.593147
[20]	valid_0's multi_logloss: 0.571513
[21]	valid_0's multi_logloss: 0.552192
[22]	valid_0's multi_logloss: 0.533945
[23]	valid_0's multi_logloss: 0.51773
[24]	valid_0's multi_logloss: 0.502202
[25]	valid_0's multi_logloss: 0.488006
[26]	valid_0's multi_logloss: 0.475481
[27]	v

In [21]:
#スコアの表示
test_pred = lgbm_model.predict(test_std, num_iteration=lgbm_model.best_iteration)
test_pred_max = np.argmax(test_pred, axis=1)
accuracy = sum(test_label == test_pred_max) / len(test_label)
print(accuracy)

0.9051169095221958


In [22]:
#LightGBMモデルの保存
filename = '../model/LightGBM_model.sav'
pickle.dump(lgbm_model, open(filename, "wb"))