<a href="https://colab.research.google.com/github/c-c-c-c/dm_integration/blob/master/myMecab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 吉村インテグレーションステップ 

目的は、ドラマの構成/脚本として、「どのタイミングで」「どんな
出来事」が起こると良いかという示唆を見出すこと。

★目的変数
　　ドラマの初回視聴率からの上下動(%で、閾値を良い、悪い、
　　普通になるように３パターン準備)

★特徴量
　　-ドラマを恋愛、刑事、ヒューマンなどのカテゴリーに分ける
　　-さらに、ドラマをステージ(序盤、中盤、終盤)に分ける
　　-ドラマ名-ステージを行にした、単語×ドラマステージ行列を作成

★モデル
　　- 目的変数に対しラッソ(Lasso)回帰を行う。
　　- もしくは、教師なしのグルーピングを行う。(kmeans, トピックモデル)

In [0]:
# Mount Google Drive at /content/drive. The data paths used later are relative
# ("./drive/My Drive/..."), so they only resolve when the working directory is /content.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Install aptitude, then MeCab with its headers, the UTF-8 IPA dictionary and the
# build tools named on the aptitude line, and finally the Python binding pinned to 0.7.
!apt install aptitude
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
!pip install mecab-python3==0.7

In [0]:
# Shallow-clone mecab-ipadic-NEologd and run its installer; "echo yes" answers the
# installer's confirmation prompt.
# NOTE(review): MeCab.Tagger() is created without a -d option later in this notebook, so
# presumably the default dictionary is used rather than NEologd — confirm.
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -a

In [0]:
# Install chardet. NOTE(review): no cell in the visible part of this notebook imports it.
pip install chardet

In [0]:
import joblib
import MeCab
import numpy as np
import pandas as pd
import re
import json
import pprint
import IPython.core.display as display
import IPython.display
from scipy.sparse import csr_matrix
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso


from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.svm import LinearSVC

## データの読み込み

In [0]:
# EPG data (manual corrections are still in progress).
df_epg = pd.read_excel("./drive/My Drive/0_インテグ作業/data/EPG_checking0212.xlsx")

In [0]:
# Load the drama-category mapping. Later cells read its "love" and "police" entries.
# The context manager closes the file handle, which the original cell left open.
with open("./drive/My Drive/0_インテグ作業/data/drama_category0220.json", 'r', encoding="utf-8") as f:
    drama_category_dic = json.load(f)  # parse the JSON document



In [0]:
# pprint.pprint(drama_category_dic)

In [0]:
# Load the win/draw/lose mapping. Later cells read its "win", "draw" and "lose" entries.
# The context manager closes the file handle, which the original cell left open.
with open("./drive/My Drive/0_インテグ作業/data/drama_win_lose.json", 'r', encoding="utf-8") as f:
    drama_win_lose_dic = json.load(f)  # parse the JSON document



In [0]:
# Placeholder column; the tokenisation cell below stores one token list per row in it.
df_epg["sharp_epg_tknz"] = np.nan

In [0]:
# Noise removal

def removeTrash(text):
    """Delete URLs, boilerplate phrases, symbols, station names and digits from a text.

    The patterns are applied one after another in the order listed; every
    match of each pattern is replaced by the empty string.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    import re

    removal_patterns = (
        r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+",   # URLs
        r"番組詳細|制作・著作|制作著作",              # programme boilerplate
        r"[!\(\)=『』～/]",                         # assorted symbols
        r"フジテレビ|日本テレビ|TBS|テレビ朝日|TBS|関西テレビ",  # broadcaster names
        r"\d+",                                    # digit runs
        r"【公式.*?】",                             # "【公式 ...】" tags
        r"\u3000",                                 # ideographic space
    )

    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned


In [0]:
# Tokenise every hand-corrected EPG text with MeCab and store the resulting token
# lists in df_epg["sharp_epg_tknz"]. Rows whose cell is not a string stay NaN.
mecab = MeCab.Tagger()
# Kept from the original cell. NOTE(review): presumably the usual mecab-python3
# workaround for garbled node.surface values — confirm before removing.
mecab.parse("")

# Tokens whose feature string starts with one of these part-of-speech tags are dropped
# (sentence boundary markers, particles, symbols, auxiliary verbs).
excluded_pos_prefixes = ("BOS/EOS", "助詞", "記号", "助動詞")

tokenized_rows = []
for text in df_epg['sharp_epg_hand_corrected']:
    # Missing cells are NaN; any other non-string value is skipped as well, because
    # removeTrash() would raise on it.
    if not isinstance(text, str):
        tokenized_rows.append(np.nan)
        continue

    text_tokenized = []
    # Strip URLs, symbols and other noise before parsing.
    node = mecab.parseToNode(removeTrash(text))
    while node:
        feature = node.feature
        # Also drop anything tagged as a person name ("人名").
        if not feature.startswith(excluded_pos_prefixes) and "人名" not in feature:
            text_tokenized.append(node.surface)
        node = node.next
    tokenized_rows.append(text_tokenized)

# Assign the whole column at once. The original wrote each cell through
# df_epg[col].iloc[i] = ..., a chained assignment pandas does not guarantee to propagate.
df_epg["sharp_epg_tknz"] = pd.Series(tokenized_rows, index=df_epg.index, dtype=object)

In [0]:
# Manual preprocessing is not finished yet, so keep only the rows that were actually
# tokenised: their cell holds a token list, every other row still holds NaN.
# isinstance(..., list) is used instead of `type(x) != float` because a NaN read from a
# float64 column is a numpy.float64, which the type comparison would treat as "done".
bool_list = [isinstance(tokens, list) for tokens in df_epg["sharp_epg_tknz"]]

In [0]:
# Take an explicit copy: the following cells add and modify columns on df_notnull, and
# writing into an un-copied selection of df_epg triggers SettingWithCopyWarning.
df_notnull = df_epg[bool_list].copy()

In [0]:
# Add empty "phys_cnt" and "category" columns and rename the unnamed first column
# of the spreadsheet to "sort_id".
df_notnull = (
    df_notnull
    .assign(phys_cnt=np.nan, category=np.nan)
    .rename(columns={'Unnamed: 0': 'sort_id'})
)

In [0]:
# Show the top-level keys of the category mapping.
drama_category_dic.keys()

In [0]:
# Add the per-drama running row count ("phys_cnt") and the category.
# All writes go through df_notnull.loc[mask, column]; the original used chained
# df_notnull[column].loc[...] = ..., which pandas does not guarantee to write back.

# The column was created as float NaN; make it object so it can hold strings.
df_notnull["category"] = df_notnull["category"].astype(object)

love_cnt = 0      # number of distinct dramas found in the "love" list
police_cnt = 0    # number of distinct dramas found in the "police" list
for tmp_key in df_notnull["drama_key"].unique():
    key_mask = df_notnull["drama_key"] == tmp_key

    if tmp_key in drama_category_dic["love"]:
        love_cnt += 1
        df_notnull.loc[key_mask, "category"] = "love"

    # Checked second on purpose: a drama listed under both ends up as "police",
    # exactly as in the original cell.
    if tmp_key in drama_category_dic["police"]:
        police_cnt += 1
        df_notnull.loc[key_mask, "category"] = "police"

    # Number this drama's rows 1..n in their current order.
    n_rows = int(key_mask.sum())
    if n_rows:
        df_notnull.loc[key_mask, "phys_cnt"] = np.arange(1, n_rows + 1)

    # print(tmp_key)



In [0]:
# Build df_stage: one row per (drama, stage) holding the joined EPG texts of that stage.
#
# Changes from the original cell:
#   * "middle" is 4 <= phys_cnt < 7. The original "4 < phys_cnt < 7" left phys_cnt == 4
#     out of every stage.
#   * The texts are joined with " ".join(...) instead of str(ndarray). The array repr
#     wraps each text in quotes and escapes characters such as U+3000 as a literal
#     "\u3000", which removeTrash() then cannot match and which leaves stray "u" tokens.
#   * Rows are collected in a list and the frame is built once; DataFrame.append was
#     removed in pandas 2.0.

stage_columns = ['drama_key', 'drama_title', 'stages', 'epg_joined', 'drama_category', 'win_lose']
stage_rows = []

for tmp_key in df_notnull.drama_key.unique():
    drama_rows = df_notnull[df_notnull["drama_key"] == tmp_key]

    # First matching outcome label wins; an unlisted drama keeps "" and is reported.
    win_or_lose = ""
    for label in ("win", "draw", "lose"):
        if tmp_key in drama_win_lose_dic[label]:
            win_or_lose = label
            break
    else:
        print("????")

    # str() turns a missing category into the string "nan"; a later cell filters on it.
    tmp_cat = str(drama_rows.category.values[0])
    tmp_title = str(drama_rows.drama_title.values[0])

    phys_cnt = drama_rows["phys_cnt"]
    stage_masks = (
        ("early", phys_cnt < 4),
        ("middle", (phys_cnt >= 4) & (phys_cnt < 7)),
        ("late", phys_cnt >= 7),
    )

    for tmp_stage, stage_mask in stage_masks:
        stage_texts = drama_rows.loc[stage_mask, "sharp_epg_hand_corrected"]
        epg_joined = " ".join(str(stage_text) for stage_text in stage_texts)
        stage_rows.append([tmp_key, tmp_title, tmp_stage, epg_joined, tmp_cat, win_or_lose])

df_stage = pd.DataFrame(stage_rows, columns=stage_columns)



In [0]:
# Preview the rows at positions 1-199 (the slice starts at 1, so position 0 is not shown).
df_stage.iloc[1:200]


In [0]:
# Add an empty column; the next cell stores one token list per row in it.
df_stage["epg_tknz"] = np.nan

In [0]:
# Tokenise the joined EPG text of every stage row with MeCab and store the resulting
# token lists in df_stage["epg_tknz"]. Rows whose cell is not a string stay NaN.
mecab = MeCab.Tagger()
# Kept from the original cell. NOTE(review): presumably the usual mecab-python3
# workaround for garbled node.surface values — confirm before removing.
mecab.parse("")

# Tokens whose feature string starts with one of these part-of-speech tags are dropped
# (sentence boundary markers, particles, symbols, auxiliary verbs).
excluded_pos_prefixes = ("BOS/EOS", "助詞", "記号", "助動詞")

tokenized_rows = []
for text in df_stage['epg_joined']:
    # Missing cells are NaN; any other non-string value is skipped as well, because
    # removeTrash() would raise on it.
    if not isinstance(text, str):
        tokenized_rows.append(np.nan)
        continue

    text_tokenized = []
    # Strip URLs, symbols and other noise before parsing.
    node = mecab.parseToNode(removeTrash(text))
    while node:
        feature = node.feature
        # Also drop anything tagged as a person name ("人名").
        if not feature.startswith(excluded_pos_prefixes) and "人名" not in feature:
            text_tokenized.append(node.surface)
        node = node.next
    tokenized_rows.append(text_tokenized)

# Assign the whole column at once. The original wrote each cell through
# df_stage[col].iloc[i] = ..., a chained assignment pandas does not guarantee to propagate.
df_stage["epg_tknz"] = pd.Series(tokenized_rows, index=df_stage.index, dtype=object)

In [0]:
# Display the token lists produced by the previous cell.
df_stage["epg_tknz"]

In [0]:
# Encode the outcome label as the numeric regression target:
# "win" -> 1, "draw" -> 0, "lose" -> -1. Any other label becomes NaN.
# A single vectorised map replaces the per-row df_stage[col].iloc[i] = ... writes,
# which were chained assignments that pandas does not guarantee to propagate.
win_lose_codes = {"win": 1, "draw": 0, "lose": -1}
df_stage["win_lose_dummy"] = df_stage["win_lose"].map(win_lose_codes).astype(float)

# Same diagnostic as before: one "???" line per row with an unknown label.
for _ in range(int(df_stage["win_lose_dummy"].isna().sum())):
    print("???")

In [0]:
# Clip every matrix entry larger than 1 down to 1.
def bool_bow(argX):
    """Turn a count matrix into a presence/absence matrix, in place.

    Every entry greater than 1 is set to 1; all other entries are left as they
    are. CSR/CSC matrices and NumPy arrays are handled with one vectorised
    operation instead of a Python loop over every cell; any other indexable
    2-D container falls back to the element-wise loop.

    Args:
        argX: 2-D matrix supporting ``argX[i, j]`` (e.g. the sparse matrix
            returned by ``CountVectorizer.fit_transform``). Modified in place.

    Returns:
        The same object that was passed in.
    """
    from scipy.sparse import issparse

    if issparse(argX) and getattr(argX, "format", None) in ("csr", "csc"):
        # Merge duplicate entries first so each stored value is the real cell value.
        argX.sum_duplicates()
        argX.data[argX.data > 1] = 1
    elif isinstance(argX, np.ndarray):
        argX[argX > 1] = 1
    else:
        for j in range(argX.shape[1]):
            for i in range(argX.shape[0]):
                if argX[i, j] > 1:
                    argX[i, j] = 1

    return argX


In [0]:
def check_rare_words(argX, pre_vectorizer, min_doc_count=5):
    """Return the vocabulary words that appear in fewer than ``min_doc_count`` rows.

    Each entry of ``argX`` larger than 1 is counted as 1, the columns are
    summed, and the words whose column sum is below the threshold are returned.
    Unlike the original implementation this works on a copy, so ``argX`` is not
    modified, and no dense DataFrame is built.

    Args:
        argX: Document-term matrix (SciPy sparse matrix or array-like), one
            column per vocabulary entry.
        pre_vectorizer: Fitted vectorizer whose ``vocabulary_`` maps each word
            to its column index in ``argX``.
        min_doc_count: Words with a clipped column sum strictly below this
            value are reported as rare. Defaults to 5, the original constant.

    Returns:
        List of rare words, ordered by column index.

    Raises:
        ValueError: If the vocabulary size does not match the column count.
    """
    from scipy.sparse import issparse

    vocabulary = pre_vectorizer.vocabulary_
    # Order the words by their column index so position j names column j.
    words_by_column = sorted(vocabulary, key=vocabulary.get)

    if issparse(argX):
        clipped = argX.tocsr(copy=True)
        clipped.sum_duplicates()
        clipped.data[clipped.data > 1] = 1
        column_sums = np.asarray(clipped.sum(axis=0)).ravel()
    else:
        column_sums = np.minimum(np.asarray(argX), 1).sum(axis=0).ravel()

    if len(words_by_column) != len(column_sums):
        raise ValueError(
            "vocabulary has %d entries but the matrix has %d columns"
            % (len(words_by_column), len(column_sums))
        )

    return [
        word
        for word, column_sum in zip(words_by_column, column_sums)
        if column_sum < min_doc_count
    ]
    

In [0]:
# Corpus-specific stop words used by fit_trans_lasso (defined below), which takes the
# per-stage DataFrame as its argument. Matching against this list is case-sensitive,
# which is why several words are listed in more than one case.
# Fix: a comma was missing after "(ディーン・フジオカ)", so Python concatenated it with the
# following "genking" into one string that could never match a token.
not_rare_words = []  # module-level leftover; fit_trans_lasso uses its own local list
particular_stop_words = ["いる", "おり", "arata","uend", "c","C" , "この","その" ,"そう","あり","際",
                        "genking","件","メゾン","しれ","きり","u","U","あり","回",
                        "お","leola","ください","また",
                        "後","ー","a","A","なく","or","OR",
                        "ii","II","まま","皆","百","ーー","j","J","どう","x","X","sit","SIT","通",
                        "TSUTAYA","かつ","無い","性","しまい","不","陽",
                        "性","将","点","結","負っ",
                        "luz","その後","すると","方","屋","ほう","フジオカ","事","二","くれる","leg",
                         "genking","さ","ところ",
                         "太郎","数","すると","あげ","上","takahiro",
                        "み","メゾン","ある","よう","あっ","ほしい","彼ら","状","megumi",
                         "なぜ","上げ","まるで","nima","NIMA","j","ii","その後","ビョンホン","x",
                         "tsutaya","頃","げ","史","これ","あ","ディーン・フジオカ","(ディーン・フジオカ)",
                         "genking","ユリカ", "BAR","FUJIOKA","GENKING","ARATA","X","MEGUMI", "メゾン・フローラル",
                         "BANZAI","s","S","一","・フローラル","TAKAHIRO","NAYUTAWAVE","GENKING",
                         "でき","他","ウイルステロブラッディ","ただ","RECORDS","KAMIYA","GEROGE","K",
                         "の","ド","たち","ん","ウイルステロブラッディ・マンデイ","ら","が","uFUJIOKA",
                         "れる","し","何","カヲル","いう","ela","いっ","もと","r","だが","僕","アイ","DEAN",
                         "メグリン","シュウメイリン","オリヴィエ","き","uEND","かけ","いっ","し","ザ","れる","コンテンポラリーダンサー",
                         "Kis","メグリン","ヴィ","JMT","r","R","smap","SMAP",
                         ]

def fit_trans_lasso(df_tmp):
    """Fit a Lasso regression of the win/lose target on binary bag-of-words features.

    Steps:
      1. Vectorise all token lists once to find the rare words
         (``check_rare_words``).
      2. Drop rare words and ``particular_stop_words`` from every token list.
      3. Vectorise the filtered lists, reduce counts to 0/1 (``bool_bow``),
         scale each column to unit variance and fit ``Lasso(alpha=0.1)``.

    Fixes relative to the original implementation:
      * The coefficient rows are labelled with the vocabulary sorted by column
        index. Passing ``vocabulary_`` itself as the index used the dict's
        insertion order, which need not match the column order, so
        coefficients could be attached to the wrong words.
      * The rare-word check compares the lower-cased token, because the
        vocabulary (and therefore ``rare_words``) is lower-cased by
        ``CountVectorizer``. Upper-case tokens previously always passed.
      * The per-token debug print and the debug counter were removed.
      * Rare and stop words are looked up in sets instead of lists.

    Args:
        df_tmp: DataFrame with an "epg_tknz" column holding one list of tokens
            per row and a numeric "win_lose_dummy" column used as the target.

    Returns:
        DataFrame indexed by vocabulary word with a single column ``0`` holding
        the fitted Lasso coefficient of that word.
    """
    token_pattern = r"(?u)\b\w+\b"

    # Pass 1: vectorise everything only to learn which words are rare.
    pre_vectorizer = CountVectorizer(token_pattern=token_pattern)
    Xpre = pre_vectorizer.fit_transform(
        [str(i) for i in df_tmp["epg_tknz"].values]
    )
    rare_words = set(check_rare_words(Xpre, pre_vectorizer))
    stop_words = set(particular_stop_words)

    # Remove rare words and stop words from every document.
    not_rare_words = [
        [
            tmp_word
            for tmp_word in tmp_word_list
            if tmp_word.lower() not in rare_words and tmp_word not in stop_words
        ]
        for tmp_word_list in df_tmp["epg_tknz"].values
    ]

    # Pass 2: vectorise the filtered documents and reduce counts to presence/absence.
    bow_vectorizer = CountVectorizer(token_pattern=token_pattern)
    X = bow_vectorizer.fit_transform(
        [str(i) for i in not_rare_words]
    )
    X = bool_bow(X)

    Y = df_tmp["win_lose_dummy"]

    # Open question carried over from the original author: is standardisation needed here?
    # with_mean=False keeps the matrix sparse (no centring).
    scaler = StandardScaler(with_mean=False)
    X = scaler.fit_transform(X)

    clf = Lasso(alpha=0.1)
    clf.fit(X, Y)

    # Row j of coef_ belongs to column j of X; order the words by their column index.
    vocabulary = bow_vectorizer.vocabulary_
    feature_names = sorted(vocabulary, key=vocabulary.get)
    result_df = pd.DataFrame(clf.coef_.T, index=feature_names)

    return result_df


In [0]:
# NOTE(review): in the cells shown here rare_words is only assigned inside
# fit_trans_lasso, so unless it is also defined at notebook level elsewhere this
# expression raises NameError.
rare_words

In [0]:
class HorizontalDisplay:
    """Show several objects side by side in notebook HTML output.

    Each positional argument must provide ``_repr_html_()``; its HTML is wrapped
    in a left-floating ``<div>`` and the wrapped pieces are joined by newlines.
    """

    def __init__(self, *args):
        self.args = args

    def _repr_html_(self):
        cell_template = '<div style="float: left; padding: 10px;">{0}</div>'
        cells = []
        for item in self.args:
            cells.append(cell_template.format(item._repr_html_()))
        return "\n".join(cells)


In [0]:
#######
# Show the Lasso results for each of the early / middle / late stages
#

def calc_lasso_by_stage(df_orijin):
    """Fit one Lasso model per stage and display the non-zero coefficients side by side.

    Args:
        df_orijin: DataFrame with a "stages" column containing "early",
            "middle" or "late", plus the columns ``fit_trans_lasso`` reads.

    Returns:
        None. The three coefficient tables (early, middle, late), each sorted
        in ascending order of coefficient, are shown via ``display.display``.
    """
    def rows_of(stage_name):
        return df_orijin[df_orijin["stages"] == stage_name]

    def nonzero_sorted(result):
        return result[result[0] != 0].sort_values(0)

    # Models are fitted in the order late, early, middle.
    fitted = {}
    for stage_name in ("late", "early", "middle"):
        fitted[stage_name] = fit_trans_lasso(rows_of(stage_name))

    tables = [nonzero_sorted(fitted[stage_name]) for stage_name in ("early", "middle", "late")]
    display.display(HorizontalDisplay(*tables))


In [0]:
# Per-stage Lasso for the dramas whose category is "love".
calc_lasso_by_stage(df_stage[df_stage['drama_category'] =="love"]  )

In [0]:
# Print the token list of every "love" stage row for manual inspection.
for tmp_e in df_stage[df_stage['drama_category'] =="love"]["epg_tknz"]:
    print(tmp_e)

In [0]:
# particular_stop_words

In [0]:
# Debug check: prints "c" when "genking" is in neither rare_words nor the stop-word list.
# NOTE(review): in the cells shown here rare_words is only assigned inside
# fit_trans_lasso; with the assignment below commented out this raises NameError
# unless rare_words is defined at notebook level elsewhere.
# rare_words = []
tmp_word = "genking"
if tmp_word not in rare_words and tmp_word not in particular_stop_words:
    print("c")

In [0]:
# NOTE(review): rare_words is not assigned at notebook level in the cells shown here;
# this raises NameError unless it is defined elsewhere.
rare_words

In [0]:
# Per-stage Lasso for the dramas whose category is "police".
calc_lasso_by_stage(df_stage[df_stage['drama_category'] =="police"]  )

In [0]:
# Print the token list of every "police" stage row for manual inspection.
for tmp_e in df_stage[df_stage['drama_category'] =="police"]["epg_tknz"]:
    print(tmp_e)

In [0]:
# Dramas without a category: when df_stage was built the category went through str(),
# so a missing value is stored as the string "nan".
df_nan = df_stage[df_stage['drama_category'] =="nan"]
# win_lose_dummy contains NaN for some of these rows, so drop them before fitting.
df_nan = df_nan[ ~pd.isnull(df_nan['win_lose_dummy'])]

calc_lasso_by_stage(df_nan )



In [0]:
# Render a word cloud and save it as sample-wordCloud-jpn.png.
# NOTE(review): msk (mask image) and splitted (input text) are not defined in any cell
# shown here, so this raises NameError as written. font_path is a bare file name, so
# presumably HGRGM.TTC must be in the working directory — confirm.
from PIL import Image
import wordcloud, codecs
wordc = wordcloud.WordCloud(font_path='HGRGM.TTC',
        background_color='white',
        mask=msk,
        contour_color='steelblue',
        contour_width=2).generate(splitted)
wordc.to_file('sample-wordCloud-jpn.png')

In [0]:
#####################
#
#以下、授業の残り
#
####################



In [0]:
# ※これは、演習用に単語文書行列を DataFrame に変換して見やすくしてみるためのコードで、覚える必要はありません
#pd.DataFrame(X.toarray(), columns=[ x[0] for x in sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1]) ])