<a href="https://colab.research.google.com/github/c-c-c-c/dm_integration/blob/master/myMecab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 吉村インテグレーションステップ 

目的は、ドラマの構成/脚本として、「どのタイミングで」「どんな
出来事」が起こると良いかという示唆を見出すこと。

★目的変数
　　ドラマの初回視聴率からの上下動(%で、閾値を良い、悪い、
　　普通になるように３パターン準備)

★特徴量
　　-ドラマを恋愛、刑事、ヒューマンなどのカテゴリーに分ける
　　-さらに、ドラマをステージ(序盤、中盤、終盤)に分ける
　　-ドラマ名-ステージを行にした、単語×ドラマステージ行列を作成

★モデル
　　- 目的変数に対しロッソ回帰を行う。
　　- もしくは、教師なしのグルーピングを行う。(kmeans, トピックモデル)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!apt install aptitude
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
!pip install mecab-python3==0.7

Reading package lists... Done
Building dependency tree       
Reading state information... Done
aptitude is already the newest version (0.8.10-6ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 25 not upgraded.
mecab is already installed at the requested version (0.996-5)
libmecab-dev is already installed at the requested version (0.996-5)
mecab-ipadic-utf8 is already installed at the requested version (2.7.0-20070801+main-1)
git is already installed at the requested version (1:2.17.1-1ubuntu0.5)
make is already installed at the requested version (4.1-9.1ubuntu1)
curl is already installed at the requested version (7.58.0-2ubuntu3.8)
xz-utils is already installed at the requested version (5.2.2-1.3)
file is already installed at the requested version (1:5.32-2ubuntu0.3)
mecab is already installed at the requested version (0.996-5)
libmecab-dev is already installed at the requested version (0.996-5)
mecab-ipadic-utf8 is already installed at the requested version (2.7.0-20070801+mai

In [3]:
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -a

fatal: destination path 'mecab-ipadic-neologd' already exists and is not an empty directory.
[install-mecab-ipadic-NEologd] : Start..
[install-mecab-ipadic-NEologd] : Check the existance of libraries
[install-mecab-ipadic-NEologd] :     find => ok
[install-mecab-ipadic-NEologd] :     sort => ok
[install-mecab-ipadic-NEologd] :     head => ok
[install-mecab-ipadic-NEologd] :     cut => ok
[install-mecab-ipadic-NEologd] :     egrep => ok
[install-mecab-ipadic-NEologd] :     mecab => ok
[install-mecab-ipadic-NEologd] :     mecab-config => ok
[install-mecab-ipadic-NEologd] :     make => ok
[install-mecab-ipadic-NEologd] :     curl => ok
[install-mecab-ipadic-NEologd] :     sed => ok
[install-mecab-ipadic-NEologd] :     cat => ok
[install-mecab-ipadic-NEologd] :     diff => ok
[install-mecab-ipadic-NEologd] :     tar => ok
[install-mecab-ipadic-NEologd] :     unxz => ok
[install-mecab-ipadic-NEologd] :     xargs => ok
[install-mecab-ipadic-NEologd] :     grep => ok
[install-mecab-ipadic-NEo

In [0]:
import joblib
import MeCab
import numpy as np
import pandas as pd
import re
import json
import pprint
from scipy.sparse import csr_matrix
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso


from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.svm import LinearSVC

## データの読み込み

In [0]:
# EPGデータ(手作業での修正中))
df_epg = pd.read_excel("./drive/My Drive/0_インテグ作業/data/EPG_checking0212.xlsx")

In [0]:
f= open("./drive/My Drive/0_インテグ作業/data/drama_category0220.json", 'r')

drama_category_dic = json.load(f) #JSON形式で読み込む



In [0]:
# pprint.pprint(drama_category_dic)

In [0]:
f= open("./drive/My Drive/0_インテグ作業/data/drama_win_lose.json", 'r')

drama_win_lose_dic = json.load(f) #JSON形式で読み込む



In [0]:
df_epg["sharp_epg_tknz"] = np.nan

In [0]:
# ゴミ除去

def removeTrash (text):
    import re

    result_text = text
    result_text = re.sub(r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+", "", result_text)
    result_text = re.sub(r"番組詳細|制作・著作|制作著作", "", result_text)
    result_text = re.sub(r"[!\(\)=『』～/]", "", result_text)
    result_text = re.sub(r"フジテレビ|日本テレビ|TBS|テレビ朝日|TBS|関西テレビ", "", result_text)
    result_text = re.sub(r"\d+", "", result_text)
    
    result_text = re.sub(r"【公式.*?】", "", result_text)
    result_text = re.sub(r"\u3000", "", result_text)

    return result_text


In [120]:
mecab = MeCab.Tagger()
mecab.parse("")
for i, text in enumerate( df_epg['sharp_epg_hand_corrected']):
    text_tokenized = []

    # URL、記号などのゴミを取り除く
    if type(text) is not str: 
        if np.isnan(text) :
            continue 
    text = removeTrash(text)
    node = mecab.parseToNode(text)
    while node:
        node = node.next
        if node is None:
            continue

        if not node.feature.startswith("BOS/EOS") and not node.feature.startswith("助詞") and\
            not node.feature.startswith("記号") and\
            node.feature.find("人名") == -1 and\
            not node.feature.startswith("助動詞"):
            text_tokenized.append(node.surface)

    df_epg["sharp_epg_tknz"].iloc[i] = text_tokenized

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [0]:
# 手作業での前処理が完了してないので、完了したものだけにする
bool_list = [] 

for i in range(len(df_epg["sharp_epg_hand_corrected"])):
    bool_list.append( type( df_epg["sharp_epg_tknz"].iloc[i] ) != float )

In [0]:
df_notnull =  df_epg[bool_list]

In [123]:
df_notnull["phys_cnt"] = np.nan
df_notnull["category"] = np.nan
df_notnull =  df_notnull.rename(columns={ 'Unnamed: 0' : 'sort_id' } )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [124]:
drama_category_dic.keys()

dict_keys(['love', 'police'])

In [125]:
# 物理カウントとカテゴリーを加える
love_cnt = 0
police_cnt = 0
for tmp_key  in df_notnull["drama_key"].unique():
    cnt = 0

    if tmp_key in drama_category_dic["love"]:
        love_cnt += 1

        qry_l = " drama_key == @tmp_key"
        target_idx = df_notnull.query(qry_l).index
        df_notnull[ "category" ].loc [target_idx] = "love"


    if tmp_key in drama_category_dic["police"]:
        police_cnt += 1
        qry_p = " drama_key == @tmp_key"
        target_idx = df_notnull.query(qry_p).index
        df_notnull[ "category" ].loc [target_idx] = "police"

    for sort_id in  df_notnull[df_notnull["drama_key"] == tmp_key]['sort_id'].values:
        cnt  += 1

        qry = " sort_id == @sort_id"
        target_idx = df_notnull.query(qry).index
        df_notnull["phys_cnt"].loc[target_idx] = cnt

    # print(tmp_key)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [126]:
# df_stage ステージごとに作る

df_stage = pd.DataFrame( columns=['drama_key','drama_title','stages','epg_joined','drama_category','win_lose'] )


for tmp_key in df_notnull.drama_key.unique():

    win_or_lose = ""
    if tmp_key in drama_win_lose_dic["win"]:
        win_or_lose = "win"
    elif tmp_key in drama_win_lose_dic["draw"]:
        win_or_lose = "draw"
    elif tmp_key in drama_win_lose_dic["lose"]:
        win_or_lose = "lose"
    else:
        print("????")

    tmp_cat = str(df_notnull[ df_notnull["drama_key"] == tmp_key ].category.values[0])
    tmp_title = str(df_notnull[ df_notnull["drama_key"] == tmp_key ].drama_title.values[0])

    for tmp_stage in ["early","middle" ,"late"]:

        qry = ""
        if tmp_stage == "early":
            qry = "drama_key == @tmp_key & phys_cnt < 4 "

        elif tmp_stage == "middle":
            qry = "drama_key == @tmp_key & 4< phys_cnt < 7 "

        else:
            qry = "drama_key == @tmp_key & phys_cnt >= 7 "

        target_i =  df_notnull.query(qry)["sharp_epg_hand_corrected"]
        epg_joined = str(df_notnull.query(qry)["sharp_epg_hand_corrected"].values)

        tmp_se = pd.Series( [ tmp_key, tmp_title,tmp_stage, epg_joined  ,tmp_cat, win_or_lose ]  , index=df_stage.columns  )
        df_stage = df_stage.append( tmp_se, ignore_index=True )



????


In [127]:
df_stage.iloc[1:200]


Unnamed: 0,drama_key,drama_title,stages,epg_joined,drama_category,win_lose
1,1910_CX_月21,シャーロック,middle,['若宮潤一(岩田剛典)が誉獅子雄(ディーン・フジオカ)に文句を言っている。獅子雄は、同居し...,police,lose
2,1910_CX_月21,シャーロック,late,['誉獅子雄(ディーン・フジオカ)と若宮潤一(岩田剛典)が、いつものようにもめている。自分の...,police,lose
3,1910_CX_火22,まだ結婚できない男,early,['桑野信介(阿部寛)は腕のいい建築士だが、「メリットがない」という考えから結婚の経験はなく...,,draw
4,1910_CX_火22,まだ結婚できない男,middle,['かねてから鎌倉の仏像を愛してやまない、まどか(吉田羊)は、1泊2日の鎌倉旅行に有希江(稲...,,draw
5,1910_CX_火22,まだ結婚できない男,late,['ある日、桜子(咲妃みゆ)から、店舗デザインの仕事を持ち掛けられた桑野(阿部寛)。人が住む...,,draw
6,1910_CX_木22,モトカレマニア,early,['現在失業中の27歳独身・難波ユリカ(新木優子)は、見た目にも気を使い、コミュニケーション...,love,lose
7,1910_CX_木22,モトカレマニア,middle,['マコチ(高良健吾)は、ユリカ(新木優子)のもとへ謝りに行ったものの、追い返されてしまう。...,love,lose
8,1910_CX_木22,モトカレマニア,late,['ユリカ(新木優子)は、マコチ(高良健吾)から「好きだ」と告白され、もう一度彼と付き合うこ...,love,lose
9,1907_CX_木22,ルパンの娘,early,['☆笑い&ラブ&家族ドラマ&サスペンス&アクション&ミュージカル!エンタメてんこ盛りの贅沢...,,draw
10,1907_CX_木22,ルパンの娘,middle,['☆田中みな実演じる女泥棒が大暴れ!深田恭子と新旧ドロンジョ対決!!今夜もツッコミが追いつ...,,draw


In [0]:
# 空の列を足す
df_stage["epg_tknz"] = np.nan

In [129]:
mecab = MeCab.Tagger()
mecab.parse("")
for i, text in enumerate( df_stage['epg_joined']):
    text_tokenized = []

    # URL、記号などのゴミを取り除く
    if type(text) is not str: 
        if np.isnan(text) :
            continue 
    text = removeTrash(text)
    node = mecab.parseToNode(text)
    while node:
        node = node.next
        if node is None:
            continue

        if not node.feature.startswith("BOS/EOS") and not node.feature.startswith("助詞") and\
            not node.feature.startswith("記号") and\
            node.feature.find("人名") == -1 and\
            not node.feature.startswith("助動詞"):
            text_tokenized.append(node.surface)

    df_stage["epg_tknz"].iloc[i] = text_tokenized

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [130]:
df_stage["epg_tknz"]

0       [[', 都内, ある, 病院, 中庭, この, 病院, 勤務, する, 消化, 器, 内科...
1       [[', 典, 誉, 獅子, 雄, ディーン・フジオカ, 文句, 言っ, いる, 同居, し...
2       [[', ディーン・フジオカ, 典, いつも, よう, もめ, いる, 自分, スイーツ, ...
3       [[', 腕, いい, 建築, 士, メリット, ない, 考え, 結婚, 経験, さらに, ...
4       [[', かね, 鎌倉, 仏像, 愛し, やま, 羊, 泊, 日, 鎌倉, 旅行, 誘う, ...
                              ...                        
1906    [[', 鈴, 相, 武, 紗, 季, 会お, オリオン, 座, 忍び込ん, クロワッサン,...
1907    [[', 静岡, 県, 浜松, 市, コンサート, 趣旨, 賛同, し, 歌手, たち, 熱...
1908    [[', 人生, 先, 見え, 始め, 中年, 男性, 昔, 恋人, そっくり, 二, 十,...
1909    [[', 虫垂炎, 入院, し, 病院, 駆け付け, ユリ, 付き添う, 小, 百, 合, ...
1910    [[', 自分, ない, こと, ユリ, 知ら, れ, しまっ, うそ, つい, い, こと...
Name: epg_tknz, Length: 1911, dtype: object

In [131]:
df_stage["win_lose_dummy"] = np.nan
for i in range(len( df_stage["win_lose"] )):

    if df_stage["win_lose"].iloc[i] == "win":
        df_stage["win_lose_dummy"].iloc[i] = 1

    elif df_stage["win_lose"].iloc[i] == "draw":
        df_stage["win_lose_dummy"].iloc[i] = 0

    elif df_stage["win_lose"].iloc[i] == "lose":
        df_stage["win_lose_dummy"].iloc[i] = -1
        
    else :
        print("???")
    # df_stage["win_lose_dummy"] = 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


???
???
???


In [0]:
# 行列の中で1より大きい数値を１にならす
def bool_bow ( argX ) :

    # 各要素を見に行って２以上なら１補正
    for j in range(argX.shape[1]):
        tmp_sum = 0
        for i in range(argX.shape[0]):

            if argX[i, j] > 1:
                argX[i, j] = 1

    return argX


In [0]:
def check_rare_words ( argX , pre_vectorizer ) :
    for j in range(argX.shape[1]):
    
        for i in range(argX.shape[0]):

            if argX[i, j] > 1:
                argX[i, j] =   1
            
    tmp_df_sp = pd.DataFrame(argX.toarray(), columns=[ x[0] for x in sorted(pre_vectorizer.vocabulary_.items(), key=lambda x: x[1]) ])

    result_rare_words =[]

    for   tmp_word in tmp_df_sp.columns:
        tmp_sum = tmp_df_sp[tmp_word].sum()
        if  tmp_sum < 4:
            result_rare_words.append(tmp_word)

    return result_rare_words
    

In [0]:
# ステージごとに分けた後のdfを引数にとる

def fit_trans_lasso (df_tmp) :
    pre_vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
    Xpre = pre_vectorizer.fit_transform(
        [str(i) for i in df_tmp["epg_tknz"].values]
    )
    rare_words = check_rare_words(Xpre, pre_vectorizer)

    bow_vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")

    not_rare_words = []


    for tmp_word_list in  df_tmp["epg_tknz"].values:
        tmp_list = []
        for tmp_word in tmp_word_list:
        
            if tmp_word not in rare_words:
                tmp_list.append(tmp_word)
                
        not_rare_words.append(tmp_list)
    
    X = bow_vectorizer.fit_transform( 
        [str(i) for i in not_rare_words] 
    )
    X = bool_bow( X )

    # import pdb; pdb.set_trace()

    Y = df_tmp["win_lose_dummy"]

    scaler = StandardScaler(with_mean=False)
    clf = Lasso(alpha=0.1)

    # 標準化する必要はない??
    X = scaler.fit_transform(X)
    
    clf.fit(X, Y)

    result_df = pd.DataFrame(clf.coef_.T , index=bow_vectorizer.vocabulary_ )

    return result_df


In [0]:
class HorizontalDisplay:
    def __init__(self, *args):
        self.args = args

    def _repr_html_(self):
        template = '<div style="float: left; padding: 10px;">{0}</div>'
        return "\n".join(template.format(arg._repr_html_())
                         for arg in self.args)


In [0]:
#######
# early middle lateステージごとの結果を出す
#

def calc_lasso_by_stage (df_orijin):
    df_early =  df_orijin[df_orijin["stages"] == "early"]
    df_middle =  df_orijin[df_orijin["stages"] == "middle"]
    df_late =  df_orijin[df_orijin["stages"] == "late"]

    result_late = fit_trans_lasso (df_late) 
    result_early = fit_trans_lasso (df_early) 
    result_middle = fit_trans_lasso (df_middle) 

    import IPython.core.display as display
    import IPython.display

    import pprint

    # dfe = result_early[ result_early[0] != 0].sort_values(0)
    # dfm =result_middle[ result_middle[0] != 0].sort_values(0)
    dfl =result_late[ result_late[0] != 0].sort_values(0)

    print(dfl)
    # display.display(HorizontalDisplay(dfe, dfm, dfl))


In [137]:
calc_lasso_by_stage(df_stage[df_stage['drama_category'] =="love"]  )

              0
悪く    -0.130460
絢     -0.093200
プロポーズ -0.060112
距離    -0.047708
悩ん    -0.045098
別れる   -0.039704
抱く    -0.032011
意識    -0.023266
状態    -0.019019
来     -0.015937
謝罪    -0.015107
街     -0.013848
過ごす   -0.013373
働き    -0.010211
想い    -0.006108
好き    -0.002380
紗      0.002152
戸惑う    0.004658
体調     0.005511
会う     0.006802
直後     0.007836
命じ     0.009810
楽しみ    0.016965
送っ     0.017788
ny     0.019024
思わ     0.021590
描い     0.026098
key    0.030403
残さ     0.039145
しよ     0.043398
フローラル  0.052832
控え     0.069670
メゾン    0.078644
件      0.079300
誘わ     0.088717
顔      0.095800
突然     0.103858


In [138]:
calc_lasso_by_stage(df_stage[df_stage['drama_category'] =="police"]  )

             0
回    -0.076623
控え   -0.047192
ド    -0.044421
必ず   -0.043200
ー    -0.035122
潤    -0.033644
謎    -0.032057
サード  -0.031864
n    -0.029569
上げ   -0.020111
否認   -0.017288
こん   -0.016256
代わり  -0.015910
意識   -0.013967
記者   -0.013072
探す   -0.012958
持っ   -0.010396
手がかり -0.009878
不正   -0.007387
犯    -0.005158
cps  -0.002401
外さ   -0.001777
ネット  -0.000377
せる    0.001388
強盗    0.002898
頼ん    0.003338
会話    0.007672
せ     0.008861
決意    0.009615
無い    0.013501
平     0.013523
処分    0.019731
真実    0.023283
業務    0.030348
名     0.036962
ns    0.046322
係     0.068237


In [0]:
df_nan = df_stage[df_stage['drama_category'] =="nan"]
# win_lose_dummyにnanがあるので・・
df_nan = df_nan[ ~pd.isnull(df_nan['win_lose_dummy'])]

calc_lasso_by_stage(df_nan )



In [0]:

bow_vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
X = bow_vectorizer.fit_transform(
    [str(i) for i in df_stage["epg_tknz"].values]
)



In [0]:
X.toarray()


hoge_df = pd.DataFrame(X.toarray(), columns=[ x[0] for x in sorted(bow_vectorizer.vocabulary_.items(), key=lambda x: x[1]) ])

In [0]:
# hoge_df[hoge_df["ドクタージェジュン"]>1]["ドクタージェジュン"]
# hoge_df[hoge_df["ドクタージェジュン"]>1]["ドクタージェジュン"]
bow_vectorizer.vocabulary_.items()



In [0]:
#####################
#
#以下、授業の残り
#
####################



In [0]:
# ※これは、演習用に単語文書行列を DataFrame に変換して見やすくしてみるためのコードで、覚える必要はありません
#pd.DataFrame(X.toarray(), columns=[ x[0] for x in sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1]) ])