In [5]:
import numpy as np
import pandas as pd
import FinanceDataReader as fdr

In [6]:
# matplotlib rebuild

import matplotlib.pyplot as plt
from matplotlib import font_manager
%matplotlib inline


font_fname = 'C:/Windows/Fonts/malgun.ttf'
font_family = font_manager.FontProperties(fname=font_fname).get_name()

plt.rcParams["font.family"] = font_family
plt.rcParams["axes.grid"] = True
plt.rcParams["figure.figsize"] = (12,6)
plt.rcParams["axes.formatter.useoffset"] = False
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams["axes.formatter.limits"] = -10000, 10000


In [7]:
df = pd.read_csv('./data/preprocessing/preprocessed/df_tokenized.csv',index_col=0)

#### 사업보고서 fillings_month 체크

In [None]:
rcp_month = [str(x)[4:6] for x in df.rcp_dt]
rcp_df = pd.DataFrame(rcp_month,columns={'발행월'})
hist = plt.hist(rcp_df.발행월,bins = None)
plt.show()

## wordcount

In [None]:
def word_count(ls):  #document의 word count
    ls = ls.split()
    counts = dict()
    for word in ls:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
            
    return counts


In [None]:
def total_word_count(srs): #total document의 word count
    ls = [x.split() for x in srs]
    counts = dict()
    for doc in ls:
        for word in doc:
            if word in counts:
                counts[word] += 1
            else:
                counts[word] = 1
    return counts

In [None]:
tokens_dic = total_word_count(df.str_tokens)
nouns_dic = total_word_count(df.str_nouns)
tfd_tokens = pd.DataFrame(tokens_dic.items(),columns = ['단어','빈도']).sort_values('빈도',ascending=False)
tfd_nouns = pd.DataFrame(nouns_dic.items(),columns = ['단어','빈도']).sort_values('빈도',ascending=False)

### Wordcloud

In [None]:
from wordcloud import WordCloud #wordcloud 생성
from IPython.display import set_matplotlib_formats
import matplotlib

matplotlib.rc('font',family = 'Malgun Gothic')
set_matplotlib_formats('retina')
matplotlib.rc('axes',unicode_minus = False)
plt.rcParams["figure.figsize"] = (15,8)
wordcloud = WordCloud(font_path = 'C:/Windows/Fonts/malgun.ttf', background_color='white',colormap = "Accent_r",
            width=1500, height=1000).generate_from_frequencies(nouns_dic) #여기에 dic형태넣어주기

plt.imshow(wordcloud)
plt.axis('off')
plt.show;

## Document size check

In [None]:
print(df.shape)
print(tfd_tokens.shape)
print(tfd_nouns.shape)

dataset은 10년동안 kospi200에 계속 편입되어있던 113개의 기업의 발간 사업보고서들인 1117개의 Documents입니다.
Mecab으로 형태소분석을 했을 때 총 토큰들과 그 중 명사만 뽑아 term document freqeuncy를 빈도가 높은 순 대로 살펴보았는데 각각 14198, 9865개의 단어 수를 기록했습니다. 형태소분석 단어집합들은 숫자를 치환한 'num'와 조사가 빈도수가 가장 많았습니다.


다음으로는 조사와 num을 제외하여 단어 count를 살펴보고, tfd_nouns와 비교 후 이를 documnet filings 연도별로 매칭시켜 연도별 word counts의 frequency를 살펴보도록 하자.

In [None]:
df_count = df.copy()
df_count = df_count.loc[:,['연도','crp_nm','rpt_nm']]
word_count_ls = [len(word_count(x).items()) for x in df.str_tokens]
word_count_noun_ls = [len(word_count(x).items()) for x in df.str_nouns]
df_count['document_size'] = word_count_ls
df_count['document_noun_size'] = word_count_noun_ls

In [None]:
df_count.document_size.describe()

In [None]:
df_count = df_count.sort_values(by = '연도')
year_ls = [sum(df_count[df_count.연도 == x].document_size) for x in df_count.연도.unique()]
year_noun_ls = [sum(df_count[df_count.연도 == x].document_noun_size) for x in df_count.연도.unique()]
doc_size_by_year = pd.DataFrame(list(zip(df_count.연도.unique(),year_ls,year_noun_ls)),columns= {'year','document','noun'})
doc_size_by_year = doc_size_by_year.sort_values(by = 'year', ascending= True).reset_index(drop=True)

In [None]:
import matplotlib.pyplot as plt

plt.bar(doc_size_by_year.year, doc_size_by_year.document)

plt.xlabel('year')
plt.ylabel('document size')
plt.show()

Similarity_Cohen_Malloy_Nguyen(2019)Lazy Prices의 내용과 마찬가지로 미국 뿐만 아니라 한국에서도 10년간 사업보고서 내의 단어 빈도가 꾸준히 증가하였다. 토큰화된 사업보고서의 텍스트를 기준으로 하였으며 명사로 봤을 때도 단조 증가하는 양상은 동일하였다.

## Document Similarity

In [2]:
def length_adjust(str_) : 
    return (2-len(str_)) * '0' + str_

def date_convertor(date) : 
    year = length_adjust(str(date.year))
    month = length_adjust(str(date.month))
    day = length_adjust(str(date.day))
    yyyymmdd = int(year+month+day)
    return yyyymmdd

In [None]:
# naive_df = df.copy()
# lv2_index = pd.date_range(start='20100101', end='20191231')
# lv2_index = list(map(date_convertor,lv2_index))
# lv2_columns = naive_df.crp_nm.unique()

# lv2_df = \
# pd.DataFrame(np.full(shape=(len(lv2_index),len(lv2_columns)),fill_value=np.nan),index=lv2_index,columns=lv2_columns)
# lv2_df.shape

In [8]:
def jaccard_sim(df):
    
    naive_df = df.copy()
    
    lv2_index = pd.date_range(start='20100101', end='20191231')
    lv2_index = list(map(date_convertor,lv2_index))
    lv2_columns = naive_df.crp_nm.unique()

    lv2_df = \
    pd.DataFrame(np.full(shape=(len(lv2_index),len(lv2_columns)),fill_value=np.nan),index=lv2_index,columns=lv2_columns)
    
    for process,corp_name in enumerate(lv2_columns) : 
        print("Process : {} | Total : {}".format(process+1,len(lv2_columns)),end='\r')
        # slicing the dataset by corp_name
        tmp_df = naive_df[naive_df.crp_nm == corp_name]

        # set-up the time range 
        min_yr = tmp_df['연도'].min() #2010
        max_yr = tmp_df['연도'].max() #2019

        # container for saving the value 
        # those lists should be refreshed every loop.
        sim_ls = []; year_ls = []

        for yr in range(min_yr,max_yr) : 
            pre_df = tmp_df[tmp_df['연도'] == yr]
            next_df = tmp_df[tmp_df['연도'] == yr+1]
            publish_date = tmp_df[tmp_df['연도'] == yr+1]['rcp_dt']
            if len(pre_df) == 0 or len(next_df) == 0:
                continue
                # we don't have data
                # for example, we have 2014 but don't 2015.
                # therfore we can't measure similarity between 2014,2015.
                # so the value in lv2 at 2015 should be nan

            else : 
                pre_token = pre_df.str_tokens.map(lambda x : x.split(" ")).values
                next_token = next_df.str_tokens.map(lambda x : x.split(" ")).values   
                if type(pre_token[0]) == list and type(next_token[0]) == list :
                    sim = len(set(next_token[0])&set(pre_token[0]))/len(set(next_token[0] + pre_token[0]))
                else : 
                    sim = len(set(next_token)&set(pre_token))/len(set(next_token + pre_token))
            year_ls.append(publish_date.values[0])
            sim_ls.append(sim)

        assert len(year_ls) == len(sim_ls)
        lv2_df.loc[year_ls,corp_name] = sim_ls    
        
    return lv2_df

In [14]:
def cosine_sim(df):
    
    
    naive_df = df.copy()
    lv2_index = pd.date_range(start='20100101', end='20191231')
    lv2_index = list(map(date_convertor,lv2_index))
    lv2_columns = naive_df.crp_nm.unique()

    lv2_df = \
    pd.DataFrame(np.full(shape=(len(lv2_index),len(lv2_columns)),fill_value=np.nan),index=lv2_index,columns=lv2_columns)
    
    for process,corp_name in enumerate(lv2_columns):
        print("Process : {} | Total : {}".format(process+1,len(lv2_columns)),end='\r')

        tmp_df = naive_df[naive_df.crp_nm == corp_name]

        min_yr = tmp_df['연도'].min() #2010
        max_yr = tmp_df['연도'].max() #2019

        tmp_tf = tfidf.fit_transform(tmp_df.str_tokens) #vectorize as matrix
        tmp_cosine = linear_kernel(tmp_tf,tmp_tf)

        sim_ls = []; year_ls = []

        for yr in range(min_yr,max_yr) :

            pre_df = tmp_df[tmp_df['연도'] == yr]
            next_df = tmp_df[tmp_df['연도'] == yr+1]

            publish_date = tmp_df[tmp_df['연도'] == yr+1]['rcp_dt']

            if len(pre_df) == 0 or len(next_df) ==0:
                continue

            else:
                concat_df = pd.concat([pre_df,next_df]) 
                co_vec = tfidf.fit_transform(concat_df.str_nouns)
                co_sim = linear_kernel(co_vec)[1][0]

            year_ls.append(publish_date.values[0])
            sim_ls.append(co_sim)

            assert len(year_ls) == len(sim_ls)
            lv2_df.loc[year_ls,corp_name] = sim_ls
    
    return lv2_df

In [20]:
def select_sim(df,tmp):
    
    
    naive_df = df.copy()
    lv2_index = pd.date_range(start='20100101', end='20191231')
    lv2_index = list(map(date_convertor,lv2_index))
    lv2_columns = naive_df.crp_nm.unique()

    lv2_df = \
    pd.DataFrame(np.full(shape=(len(lv2_index),len(lv2_columns)),fill_value=np.nan),index=lv2_index,columns=lv2_columns)
    
    for process,corp_name in enumerate(lv2_columns):
        print("Process : {} | Total : {}".format(process+1,len(lv2_columns)),end='\r')

        tmp_df = naive_df[naive_df.crp_nm == corp_name]

        min_yr = tmp_df['연도'].min() #2010
        max_yr = tmp_df['연도'].max() #2019

        tmp_tf = tfidf.fit_transform(tmp_df.str_tokens) #vectorize as matrix
        tmp_cosine = linear_kernel(tmp_tf,tmp_tf)

        sim_ls = []; year_ls = []

        for yr in range(min_yr,max_yr) :

            pre_df = tmp_df[tmp_df['연도'] == yr]
            next_df = tmp_df[tmp_df['연도'] == yr+1]

            publish_date = tmp_df[tmp_df['연도'] == yr+1]['rcp_dt']

            if len(pre_df) == 0 or len(next_df) ==0:
                continue

            else:
                if tmp == 1:
                    concat_df = pd.concat([pre_df,next_df]) 
                    co_vec = tfidf.fit_transform(concat_df.str_nouns)
                    sim = linear_kernel(co_vec)[1][0]
                    
                elif tmp ==2:
                    pre_token = pre_df.str_tokens.map(lambda x : x.split(" ")).values
                    next_token = next_df.str_tokens.map(lambda x : x.split(" ")).values   
                    if type(pre_token[0]) == list and type(next_token[0]) == list :
                        sim = len(set(next_token[0])&set(pre_token[0]))/len(set(next_token[0] + pre_token[0]))
                    else : 
                        sim = len(set(next_token)&set(pre_token))/len(set(next_token + pre_token))
                    

            year_ls.append(publish_date.values[0])
            sim_ls.append(sim)

            assert len(year_ls) == len(sim_ls)
            lv2_df.loc[year_ls,corp_name] = sim_ls
    
    return lv2_df

In [9]:
lv2_jaccard = jaccard_sim(df)
lv2_jaccard.mean()

Process : 113 | Total : 113

삼양홀딩스      0.576806
유한양행       0.716340
CJ대한통운     0.575064
두산         0.586907
대림산업       0.608455
             ...   
LF         0.829409
후성         0.728265
SK이노베이션    0.650399
CJ제일제당     0.683321
KB금융       0.745644
Length: 113, dtype: float64

In [10]:
lv2_jaccard.mean().mean()

0.6590837093668442

In [11]:
lv2_jaccard[lv2_jaccard.loc[:,'삼양홀딩스'].isnull() == False]

Unnamed: 0,삼양홀딩스,유한양행,CJ대한통운,두산,대림산업,한국테크놀로지그룹,기아차,동아쏘시오홀딩스,SK하이닉스,영풍,...,현대백화점,한국금융지주,GS,하나금융지주,아모레퍼시픽,LF,후성,SK이노베이션,CJ제일제당,KB금융
20110331,0.482558,,0.615385,0.579749,0.630503,0.203113,0.705706,,0.75,0.5762,...,0.873362,,,0.462889,0.577154,,0.633452,0.574394,0.755193,0.71663
20120330,0.349498,0.691877,0.500778,0.484272,0.604478,0.65097,0.40438,0.446479,0.527778,0.372517,...,,,0.692913,0.64977,0.55595,0.895184,0.592593,0.425806,0.579278,0.71066
20130401,0.541254,,,,0.714286,0.597826,0.536538,0.645907,,0.52589,...,0.72591,,,0.643997,0.551537,,0.533141,0.571237,0.730924,
20140331,0.623506,0.576659,0.528402,0.468793,0.643312,0.89899,0.697959,0.668571,0.963636,0.510526,...,0.763092,0.471028,0.881773,0.776367,0.604414,0.650526,0.604839,0.543253,0.754881,0.737799
20150331,0.688259,0.782927,0.616085,0.58971,0.665848,0.92446,0.677355,0.529118,0.202312,0.812367,...,0.640756,0.772939,0.130774,0.678119,0.5184,0.923913,0.818681,0.672179,0.678571,0.72449
20160330,0.632207,0.574423,0.579921,0.591931,0.571914,0.933852,0.791111,0.674319,0.317259,0.782341,...,0.726695,0.566667,0.724882,0.677509,0.487842,0.785276,0.794721,0.652893,0.599624,0.781513
20170331,0.643299,0.816705,0.541444,0.597531,0.629448,0.604712,0.692469,0.867362,0.585882,0.680357,...,0.729977,0.629002,0.743371,0.703704,0.556535,,0.890244,0.864389,0.637708,0.701692
20180402,0.540034,0.715789,0.59949,,,0.739583,0.646035,0.815789,0.321534,0.839187,...,0.652427,0.507442,0.578341,0.780149,0.638095,,0.828804,0.739049,,0.753463
20190401,0.690635,0.532609,0.727488,,0.510638,0.625402,0.6787,0.738854,,0.811252,...,0.758364,0.564672,0.671296,0.77672,0.543662,0.870103,0.857909,0.810392,0.677365,0.821839


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
from sklearn.metrics.pairwise import linear_kernel

In [23]:
lv2_jaccard.mean().mean()

0.6590837093668442

In [22]:
lv2_cosine.mean().mean()

0.8491577150668567

In [24]:
test1 = select_sim(df,1)

Process : 1 | Total : 113

NameError: name 'co_sim' is not defined

In [17]:
lv2_cosine[lv2_cosine.loc[:,'삼양홀딩스'].isnull() == False]

Unnamed: 0,삼양홀딩스,유한양행,CJ대한통운,두산,대림산업,한국테크놀로지그룹,기아차,동아쏘시오홀딩스,SK하이닉스,영풍,...,현대백화점,한국금융지주,GS,하나금융지주,아모레퍼시픽,LF,후성,SK이노베이션,CJ제일제당,KB금융
20110331,0.622024,,0.77316,0.912678,0.841253,0.431589,0.920595,,0.797388,0.806065,...,0.962223,,,0.748002,0.749037,,0.792693,0.789827,0.934788,0.950537
20120330,0.434998,0.873114,0.801449,0.842174,0.807316,0.853189,0.694075,0.540498,0.702372,0.631874,...,,,0.781712,0.963943,0.771763,0.910761,0.788836,0.485055,0.800814,0.954953
20130401,0.778899,,,,0.932009,0.626158,0.808872,0.839238,,0.769667,...,0.881104,,,0.948453,0.790564,,0.677898,0.788815,0.927363,
20140331,0.874241,0.885923,0.831608,0.759681,0.923337,0.953803,0.926286,0.769244,0.981531,0.731935,...,0.894385,0.62428,0.882901,0.952799,0.891831,0.537132,0.685722,0.844073,0.936276,0.97453
20150331,0.89352,0.929097,0.879566,0.949291,0.936706,0.958228,0.938524,0.655989,0.231396,0.95896,...,0.889942,0.954915,0.621349,0.928968,0.839272,0.983096,0.943338,0.953768,0.888349,0.971369
20160330,0.898578,0.80671,0.818457,0.95059,0.902032,0.975595,0.966084,0.864378,0.516489,0.92552,...,0.916792,0.818755,0.956766,0.957387,0.743325,0.895004,0.889216,0.922507,0.725712,0.979348
20170331,0.836248,0.963508,0.821509,0.953412,0.907106,0.751839,0.92539,0.960596,0.782483,0.913526,...,0.935313,0.899389,0.970278,0.951446,0.826123,,0.959064,0.973943,0.808266,0.970944
20180402,0.798907,0.878734,0.844829,,,0.92253,0.883183,0.927719,0.543521,0.934352,...,0.842583,0.813153,0.968133,0.966075,0.858522,,0.865815,0.96611,,0.96659
20190401,0.919519,0.791413,0.951014,,0.693943,0.856799,0.900001,0.89174,,0.869888,...,0.945849,0.899971,0.97463,0.988352,0.827661,0.902945,0.942776,0.989362,0.847765,0.97268


In [18]:
lv2_jaccard[lv2_jaccard.loc[:,'삼양홀딩스'].isnull() == False]

Unnamed: 0,삼양홀딩스,유한양행,CJ대한통운,두산,대림산업,한국테크놀로지그룹,기아차,동아쏘시오홀딩스,SK하이닉스,영풍,...,현대백화점,한국금융지주,GS,하나금융지주,아모레퍼시픽,LF,후성,SK이노베이션,CJ제일제당,KB금융
20110331,0.482558,,0.615385,0.579749,0.630503,0.203113,0.705706,,0.75,0.5762,...,0.873362,,,0.462889,0.577154,,0.633452,0.574394,0.755193,0.71663
20120330,0.349498,0.691877,0.500778,0.484272,0.604478,0.65097,0.40438,0.446479,0.527778,0.372517,...,,,0.692913,0.64977,0.55595,0.895184,0.592593,0.425806,0.579278,0.71066
20130401,0.541254,,,,0.714286,0.597826,0.536538,0.645907,,0.52589,...,0.72591,,,0.643997,0.551537,,0.533141,0.571237,0.730924,
20140331,0.623506,0.576659,0.528402,0.468793,0.643312,0.89899,0.697959,0.668571,0.963636,0.510526,...,0.763092,0.471028,0.881773,0.776367,0.604414,0.650526,0.604839,0.543253,0.754881,0.737799
20150331,0.688259,0.782927,0.616085,0.58971,0.665848,0.92446,0.677355,0.529118,0.202312,0.812367,...,0.640756,0.772939,0.130774,0.678119,0.5184,0.923913,0.818681,0.672179,0.678571,0.72449
20160330,0.632207,0.574423,0.579921,0.591931,0.571914,0.933852,0.791111,0.674319,0.317259,0.782341,...,0.726695,0.566667,0.724882,0.677509,0.487842,0.785276,0.794721,0.652893,0.599624,0.781513
20170331,0.643299,0.816705,0.541444,0.597531,0.629448,0.604712,0.692469,0.867362,0.585882,0.680357,...,0.729977,0.629002,0.743371,0.703704,0.556535,,0.890244,0.864389,0.637708,0.701692
20180402,0.540034,0.715789,0.59949,,,0.739583,0.646035,0.815789,0.321534,0.839187,...,0.652427,0.507442,0.578341,0.780149,0.638095,,0.828804,0.739049,,0.753463
20190401,0.690635,0.532609,0.727488,,0.510638,0.625402,0.6787,0.738854,,0.811252,...,0.758364,0.564672,0.671296,0.77672,0.543662,0.870103,0.857909,0.810392,0.677365,0.821839
