In [2]:
import pandas as pd
import numpy as np

# Load lv1

토큰화한 텍스트를 불러오는데 2020년에 사업보고서가 나온 기업(rcp_dt = 2020..)이 존재하여 제거해주었다.

In [3]:
# Load the tokenized report table; the first CSV column becomes the index.
df = pd.read_csv("./data/preprocessing/preprocessed/df_tokenized.csv",index_col=0)

# Filter the dataset
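- There are several types of report, such as restatements and first-published versions.
- To simplify the task, only '사업보고서' reports are meant to be used. Note that the filter in the next cell is currently commented out, so all report types are kept.  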

In [4]:
# Work on a copy so the loaded frame `df` itself is never modified.
naive_df = df.copy()
# NOTE(review): the '사업보고서'-only filter and the sort by '연도' below are
# commented out, so every report type present in df is currently kept.
# Confirm whether that is intended.
# naive_df = df[df.rpt_nm.map(lambda x : x.split(" ")[0]) == '사업보고서']
# naive_df.sort_values("연도",inplace=True)

# Set lv2

In [5]:
def length_adjust(str_):
    """Left-pad ``str_`` with '0' characters up to a width of two.

    Strings that already have two or more characters are returned unchanged.
    """
    return str_.rjust(2, '0')

def date_convertor(date):
    """Encode a date as a yyyymmdd integer.

    ``date`` is any object exposing ``year``, ``month`` and ``day`` attributes.
    Each field is zero-padded to at least two characters by ``length_adjust``
    before the three pieces are concatenated and parsed as one integer.
    """
    fields = (date.year, date.month, date.day)
    return int("".join(length_adjust(str(field)) for field in fields))

In [6]:
# Row labels: every calendar day from 2010-01-01 to 2019-12-31 as a yyyymmdd integer.
lv2_index = [
    date_convertor(day)
    for day in pd.date_range(start='20100101', end='20191231')
]
# Column labels: one per company, in order of first appearance in naive_df.
lv2_columns = naive_df.crp_nm.unique()

# All-NaN float table (dates x companies); cells are filled in later cells.
lv2_df = pd.DataFrame(np.nan, index=lv2_index, columns=lv2_columns)

In [7]:
lv2_df.shape
# (number of dates, number of companies)

(3652, 113)

# Generate the lv2

In [8]:
%%time
# Fill lv2_df in place with a year-over-year vocabulary-overlap score per company.
# For each pair of consecutive report years (yr, yr+1) the score is
#     |tokens(yr+1) & tokens(yr)| / |tokens(yr+1)|
# i.e. the share of the newer report's distinct tokens that already appeared in
# the previous year's report. The value is stored on the row labelled with the
# rcp_dt of the (yr+1) report.
company_list = naive_df.crp_nm.unique()
for process, corp_name in enumerate(company_list):
    print("Process : {} | Total : {}".format(process+1,len(company_list)),end='\r')
    # All rows belonging to this company.
    corp_df = naive_df[naive_df.crp_nm == corp_name]

    # Per-company accumulators; they must start empty for every company.
    publish_dates = []
    similarities = []

    for yr in range(corp_df['연도'].min(), corp_df['연도'].max()):
        pre_df = corp_df[corp_df['연도'] == yr]
        next_df = corp_df[corp_df['연도'] == yr + 1]
        if pre_df.empty or next_df.empty:
            # One side of the pair is missing (e.g. 2014 exists but 2015 does
            # not), so no similarity can be measured; the cell stays NaN.
            continue

        # When a year has several rows, only the first one is compared.
        pre_tokens = set(pre_df.str_tokens.iloc[0].split(" "))
        next_tokens = set(next_df.str_tokens.iloc[0].split(" "))

        publish_dates.append(next_df['rcp_dt'].values[0])
        similarities.append(len(next_tokens & pre_tokens) / len(next_tokens))

    # Nothing to write for a company without any consecutive-year pair.
    if publish_dates:
        lv2_df.loc[publish_dates, corp_name] = similarities

Wall time: 10.1 sotal : 113


In [11]:
# Keep only the dates on which 삼양홀딩스 has a similarity value.
lv2_df[lv2_df['삼양홀딩스'].notnull()]

Unnamed: 0,삼양홀딩스,유한양행,CJ대한통운,두산,대림산업,한국테크놀로지그룹,기아차,동아쏘시오홀딩스,SK하이닉스,영풍,...,현대백화점,한국금융지주,GS,하나금융지주,아모레퍼시픽,LF,후성,SK이노베이션,CJ제일제당,KB금융
20110331,0.608059,,0.783217,0.754959,0.856838,0.887755,0.824561,,0.931034,0.854489,...,0.954654,,,0.527753,0.627451,,0.770563,0.734513,0.922101,0.804668
20120330,0.391386,0.771875,0.600746,0.625,0.667216,0.778146,0.706633,0.718821,0.730769,0.444664,...,,,0.972376,0.806482,0.7506,0.946108,0.673684,0.50536,0.812775,0.803674
20130401,0.82,,,,0.81876,0.769231,0.685504,0.75,,0.743707,...,0.952247,,,0.843784,0.69161,,0.748988,0.823643,0.892157,
20140331,0.754217,0.668435,0.823045,0.539075,0.763994,0.960432,0.804706,0.895408,0.981481,0.686321,...,0.871795,0.90161,0.886139,0.873626,0.706349,0.691275,0.642857,0.572993,0.86783,0.838955
20150331,0.811456,0.90678,0.701209,0.773356,0.779856,1.0,0.820388,0.603416,0.227273,0.894366,...,0.709302,0.855422,0.130859,0.792812,0.72809,0.97032,0.955128,0.770346,0.768539,0.830851
20160330,0.791045,0.690176,0.711507,0.715428,0.728716,1.0,0.903553,0.734375,0.342466,0.861991,...,0.890909,0.825607,0.790591,0.848661,0.601124,0.882759,0.903333,0.787375,0.785714,0.864669
20170331,0.789873,0.911917,0.755597,0.724009,0.807874,0.619303,0.79759,0.949429,0.805825,0.763527,...,0.859838,0.81068,0.809696,0.833926,0.720532,,0.9125,0.929844,0.71875,0.835631
20180402,0.62279,0.792541,0.654596,,,0.768398,0.766055,0.887122,0.37138,0.915323,...,0.7,0.534188,0.844549,0.886061,0.794466,,0.864023,0.819215,,0.853556
20190401,0.822709,0.614695,0.82973,,0.759036,0.708561,0.761134,0.788043,,0.890438,...,0.875536,0.636561,0.827389,0.859485,0.654237,0.929515,0.941176,0.873281,0.727768,0.906977


In [98]:
def jaccard_sim(df, start='20100101', end='20191231'):
    """Year-over-year Jaccard similarity of each company's report vocabulary.

    For every company in ``df.crp_nm`` and every pair of consecutive years
    (yr, yr+1) that both have a row, the score

        len(tokens(yr) & tokens(yr+1)) / len(tokens(yr) | tokens(yr+1))

    is stored on the row labelled with the ``rcp_dt`` of the (yr+1) report.
    When a year has several rows, only the first one is compared.

    The result is a newly created frame on every call, so no table outside
    this function is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``crp_nm`` (company name), ``연도`` (integer
        year), ``rcp_dt`` (a label present in the result index, i.e. a
        yyyymmdd integer between ``start`` and ``end``) and ``str_tokens``
        (tokens separated by single spaces).
    start, end : str, optional
        First and last calendar day of the result index, in any form accepted
        by ``pandas.date_range``. The defaults cover 2010-01-01 to 2019-12-31.

    Returns
    -------
    pandas.DataFrame
        Float frame indexed by every day in [start, end] as a yyyymmdd
        integer, with one column per company. Cells without a score are NaN.

    Progress is printed to stdout on a single, carriage-return-refreshed line.
    """
    calendar = pd.date_range(start=start, end=end)
    company_list = df.crp_nm.unique()
    result = pd.DataFrame(
        np.nan,
        index=[int(stamp) for stamp in calendar.strftime('%Y%m%d')],
        columns=company_list,
    )

    for process, corp_name in enumerate(company_list):
        print("Process : {} | Total : {}".format(process+1,len(company_list)),end='\r')
        # All rows belonging to this company.
        corp_df = df[df.crp_nm == corp_name]

        # Per-company accumulators; they must start empty for every company.
        publish_dates = []
        similarities = []

        for yr in range(corp_df['연도'].min(), corp_df['연도'].max()):
            pre_df = corp_df[corp_df['연도'] == yr]
            next_df = corp_df[corp_df['연도'] == yr + 1]
            if pre_df.empty or next_df.empty:
                # One side of the pair is missing, so there is nothing to
                # compare and the cell stays NaN.
                continue

            pre_tokens = set(pre_df.str_tokens.iloc[0].split(" "))
            next_tokens = set(next_df.str_tokens.iloc[0].split(" "))

            publish_dates.append(next_df['rcp_dt'].values[0])
            similarities.append(
                len(next_tokens & pre_tokens) / len(next_tokens | pre_tokens)
            )

        # Nothing to write for a company without any consecutive-year pair.
        if publish_dates:
            result.loc[publish_dates, corp_name] = similarities

    return result

In [99]:
# Year-over-year Jaccard similarity table for every company in df.
jaccard = jaccard_sim(df)

Process : 113 | Total : 113

In [102]:
# Mean of the per-company means: the inner mean() averages each column with
# NaN skipped, the outer one averages those column means.
jaccard.mean().mean()

0.6590837093668442

In [41]:
def sim_simple(doc1,doc2):
    """Normalized count of vocabulary edits between two token collections.

    Both inputs are reduced to their sets of distinct items. ``additions`` are
    items only in ``doc2``, ``deletions`` are items only in ``doc1``, and
    ``changes`` is their sum, so the numerator counts every differing item
    twice. The total is divided by the combined number of distinct items.

    The result is 0 for identical vocabularies and 2 for disjoint ones, so a
    larger value means the documents are less alike.

    Raises ZeroDivisionError when both inputs are empty.
    """
    old_vocab, new_vocab = set(doc1), set(doc2)
    differing = len(new_vocab - old_vocab) + len(old_vocab - new_vocab)
    # additions + deletions + changes, where changes == additions + deletions
    edit_total = differing + differing
    return edit_total / (len(old_vocab) + len(new_vocab))

In [42]:
# NOTE(review): `k1` is not defined in any cell visible here; presumably it
# holds two token lists from an earlier session. Confirm before re-running.
sim_simple(k1[0],k1[1])

0.3659742828882295

# Check the lv2
- Taking the mean of each column over time gives one overall year-over-year similarity per company.

In [43]:
# Inspect the whole lv2 table; dates without a stored score are NaN.
lv2_df

Unnamed: 0,삼양홀딩스,유한양행,CJ대한통운,두산,대림산업,한국테크놀로지그룹,기아차,동아쏘시오홀딩스,SK하이닉스,영풍,...,현대백화점,한국금융지주,GS,하나금융지주,아모레퍼시픽,LF,후성,SK이노베이션,CJ제일제당,KB금융
20100101,,,,,,,,,,,...,,,,,,,,,,
20100102,,,,,,,,,,,...,,,,,,,,,,
20100103,,,,,,,,,,,...,,,,,,,,,,
20100104,,,,,,,,,,,...,,,,,,,,,,
20100105,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20191227,,,,,,,,,,,...,,,,,,,,,,
20191228,,,,,,,,,,,...,,,,,,,,,,
20191229,,,,,,,,,,,...,,,,,,,,,,
20191230,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# Column-wise mean over all dates (NaN skipped): one average score per company.
lv2_df.mean()

삼양홀딩스      0.712393
유한양행       0.803717
CJ대한통운     0.714659
두산         0.730414
대림산업       0.752889
             ...   
LF         0.896107
후성         0.823584
SK이노베이션    0.757397
CJ제일제당     0.820028
KB금융       0.843536
Length: 113, dtype: float64