In [848]:
from pykrx import stock
import pandas as pd
import numpy as np
import tqdm
import json

In [111]:
def filtering_company(company):
    if pd.isna(company):
        return np.nan
    
    original = company
    
    # 긴 것 -> 작은 것(ex> 주식회사 -> 주)
    company = company.replace('(농업회사법인유한회사)', '')
    company = company.replace('의료법인)', '')
    company = company.replace('농업회사법인', '')
    company = company.replace('가부시키가이샤', '')
    company = company.replace('카부시키카이샤', '')
    company = company.replace('(유한회사)', '')
    company = company.replace('주식회사', '')
    company = company.replace('㈜', '')
    company = company.replace('(주)', '')
    company = company.replace('（주）', '')
    company = company.replace('( 주 )', '')
    company = company.replace('( 주)', '')
    company = company.replace('(주 )', '')
    company = company.replace('주)', '')
    company = company.replace('주 )', '')
    company = company.replace('(유)', '')
    company = company.replace('유)', '')
    company = company.replace('우)', '')
    company = company.replace('(사)', '')
    company = company.replace('사)', '')
    company = company.replace('  ', ' ') # 신정보개발주식회사 ~~
    company = company.split('|')[0] # 상위 -> 하위

    try:
        
        length = len(company)
        index = set(range(length))

        pairs1 = list()
        stack = list()
        for i in range(length):
            if company[i] == '(':
                stack.append(i)
            elif company[i] == ')':
                if stack:
                    pairs1.append((stack.pop(), i))
                else:
                    if pairs1:
                        pairs1.append((pairs1.pop()[0], i))
                    else:
                        pairs1.append((i, i))
        
        if stack:
            pairs1.append((stack[0], length-1))
        
        pairs2 = list()
        stack = list()
        for i in range(length):
            if company[i] == '[':
                stack.append(i)
            elif company[i] == ']':
                if stack:
                    pairs2.append((stack.pop(), i))
                else:
                    if pairs2:
                        pairs2.append((pairs2.pop()[0], i))
                    else:
                        pairs2.append((i, i))
                        
        if stack:
            pairs2.append((stack[0], length-1))
            
        for start, end in pairs1 + pairs2:
            index -= set(range(start, end+1))

        index = list(index)
        index.sort()
        company = ''.join([company[idx] for idx in index])
    
        company = company.strip()
    
    except:
        print(original, '--->', company)
        
    return company

# data collecting

### 데이터 수집

- KOSPI / KOSDAQ 연 종가 수익률 

In [18]:
kospi_y = stock.get_index_ohlcv("20020101", "20221231", "1001", 'y')
kosdaq_y = stock.get_index_ohlcv("20020101", "20221231", "2001", 'y')

kospi_y['연 수익률'] = (kospi_y['종가'] - kospi_y['종가'].shift(1)) / kospi_y['종가'].shift(1)
kospi_er = np.mean(kospi_y['연 수익률']) * 100
kosdaq_y['연 수익률'] = (kosdaq_y['종가'] - kosdaq_y['종가'].shift(1)) / kosdaq_y['종가'].shift(1)
kosdaq_er = np.mean(kosdaq_y['연 수익률']) * 100

- KOSPI / KOSDAQ 월 종가 수익률 (20개년)

In [19]:
kospi_m = stock.get_index_ohlcv("20120101", "20221231", "1001", 'm')
kosdaq_m = stock.get_index_ohlcv("20120101", "20221231", "2001", 'm')

kospi_m = kospi_m.reset_index(drop=False)[['날짜', '종가']]
kosdaq_m = kosdaq_m.reset_index(drop=False)[['날짜', '종가']]

In [None]:
pi_tickers = stock.get_market_ticker_list("20221231", market="KOSPI")
daq_tickers = stock.get_market_ticker_list("20221231", market="KOSDAQ")

- 상장기업 월 종가 수익률 (20개년)

In [21]:
all_df = pd.read_csv('./data/all_data_2022.csv')
dart_corp = all_df[all_df['상장분류']==True]
dart_corp_m = dart_corp[['회사명', '종목코드']]

In [None]:
import pickle
error_idx = []
df_list = []

for corp_code in tqdm.tqdm(dart_corp['종목코드']):
    try:
        corp_code_s = str(int(corp_code)).zfill(6)
        result = stock.get_market_ohlcv("20120101", "20221231", corp_code_s, 'm')['종가'].to_frame().reset_index()
        result['종목코드'] = corp_code
        # pd.merge(dart_corp_m, result, on='종목코드')
        df_list.append(result)
    except (KeyError, IndexError):
        print('error', corp_code)
        error_idx.append(corp_code)
with open("./data/list_result.pkl","wb") as f:
    pickle.dump(df_list, f)        
# 로컬에서 돌림

with open("./data/list_result.pkl","rb") as f:
    list_ex_load = pickle.load(f)
merged_df = pd.concat(list_ex_load, ignore_index=True)
merged_df = merged_df.drop_duplicates()
dart_m = merged_df

## Regression model

### Regression
- weight 값을 상장기업 levered beta 로 산정

In [None]:
dart_m['시장'] = ''
dart_m.loc[dart_m['종목코드'].apply(lambda x: str(int(x)).zfill(6) in pi_tickers) == True, '시장'] = 'KOSPI'
dart_m.loc[dart_m['종목코드'].apply(lambda x: str(int(x)).zfill(6) in daq_tickers) == True, '시장'] = 'KOSDAQ'
dart_m.loc[dart_m['종목코드'] == 344860, '시장'] = 'KOSDAQ'

kosdaq_m['시장'] = 'KOSDAQ'
kospi_m['시장'] = 'KOSPI'
kosdaq_m['er'] = kosdaq_er
kospi_m['er'] = kospi_er

dart_m = pd.merge(dart_m, kosdaq_m, on=['날짜', '시장'], how='left')
dart_m = pd.merge(dart_m, kospi_m, on=['날짜', '시장'], how='left')
dart_m['시장종가'] = dart_m['종가_y'].fillna(dart_m['종가'])
dart_m['시장ER'] = dart_m['er_y'].fillna(dart_m['er_x'])
dart_m = dart_m.drop(columns=['종가_y', '종가'], axis=1)
dart_m = dart_m.drop(columns=['er_y', 'er_x'], axis=1)
dart_m = dart_m.rename(columns={'종가_x':'종목종가'})

In [None]:
from sklearn.linear_model import LinearRegression
def get_weight(group):
    x = group['시장종가'].values.reshape(-1, 1)
    y = group['종목종가'].values
    model = LinearRegression()
    model.fit(x, y)
    return model.coef_[0]
temp = dart_m.groupby(['종목코드']).apply(get_weight).reset_index()
temp.columns = ['종목코드', 'levered_b']

temp_filtered = temp[(temp['levered_b'] < 2) & (temp['levered_b'] > -2)]
temp_filtered = temp_filtered[temp_filtered['levered_b'] != 0]

dart_m_er = dart_m[['종목코드', '시장ER']].drop_duplicates()
temp_filtered = pd.merge(temp_filtered, dart_m_er, on='종목코드')

## Rule base

### 상장기업 unlevered beta
- 상장기업 unlevered beta 구하기 (rule base)

In [None]:
all_df = pd.read_csv('./all_data_2022.csv')
dart_corp = all_df[all_df['상장분류']==True]
dart_corp = dart_corp[['회사명', '부채총액', '자본총계', '당기순이익', '종목코드']]
dart_corp = pd.merge(dart_corp, temp_filtered, on='종목코드')

conditions = [
    (dart_corp['당기순이익'] <= 2e+8),
    (dart_corp['당기순이익'] > 2e+8) & (dart_corp['당기순이익'] <= 2e+10),
    (dart_corp['당기순이익'] > 2e+10) & (dart_corp['당기순이익'] <= 3e+11),
    (dart_corp['당기순이익'] > 3e+11)
]
values = [0.09, 0.19, 0.21, 0.24]

dart_corp['법인세율'] = np.select(conditions, values, default='Other')
dart_corp = dart_corp.drop_duplicates('회사명').reset_index(drop=True)
dart_corp['unlevered_b'] = dart_corp['levered_b'] * (1 + (1-dart_corp['법인세율'].apply(float)) * dart_corp['부채총액'] / dart_corp['자본총계'])

temp_filtered = dart_corp[(dart_corp['levered_b'] < 2) & (dart_corp['levered_b'] > -2)]
temp_filtered = temp_filtered[temp_filtered['levered_b'] != 0]
temp_filtered[['회사명', '종목코드', 'unlevered_b', 'levered_b', '시장ER']].to_csv('./beta_상.csv', index=False)

### 비상장기업 levered beta 
- ordered corp 기반으로 unlevered beta 평균 구한 뒤 비상장기업 levered beta 구하기 (rule base)

In [None]:
mapped_df = pd.read_table('./company_mapping (1).tsv')
mapped_df = mapped_df.drop('Unnamed: 0', axis=1)

df = mapped_df[['회사명_비상장', '회사명_상장']]
df['회사명_상장'] = df['회사명_상장'].apply(lambda x : [data.strip()[1:-1] for data in x[1:-1].split(',')])
df = df.explode('회사명_상장').sort_values('회사명_비상장')

df['회사명_상장'] = df['회사명_상장'].apply(filtering_company)
df['회사명_비상장'] = df['회사명_비상장'].apply(filtering_company)

smes_corp = all_df[all_df['상장분류']==False]
smes_corp = smes_corp[['회사명', '부채총액', '자본총계', '당기순이익']]

conditions = [
    (smes_corp['당기순이익'] <= 2e+8),
    (smes_corp['당기순이익'] > 2e+8) & (smes_corp['당기순이익'] <= 2e+10),
    (smes_corp['당기순이익'] > 2e+10) & (smes_corp['당기순이익'] <= 3e+11),
    (smes_corp['당기순이익'] > 3e+11)
]
values = [0.09, 0.19, 0.21, 0.24]

smes_corp['법인세율'] = np.select(conditions, values, default='Other')
t = pd.read_csv('./beta_상.csv')[['회사명', 'unlevered_b', 'levered_b', '시장ER']]
t = t.rename(columns={'회사명':'회사명_상장', 'unlevered_b':'상장_unlevered_b', 'levered_b':'상장_levered_b', '시장ER':'상장_시장ER'})

df = pd.merge(df, t, on='회사명_상장')
df['m_상장_unlevered_b'] = df.groupby('회사명_비상장')['상장_unlevered_b'].transform('mean')
df = df.drop_duplicates(['회사명_비상장'])
df = df.reset_index(drop=True)

In [None]:
smes_corp = pd.merge(smes_corp, df, left_on = '회사명', right_on = '회사명_비상장')
smes_corp = smes_corp.drop_duplicates('회사명')
smes_corp = smes_corp.drop(['회사명_비상장', '회사명_상장'], axis=1)
smes_corp = smes_corp.reset_index(drop=True)
smes_corp['비상장_levered_b'] = smes_corp['m_상장_unlevered_b'] * (1+(1-smes_corp['법인세율'].apply(float)) * smes_corp['부채총액'] / smes_corp['자본총계'])
smes_corp[['회사명', '상장_levered_b', '비상장_levered_b']].to_csv('./beta_비.csv', index=False)
smes_corp['CAPM'] = 3.5 + smes_corp['비상장_levered_b'] * (smes_corp['상장_시장ER'] - 3.5)

def remove_outliers_iqr(df, column, k=1.5):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - k * IQR
    upper_bound = Q3 + k * IQR
    return df[((df[column] >= lower_bound) & (df[column] <= upper_bound))]

# IQR 이상치 제거 적용
smes_corp_f = remove_outliers_iqr(smes_corp, 'CAPM')
smes_corp_f.to_csv('./data/smes_capm_filtered_iqr.csv', index=False)

# data preprocessing
- latest year = 2022 & 3개년치 값을 모두 가지고 있는 비상장 기업의 데이터 이용하여, 추후 3개년 inference

<p>상장</p>

- 상장기업 4개년치에서 (3개년 : x, 1개년 : y) 로 잡은 뒤 학습
- model train input : 3개년 feature 8개 / output : 1개년 feature 8개

<p>비상장</p>

- model test input : 3개년 feature 8개 / output : 1개년 feature 8개
- 이를 3개 반복학습하여 3개년 feature 8개 추가 예측

### smes

In [1550]:
with open('./data/companies_info.json') as f:
    js = json.loads(f.read())
smes_company = pd.DataFrame(js).transpose()
smes_company = smes_company.reset_index().rename(columns={'index':'회사명'})
smes_company = smes_company.dropna()

In [1551]:
fixed_columns = ['회사명', 'year', '부채총계', '자본총계', '매출액', '당기순이익', '영업이익', '현금및현금성자산', '감가상각비']

In [1552]:
smes_company = smes_company.rename(columns={'liab':'부채총계', 'cash':'현금및현금성자산', 'equi':'자본총계', 'reve':'매출액', 'oper':'영업이익', 'depr':'감가상각비', 'neti':'당기순이익'})

fs = list(smes_company.columns)
fs = fs[4:]
smes_company = smes_company.explode(fs)

smes_company['year'] = smes_company['latest_year'].apply(int) - smes_company.groupby('회사명').cumcount()
smes_company = smes_company.drop(['sector', 'product'], axis=1).reset_index(drop=True)
smes_company['회사명'] = smes_company['회사명'].apply(filtering_company)
smes_company = smes_company.reindex(columns=fixed_columns)

filtering = ['부채총계', '현금및현금성자산', '자본총계', '매출액', '영업이익', '당기순이익']
smes_company = smes_company[(smes_company[filtering] != '0').all(axis=1)]
smes_company = smes_company[(smes_company[filtering] != '').all(axis=1)]

idx = list(smes_company.columns)
idx = idx[2:]
for i in idx:
    smes_company[i] = smes_company[i].str.replace(',','').astype(int)

column 추가 : 순부채, EBITDA, 기업총가치, EBITDA배율

In [1553]:
smes_company[f'순부채'] = smes_company[f'부채총계'] - smes_company[f'현금및현금성자산']
smes_company[f'EBITDA'] = smes_company[f'영업이익'] + smes_company[f'감가상각비']
smes_company[f'기업 가치'] = smes_company[f'자본총계'] + smes_company[f'순부채']
smes_company[f'EBITDA배율'] = smes_company[f'기업 가치'] / smes_company[f'EBITDA']

In [1554]:
columns = ['부채총계', '자본총계', '매출액', '당기순이익', '영업이익', '현금및현금성자산', '감가상각비']
def calculate_growth_rate(group):
    for c in columns:
        group[f'r_{c}'] = (group[c] - group[c].shift(1)) / group[c].shift(1)
    return group
smes_company_r = smes_company.groupby('회사명').apply(calculate_growth_rate)

In [1555]:
smes_company_r = smes_company_r.reset_index(drop=True)
smes_company_r['r_감가상각비'] = smes_company_r['r_감가상각비'].fillna(0)
smes_company_r = smes_company_r.dropna()
smes_company_r = smes_company_r.drop(columns=columns, axis=1)

In [1559]:
smes_company_r = smes_company_r.drop(['순부채', 'EBITDA', '기업 가치'], axis=1)
smes_company_r = smes_company_r.sort_values(['회사명', 'year'])

### dart

In [1566]:
dart = pd.read_csv('../data/final_df.csv')
dart = dart[['회사명', '부채총계', '자본총계', '매출액', '당기순이익', '영업이익', '현금및현금성자산', '감가상각비', 'year', 'report_code', '법인유형']]
dart['감가상각비'].fillna(0, inplace=True)
dart = dart.dropna()
dart[f'순부채'] = dart[f'부채총계'] - dart[f'현금및현금성자산']
dart[f'EBITDA'] = dart[f'영업이익'] + dart[f'감가상각비']

def check_years(group):
    return len(group['year'].unique()) >=4 and len(group['report_code']) >= 4
f_dart = dart.groupby('회사명').filter(check_years).reset_index(drop=True)

f_dart = f_dart.drop_duplicates(subset=['회사명', 'year', 'report_code', '법인유형'], keep='last')
f_dart_s = f_dart.sort_values(by=['회사명','year'])

  dart = pd.read_csv('../data/final_df.csv')


In [1711]:
f_dart_s4 = f_dart_s[f_dart_s['report_code'] == 11011].groupby('회사명').filter(lambda x:len(x)==4)

In [1568]:
f_dart_s5 = f_dart_s[f_dart_s['report_code'] == 11013].groupby('회사명').filter(lambda x:len(x)==5)

In [1712]:
fixed_columns = ['부채총계', '자본총계', '매출액', '당기순이익', '영업이익', '현금및현금성자산','감가상각비', '순부채', 'EBITDA']
fixed_columns_r = ['회사명', 'year', 'r_부채총계', 'r_자본총계', 'r_매출액', 'r_당기순이익', 'r_영업이익', 'r_현금및현금성자산', 'r_감가상각비']

In [1713]:
def calculate_growth_rate(group):
    for c in fixed_columns:
        group[f'r_{c}'] = (group[c] - group[c].shift(1)) / group[c].shift(1)
    return group
temp = f_dart_s4.groupby('회사명').apply(calculate_growth_rate)
# temp = f_dart_s5.groupby('회사명').apply(calculate_growth_rate)
temp = temp.reset_index(drop=True)
temp['r_감가상각비'] = temp['r_감가상각비'].fillna(0)
temp = temp.dropna()

In [1714]:
temp = temp.drop(columns=fixed_columns)
temp = temp.drop(columns=['r_순부채', 'r_EBITDA', 'report_code', '법인유형'], axis=1)
temp = temp.reindex(columns=fixed_columns_r)

In [1715]:
threshold = 10
# threshold = 5
temp = temp.drop(index=temp[temp.select_dtypes(include='float64').gt(threshold).any(axis=1)].index)

In [1233]:
d_columns = ['회사명', 'year', 'report_code', '법인유형']

# RNN

In [None]:
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

In [None]:
# 시퀀스 길이 (3개년치 데이터) -> (2개년치 성장치)
seq_length = 2

# RNN 입력 데이터로 변환하는 함수
def create_sequences(data, seq_length):
    sequences = []
    targets = []
    for name, group in data.groupby('회사명'):
        for i in range(len(group) - seq_length):
            sequence = group.iloc[i:i + seq_length].drop(['회사명', 'year'], axis=1).values
            target = group.iloc[i + seq_length].drop(['회사명', 'year']).values
            sequence = sequence.astype(float)
            target = target.astype(float)
            sequences.append(sequence)
            targets.append(target)
    return sequences, targets

# RNN 입력 데이터 생성
# data, target = create_sequences(f_dart_s4, seq_length)
data, target = create_sequences(temp, seq_length)

data = np.array(data)
target = np.array(target)

num_companies, num_years, num_features = data.shape

In [None]:
class CustomDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        x = torch.tensor(self.data[index], dtype=torch.float32)
        y = torch.tensor(self.targets[index], dtype=torch.float32)
        return x, y

dataset = CustomDataset(data, target)

train_size = int(0.9 * len(dataset))
valid_size = len(dataset) - train_size
train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])

# 데이터 로더를 생성합니다.
batch_size = 1
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

In [None]:
# 모델 초기화
input_size = num_features
hidden_size = 16
output_size = num_features

model = RNNModel(input_size, hidden_size, output_size)

# 손실 함수 및 최적화 함수
criterion = nn.MSELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# 학습
num_epochs = 10000
best_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    
    model.eval()
    with torch.no_grad():
        for inputs, targets in valid_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
    avg_v_loss = total_loss / len(valid_loader)
    
    if epoch % 100 == 0: 
        print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}")
        print(f"Epoch [{epoch+1}/{num_epochs}], Average Valid Loss: {avg_v_loss:.4f}")
    
    if avg_v_loss < best_loss:
        best_loss = avg_v_loss

        # 모델과 optimizer의 상태를 저장합니다.
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_loss': best_loss
        }
        torch.save(checkpoint, './checkpoint_rnn_2.pth')

print("Training finished!")

Epoch [1/10000], Average Loss: 1.0074
Epoch [1/10000], Average Valid Loss: 9.7663
Epoch [101/10000], Average Loss: 0.7944
Epoch [101/10000], Average Valid Loss: 7.8971
Epoch [201/10000], Average Loss: 0.6294
Epoch [201/10000], Average Valid Loss: 6.5026
Epoch [301/10000], Average Loss: 0.4556
Epoch [301/10000], Average Valid Loss: 5.0407
Epoch [401/10000], Average Loss: 0.3090
Epoch [401/10000], Average Valid Loss: 3.8222
Epoch [501/10000], Average Loss: 0.2124
Epoch [501/10000], Average Valid Loss: 3.0169
Epoch [601/10000], Average Loss: 0.1472
Epoch [601/10000], Average Valid Loss: 2.5101
Epoch [701/10000], Average Loss: 0.1045
Epoch [701/10000], Average Valid Loss: 2.2128
Epoch [801/10000], Average Loss: 0.0758
Epoch [801/10000], Average Valid Loss: 2.0618
Epoch [901/10000], Average Loss: 0.0552
Epoch [901/10000], Average Valid Loss: 2.0011
Epoch [1001/10000], Average Loss: 0.0407
Epoch [1001/10000], Average Valid Loss: 1.9454
Epoch [1101/10000], Average Loss: 0.0314
Epoch [1101/100

In [None]:
gs = smes_company_r.groupby(['회사명']).size()
smes_company_r_filter = smes_company_r[smes_company_r['회사명'].isin(gs[gs==2].index)]
smes_company_r_filter = smes_company_r_filter.sort_values(['회사명', 'year'])
smes_company_r_filter = smes_company_r_filter.drop(['EBITDA배율'], axis=1)

In [None]:
# TEST
seq_length = 2
output_list = list()

model.eval()
for name, group in smes_company_r_filter.groupby('회사명'):
    # for i in range(len(group) - seq_length):
    with torch.no_grad():
        corp_name = group.iloc[0]['회사명']
        corp_year = group.iloc[0]['year'] + 2
        inputs = torch.tensor(group.iloc[0:0+seq_length].drop(['회사명', 'year'], axis=1).values, dtype=torch.float32)
        inputs = torch.reshape(inputs, (1, inputs.size(0), inputs.size(1)))
        outputs = model(inputs)
        output_list.append([corp_name, corp_year, outputs.tolist()])

In [None]:
test_output = pd.DataFrame(output_list)
final_output = pd.concat([test_output.loc[:,:1], pd.DataFrame([value for value in test_output[2].apply(np.squeeze).values])], axis = 1)
final_output.columns = temp.columns
final_output = pd.merge(final_output, smes_company, on=['회사명', 'year'])

# LSTM

In [1700]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [1701]:
torch.manual_seed(42)
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

In [1717]:
# 시퀀스 길이 (3개년치 데이터)
# seq_length = 3
seq_length = 2

# LSTM 입력 데이터로 변환하는 함수
def create_sequences(data, seq_length):
    sequences = []
    targets = []
    for name, group in data.groupby('회사명'):
        for i in range(len(group) - seq_length):
            sequence = group.iloc[i:i + seq_length].drop(['회사명', 'year'], axis=1).values
            target = group.iloc[i + seq_length].drop(['회사명', 'year']).values
            sequence = sequence.astype(float)
            target = target.astype(float)
            sequences.append(sequence)
            targets.append(target)
    return sequences, targets

# LSTM 입력 데이터 생성
# data, target = create_sequences(f_dart_s4, seq_length)
data, target = create_sequences(temp, seq_length)

In [1718]:
data = np.array(data)
target = np.array(target)

num_companies, num_years, num_features = data.shape

In [1239]:
# 데이터 스케일링
from sklearn.preprocessing import RobustScaler

data_reshaped = data.reshape((num_companies * num_years, num_features))

scalers = {}
for i in range(num_features):
    # feature_scaler = MinMaxScaler()
    feature_scaler = RobustScaler()
    data_reshaped[:, i] = feature_scaler.fit_transform(data_reshaped[:, i].reshape(-1, 1)).flatten()
    scalers[i] = feature_scaler

# 데이터의 차원을 다시 (3136, 3, 10)으로 복원합니다.
data_scaled = data_reshaped.reshape((num_companies, num_years, num_features))

# 타겟 데이터를 스케일링합니다.
# target_scaler = MinMaxScaler()
target_scaler = RobustScaler()
target_scaled = target_scaler.fit_transform(target)

In [1720]:
class CustomDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        x = torch.tensor(self.data[index], dtype=torch.float32)
        y = torch.tensor(self.targets[index], dtype=torch.float32)
        return x, y

dataset = CustomDataset(data, target)

train_size = int(0.9 * len(dataset))
valid_size = len(dataset) - train_size
train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

In [1721]:
# 모델 초기화
input_size = num_features
hidden_size = 16
num_layers = 1
output_size = num_features

model = LSTMModel(input_size, hidden_size, num_layers, output_size)

# 손실 함수 및 최적화 함수
criterion = nn.MSELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# 학습
num_epochs = 10000
best_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    
    model.eval()
    with torch.no_grad():
        for inputs, targets in valid_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
    avg_v_loss = total_loss / len(valid_loader)
    
    if epoch % 100 == 0: 
        print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}")
        print(f"Epoch [{epoch+1}/{num_epochs}], Average Valid Loss: {avg_v_loss:.4f}")
    
    if avg_v_loss < best_loss:
        best_loss = avg_v_loss

        # 모델과 optimizer의 상태를 저장합니다.
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_loss': best_loss
        }
        torch.save(checkpoint, './checkpoint_4.pth')

print("Training finished!")

Epoch [1/10000], Average Loss: 1.0786
Epoch [1/10000], Average Valid Loss: 9.9747
Epoch [101/10000], Average Loss: 0.9038
Epoch [101/10000], Average Valid Loss: 8.4113
Epoch [201/10000], Average Loss: 0.8562
Epoch [201/10000], Average Valid Loss: 8.0034
Epoch [301/10000], Average Loss: 0.8194
Epoch [301/10000], Average Valid Loss: 7.7002
Epoch [401/10000], Average Loss: 0.7970
Epoch [401/10000], Average Valid Loss: 7.5212
Epoch [501/10000], Average Loss: 0.7752
Epoch [501/10000], Average Valid Loss: 7.3477
Epoch [601/10000], Average Loss: 0.7533
Epoch [601/10000], Average Valid Loss: 7.1729
Epoch [701/10000], Average Loss: 0.7450
Epoch [701/10000], Average Valid Loss: 7.1133
Epoch [801/10000], Average Loss: 0.7221
Epoch [801/10000], Average Valid Loss: 6.9309
Epoch [901/10000], Average Loss: 0.7076
Epoch [901/10000], Average Valid Loss: 6.8195
Epoch [1001/10000], Average Loss: 0.6914
Epoch [1001/10000], Average Valid Loss: 6.6947
Epoch [1101/10000], Average Loss: 0.6875
Epoch [1101/100

In [1722]:
gs = smes_company_r.groupby(['회사명']).size()
smes_company_r_filter = smes_company_r[smes_company_r['회사명'].isin(gs[gs==2].index)]
smes_company_r_filter = smes_company_r_filter.sort_values(['회사명', 'year'])
smes_company_r_filter = smes_company_r_filter.drop(['EBITDA배율'], axis=1)

In [1726]:
# TEST
seq_length = 2
output_list = list()

model.eval()
for name, group in smes_company_r_filter.groupby('회사명'):
    # for i in range(len(group) - seq_length):
    with torch.no_grad():
        corp_name = group.iloc[0]['회사명']
        corp_year = group.iloc[0]['year'] + 2
        inputs = torch.tensor(group.iloc[0:0+seq_length].drop(['회사명', 'year'], axis=1).values, dtype=torch.float32)
        inputs = torch.reshape(inputs, (1, inputs.size(0), inputs.size(1)))
        outputs = model(inputs)
        output_list.append([corp_name, corp_year, outputs.tolist()])

In [1728]:
test_output = pd.DataFrame(output_list)
final_output = pd.concat([test_output.loc[:,:1], pd.DataFrame([value for value in test_output[2].apply(np.squeeze).values])], axis = 1)
final_output.columns = temp.columns
final_output = pd.merge(final_output, smes_company, on=['회사명', 'year'])

# Rule base
- 최종 DCF 모델 적용

In [1733]:
fcolumns = ['부채총계', '자본총계', '매출액', '당기순이익', '영업이익', '현금및현금성자산', '감가상각비']
for t in fcolumns:
    final_output[f'p_{t}'] = final_output[f'r_{t}'] * final_output[t]
final_output['p_EBITDA'] = final_output['p_영업이익'] + final_output['p_감가상각비']

conditions = [
    (final_output['p_매출액'] <= 2e+8),
    (final_output['p_매출액'] > 2e+8) & (final_output['p_매출액'] <= 2e+10),
    (final_output['p_매출액'] > 2e+10) & (final_output['p_매출액'] <= 3e+11),
    (final_output['p_매출액'] > 3e+11)
]
values = [0.09, 0.19, 0.21, 0.24]

final_output['p_법인세율'] = np.select(conditions, values, default='Other')
final_output['p_영업이익_t'] = final_output['p_영업이익'] * (1 - final_output['p_법인세율'].apply(float))
final_output['p_현금흐름'] = (final_output['p_영업이익_t'] + final_output['p_감가상각비']) * final_output['EBITDA배율']
p_output = final_output[['회사명', 'p_현금흐름']]

In [1740]:
smes_capm = pd.read_csv('./smes_capm_filtered_iqr.csv')
smes_capm = smes_capm[['회사명', 'CAPM']]
smes_capm = smes_capm.drop_duplicates('회사명')
smes_capm = pd.merge(smes_capm, p_output, on='회사명')
smes_capm['기업 가치'] = smes_capm['p_현금흐름'] / smes_capm['CAPM']
smes_capm[['회사명', '기업 가치']].to_csv('./8_pred.csv', index=False)