In [24]:
import pandas as pd
import numpy as np
from scipy import stats

import os

## Variable Construction

(1) $acc = [(\Delta CA - \Delta CASH) - (\Delta CL - \Delta STD - \Delta TP) - Dep] / \text{Total Assets}$

CA, CASH, CL, STD, TP, and Dep denote current assets, cash and cash equivalents, current liabilities, debt included in current liabilities, income tax payable, and depreciation and amortization expense, respectively. $\Delta$ denotes the change from the previous period.

(2) $absacc$: Absolute value of $acc$

(3) $agr$: Annual percentage change in total assets.

In [2]:
# Root folder of the raw data extracts.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs on a machine where this directory exists.
data_path = '/Users/ryan/Documents/GitHub/Finance-Empirical-Studies/Machine Learning in the Chinese Stock Market/Data'

# Load the two raw tables used for the accrual-based characteristics.
file_path = 'FI_T6/FI_T6.csv'
FI_t6 = pd.read_csv(os.path.join(data_path, file_path))

file_path = 'FS_Combas/FS_Combas.csv'
FS_combas = pd.read_csv(os.path.join(data_path, file_path))

# Keep only the rows whose Typrep equals 'A' and replace the raw item codes
# with the short names used in the formulas below.
FI_t6 = FI_t6.loc[FI_t6['Typrep'] == 'A'].rename(columns={'F061201B': 'Dep'})

FS_combas = FS_combas.loc[FS_combas['Typrep'] == 'A'].rename(
    columns={
        'A0b1103000': 'CASH',
        'A001100000': 'CA',
        'A001000000': 'TA',
        'A001124000': 'STD',
        'A002113000': 'TP',
        'A002100000': 'CL',
    }
)

# Calculate characteristics
# Left-join the balance-sheet items onto the depreciation table by firm and period.
data = pd.merge(FI_t6[['Stkcd', 'Accper', 'Dep']],
                FS_combas[['Stkcd', 'Accper', 'CASH', 'STD', 'CA', 'TA', 'TP', 'CL']],
                on=['Stkcd', 'Accper'],
                how='left')

# TODO: confirm that replacing every missing item with 0 is appropriate.
# A zero-filled balance-sheet level creates an artificial jump in the
# period-over-period changes computed below.
data = data.fillna(0)

# Changes are taken within each firm. A plain .shift(1) on the whole frame
# would difference the first row of a firm against the last row of whichever
# firm happens to precede it.
# NOTE(review): assumes the rows of a given Stkcd are already in chronological
# order and one reporting period apart - confirm against the raw files.
balance_items = ['CA', 'CASH', 'CL', 'STD', 'TP', 'TA']
delta = data.groupby('Stkcd')[balance_items].diff()

# Numerator of acc: (dCA - dCASH) - (dCL - dSTD - dTP) - Dep
accruals = ((delta['CA'] - delta['CASH'])
            - (delta['CL'] - delta['STD'] - delta['TP'])
            - data['Dep'])

# A zero denominator (possible after the zero-fill above) would yield +/-inf;
# map it to NaN so the characteristic is simply missing.
data['acc'] = accruals / data['TA'].replace(0, np.nan)

data['absacc'] = data['acc'].abs()

# agr: change in total assets relative to the firm's previous total assets.
lagged_ta = data.groupby('Stkcd')['TA'].shift(1)
data['agr'] = delta['TA'] / lagged_ta.replace(0, np.nan)

(4) $beta$: We estimate stock-level beta by regressing weekly stock returns on value-weighted market returns over the three years ending in month $t - 1$, requiring at least 52 weeks of returns.

In [21]:
# read files
file_path = 'TRD_Weekm/TRD_Weekm.csv'
TRD_m = pd.read_csv(os.path.join(data_path, file_path))

# One-off recipe (kept for reference) that built TRD_Week.ftr from the raw CSV parts:
# # Combine all data files
# file_list = ['TRD_Week/TRD_Week.csv', 'TRD_Week/TRD_Week1.csv', 'TRD_Week/TRD_Week2.csv']
# # Then delete the .csv file
# TRD = pd.DataFrame()
# for inputfile in file_list:
#     f = open(os.path.join(data_path, inputfile))
#     df_c = pd.read_csv(f)
#     df_m = pd.DataFrame(df_c)
#     TRD = pd.concat([TRD, df_m])

# TRD.reset_index(inplace=True)
# TRD.to_feather(os.path.join(data_path, 'TRD_Week/TRD_Week.ftr'))

file_path = 'TRD_Week/TRD_Week.ftr'
TRD = pd.read_feather(os.path.join(data_path, file_path))

# calculate stock return
# NOTE(review): this is the open-to-close move within the week. If Cwretmdos is
# a close-to-close market return, the two series are not measured over the same
# interval - confirm this is intended.
TRD['returns'] = TRD['Wclsprc']/TRD['Wopnprc'] - 1

# merge two dataset: attach the Markettype == 53 weekly market return to every stock-week
TRD_m = TRD_m[TRD_m['Markettype'] == 53]
TRD = pd.merge(TRD, TRD_m[['Trdwnt', 'Cwretmdos']], on='Trdwnt', how='left')

# datetime processing
# Drop the character at position 4 of the week label, leaving year digits followed by week digits.
# (The former bare `TRD['Trdwnt'].astype(int)` discarded its result and had no effect, so it is removed.)
TRD['Trdwnt'] = TRD['Trdwnt'].str[:4]+TRD['Trdwnt'].str[5:]
# Appending '6' selects weekday 6 (Saturday) of the %U week, i.e. its last day.
# NOTE(review): assumes the source week numbering follows the %U convention - confirm.
TRD['LastDayWeek'] = pd.to_datetime((TRD['Trdwnt']).astype(str)+'6', format="%Y%U%w")
TRD['year'] = TRD['LastDayWeek'].dt.year
TRD['month'] = TRD['LastDayWeek'].dt.month
TRD['YearMonth'] = TRD['year']*100+TRD['month']

# calculate beta
def cal_beta(stock, market):
    """Return the OLS slope of stock returns regressed on market returns.

    Parameters
    ----------
    stock, market : array-like of float
        Paired return observations of equal length.

    Returns
    -------
    float
        The regression slope (beta). NaN when it cannot be estimated: fewer
        than two usable pairs, or no variation in the market series.
    """
    stock = np.asarray(stock, dtype=float)
    market = np.asarray(market, dtype=float)

    # A single NaN would otherwise turn the whole estimate into NaN.
    usable = np.isfinite(stock) & np.isfinite(market)
    stock, market = stock[usable], market[usable]

    # linregress cannot fit a line through < 2 points or a constant regressor.
    if stock.size < 2 or np.all(market == market[0]):
        return np.nan

    return stats.linregress(market, stock).slope
# Estimation window: the three years (36 calendar months) ending in month
# t - 1, with at least 52 weekly observations. Regressing only on the weeks
# inside month t itself (4-5 points) does not match that definition.
BETA_WINDOW_MONTHS = 36
BETA_MIN_WEEKS = 52


def _rolling_beta(weekly, window_months=BETA_WINDOW_MONTHS, min_weeks=BETA_MIN_WEEKS):
    """Beta of one stock for every month in which it has weekly rows.

    For a target month t the regression uses the weekly observations dated in
    the `window_months` calendar months up to and including month t - 1.
    Months with fewer than `min_weeks` usable observations get NaN.

    Parameters
    ----------
    weekly : pandas.DataFrame
        Rows of a single stock with columns 'year', 'month', 'YearMonth',
        'returns' and 'Cwretmdos'.
    window_months : int
        Length of the look-back window in calendar months.
    min_weeks : int
        Minimum number of usable weekly observations in the window.

    Returns
    -------
    pandas.Series
        Beta indexed by YearMonth, in ascending order.
    """
    # One consecutive integer per calendar month, so windows are plain ranges.
    month_no = (weekly['year'] * 12 + weekly['month']).to_numpy()
    stock = weekly['returns'].to_numpy(dtype=float)
    market = weekly['Cwretmdos'].to_numpy(dtype=float)

    # Count only weeks where both returns are available towards min_weeks.
    usable = np.isfinite(stock) & np.isfinite(market)
    month_no, stock, market = month_no[usable], stock[usable], market[usable]

    # Sort by month so each window is a contiguous slice found by binary search.
    order = np.argsort(month_no, kind='stable')
    month_no, stock, market = month_no[order], stock[order], market[order]

    betas = {}
    for year_month in np.sort(weekly['YearMonth'].dropna().unique()):
        target = (year_month // 100) * 12 + year_month % 100
        start = np.searchsorted(month_no, target - window_months, side='left')
        stop = np.searchsorted(month_no, target, side='left')  # excludes month t itself
        if stop - start >= min_weeks:
            betas[year_month] = cal_beta(stock[start:stop], market[start:stop])
        else:
            betas[year_month] = np.nan
    return pd.Series(betas, dtype=float)


# Series of betas indexed by (Stkcd, YearMonth).
Beta = pd.concat(
    {stkcd: _rolling_beta(weekly) for stkcd, weekly in TRD.groupby('Stkcd')},
    names=['Stkcd', 'YearMonth'],
)

(60) $pctacc$: Same as $acc$, but with net income as the denominator instead of total assets. If $NetInc = 0$, set $NetInc = 0.01$.

In [5]:
# Load the FS_Comins table, the source of the net income item.
file_path = 'FS_Comins/FS_Comins.csv'
FS_comins = pd.read_csv(os.path.join(data_path, file_path))

# Keep only the rows whose Typrep equals 'A' and expose item B002000000 as NetInc.
FS_comins = FS_comins.loc[FS_comins['Typrep'] == 'A'].rename(
    columns={'B002000000': 'NetInc'}
)

# Calculate the characteristic
# Drop any NetInc column left by a previous run of this cell; otherwise the
# merge would produce NetInc_x / NetInc_y and the lookup below would fail.
data = pd.merge(data.drop(columns='NetInc', errors='ignore'),
                FS_comins[['Stkcd', 'Accper', 'NetInc']],
                on = ['Stkcd', 'Accper'],
                how = 'left')

# Same numerator as acc: (dCA - dCASH) - (dCL - dSTD - dTP) - Dep.
# Changes are taken within each firm so that a firm's first row is not
# differenced against the last row of the firm listed before it.
# NOTE(review): assumes the rows of a given Stkcd are already in chronological
# order and one reporting period apart - confirm against the raw files.
delta = data.groupby('Stkcd')[['CA', 'CASH', 'CL', 'STD', 'TP']].diff()
accruals = ((delta['CA'] - delta['CASH'])
            - (delta['CL'] - delta['STD'] - delta['TP'])
            - data['Dep'])

# Per the definition: if NetInc = 0, use 0.01 to avoid dividing by zero.
# Rows with no matching income statement keep NetInc = NaN, so pctacc is NaN there.
net_income = data['NetInc'].replace(0, 0.01)

data['pctacc'] = accruals / net_income