In [1]:
import pandas as pd 

In [2]:
# Generate the monthly index: one row per calendar month from 2017-01 to
# 2022-12, each stamped with the first day of that month.
# freq="MS" (month start) produces those timestamps directly, so there is no
# need to build month-end dates and rebuild them through year/month strings.
df = pd.DataFrame({
    'Month': pd.date_range(start="2017-01-01", end="2022-12-31", freq="MS")
})
df.head(2)

Unnamed: 0,Month
0,2017-01-01
1,2017-02-01


In [3]:
# 1. Load the total number of news articles for each day.
# Columns 0 and 2 of "total.csv" are read and renamed to `Date` and `total`.
total = pd.read_csv("total.csv", usecols=[0, 2])
total.columns = ['Date', 'total']
total['Date'] = pd.to_datetime(total['Date'])
# `Month` is the date truncated to the first day of its month; it is the key
# used later to aggregate the daily values per month.
# Going through a monthly Period keeps missing dates as NaT, whereas building
# "<year>-<month>-01" strings breaks as soon as one date is missing (year and
# month then become floats and the string no longer parses as intended).
total['Month'] = total['Date'].dt.to_period('M').dt.to_timestamp()

In [4]:
# 2. Financial Uncertainty
# 2.1 Load the news whose full text contains the keywords
#   ("bond" or "stock") + ("uncertain").
#   The file name is "stockbonduncertain.csv" (original Chinese file name:
#   "股债不确定.csv", i.e. "stock/bond uncertainty").
#   See the file "rmrb_download.ipynb" for the detailed code.
bs = pd.read_csv("stockbonduncertain.csv")
bs['Date'] = pd.to_datetime(bs['Date'])
# One input row is one matching article, so the daily count `bsu` is simply the
# number of rows per date. Counting explicitly (instead of summing every
# column) keeps the result limited to `Date` and `bsu` and does not depend on
# how groupby-sum treats any non-numeric columns present in the file.
bs = bs.groupby('Date').size().reset_index(name='bsu')
bs.head(2)

Unnamed: 0,Date,bsu
0,2016-01-06,1
1,2016-01-07,1


In [5]:
# 2.2 Merge the daily financial-uncertainty counts into the daily totals and
# convert the count into a share of that day's news.
# Only the `Date` key and the `bsu` count are taken from `bs`, so no other
# column of that frame can leak into `total`.
total = pd.merge(total, bs[['Date', 'bsu']], on='Date', how='left').sort_values(by='Date')
# Days that have no row in `bs` come out of the left merge as NaN and mean
# "zero matching articles". Only `bsu` is filled: a blanket fillna(0) would
# also overwrite missing values in `total` and in the date columns.
total['bsu'] = total['bsu'].fillna(0) / total['total']
total.describe()

Unnamed: 0,total,bsu
count,2650.0,2650.0
mean,86.010189,0.001028
std,33.904273,0.004171
min,18.0,0.0
25%,57.0,0.0
50%,89.0,0.0
75%,111.0,0.0
max,157.0,0.071429


In [6]:
# 3. Economic Uncertainty
# 3.1 Load the news whose full text contains the keywords
#   ("economic") + ("uncertain" or "risk").
#   The file name is "economicuncertainty.csv" (original Chinese file name:
#   "经济风险或不确定.csv", i.e. "economic risk or uncertainty").
#   See the file "rmrb_download.ipynb" for the detailed code.
ec = pd.read_csv("economicuncertainty.csv")
ec['Date'] = pd.to_datetime(ec['Date'])
# One input row is one matching article, so the daily count `eu` is the number
# of rows per date. Counting explicitly (instead of summing every column)
# keeps the result limited to `Date` and `eu` and does not depend on how
# groupby-sum treats any non-numeric columns present in the file.
ec = ec.groupby('Date').size().reset_index(name='eu')
ec.head(2)

Unnamed: 0,Date,eu
0,2016-01-01,5
1,2016-01-02,2


In [7]:
# 3.2 Merge the daily economic-uncertainty counts into the daily totals and
# convert the count into a share of that day's news.
# Only the `Date` key and the `eu` count are taken from `ec`, so no other
# column of that frame can leak into `total`.
total = pd.merge(total, ec[['Date', 'eu']], on='Date', how='left').sort_values(by='Date')
# Days that have no row in `ec` come out of the left merge as NaN and mean
# "zero matching articles". Only `eu` is filled: a blanket fillna(0) would
# also overwrite missing values in `total` and in the date columns.
total['eu'] = total['eu'].fillna(0) / total['total']
total.describe()

Unnamed: 0,total,bsu,eu
count,2650.0,2650.0,2650.0
mean,86.010189,0.001028,0.065548
std,33.904273,0.004171,0.039106
min,18.0,0.0,0.0
25%,57.0,0.0,0.039604
50%,89.0,0.0,0.0625
75%,111.0,0.0,0.086871
max,157.0,0.071429,0.318182


In [8]:
# 4. COVID-19
# 4.1 Load the news that contains the COVID-19 keywords in the full text.
# The file name is "COVID-19.csv".
# See the file "rmrb_download.ipynb" for the detailed code.
covid = pd.read_csv("COVID-19.csv")
covid['Date'] = pd.to_datetime(covid['Date'])
# Keep only the articles whose title contains at least one of the characters
# 清, 防 or 抗 (roughly "clear", "prevent", "fight"). This is the same test
# as summing three str.find() results and adding 3, written directly.
# na=False treats a missing title as "no match" instead of raising.
is_policy_title = covid['title'].str.contains("清|防|抗", na=False)
# Each remaining row is one article, so the daily `Covid_policy` count is the
# number of rows per date. Counting explicitly avoids assigning into a
# filtered slice and avoids summing the text columns of the file.
covid = covid[is_policy_title].groupby('Date').size().reset_index(name='Covid_policy')

In [9]:
# 4.2 Merge the daily COVID-19 article counts into the daily totals.
# Calculate the daily COVID-19 news frequency (count divided by that day's
# total number of news), ranging from 0 to 1.
# Only the `Date` key and the `Covid_policy` count are taken from `covid`, so
# no other column of that frame can leak into `total`.
total = pd.merge(total, covid[['Date', 'Covid_policy']], on='Date', how='left').sort_values(by='Date')
# Days that have no row in `covid` come out of the left merge as NaN and mean
# "zero matching articles". Only `Covid_policy` is filled: a blanket fillna(0)
# would also overwrite missing values in `total` and in the date columns.
total['Covid_policy'] = total['Covid_policy'].fillna(0) / total['total']
total.describe()

Unnamed: 0,total,bsu,eu,Covid_policy
count,2650.0,2650.0,2650.0,2650.0
mean,86.010189,0.001028,0.065548,0.015355
std,33.904273,0.004171,0.039106,0.040811
min,18.0,0.0,0.0,0.0
25%,57.0,0.0,0.039604,0.0
50%,89.0,0.0,0.0625,0.0
75%,111.0,0.0,0.086871,0.011111
max,157.0,0.071429,0.318182,0.466667


In [10]:
# 5. Aggregate the daily values to monthly means and export them.
# numeric_only=True restricts the average to the numeric columns, so the
# datetime `Date` column is never averaged into the output regardless of the
# pandas version in use.
# The merge is an inner join on `Month`, so only months present in both the
# monthly index `df` and the daily data are kept.
df = pd.merge(df, total.groupby('Month').mean(numeric_only=True), on='Month')
# 'utf-8-sig' writes a byte-order mark at the start of the file;
# index=False leaves the row index out of the CSV.
df.to_csv("news_frequency.csv", encoding='utf-8-sig', index=False)