# Aggregating Financial News Data

### Audhi Aprilliant

## 1 Import Libraries 

In [18]:
import pandas as pd             # Dataframe manipulation
import numpy as np              # Mathematics operation
import re                       # Regular expression
from datetime import datetime   # Date time manipulation

## 2 Import Data

In [19]:
data_okezone = pd.read_csv('Datasets/1 Okezone 2019.csv')  # Okezone
data_kompas = pd.read_csv('Datasets/2 Kompas 2019.csv')    # Kompas
data_detik = pd.read_csv('Datasets/3 Detik 2019.csv')      # Detik

In [20]:
data_okezone.head()

Unnamed: 0,date,title,time,class
0,01 Januari 2019,5 Alasan Pilih Kos-kosan Sebagai Resolusi Bisn...,20:37 WIB,Neutral
1,01 Januari 2019,"Dulu Rp100.000/Liter, Kini 131 Wilayah Nikmati...",18:18 WIB,Positive
2,01 Januari 2019,"Realisasi Anggaran KESDM di Atas 87%, Terbesar...",18:18 WIB,Positive
3,01 Januari 2019,Making Indonesia 4.0 Wujudkan Perekonomian Ter...,16:23 WIB,Positive
4,01 Januari 2019,Ekspor PT Timah Tembus 33.250 M/T pada 2018,16:17 WIB,Positive


In [21]:
print('Dimension of Okezone data: {}'.format(data_okezone.shape[0]))
print('Dimension of Kompas data:  {}'.format(data_kompas.shape[0]))
print('Dimension of Detik data:   {}'.format(data_kompas.shape[0]))

Dimension of Okezone data: 7753
Dimension of Kompas data:  3936
Dimension of Detik data:   3936


In [22]:
print('Number of class in Okezone data:\n{}'.format(data_okezone['class'].value_counts()))
print('\nNumber of class in Kompas data:\n{}'.format(data_kompas['class'].value_counts()))
print('\nNumber of class in Detik data:\n{}'.format(data_detik['class'].value_counts()))

Number of class in Okezone data:
Positive    4532
Negative    2527
Neutral      694
Name: class, dtype: int64

Number of class in Kompas data:
Positive    2140
Negative     991
Neutral      805
Name: class, dtype: int64

Number of class in Detik data:
Positive    3841
Negative    2605
Neutral     1672
Name: class, dtype: int64


## 3 Create Number of Daily Sentiment

### General News - Okezone

In [27]:
# Okezone Data
unique_date_okezone = data_okezone['date'].unique() # Vector of unique date
num_pos_okezone = np.repeat(0,len(unique_date_okezone))
num_neg_okezone = np.repeat(0,len(unique_date_okezone))
num_net_okezone = np.repeat(0,len(unique_date_okezone))
for i in range(len(unique_date_okezone)):
    for j in range(data_okezone.shape[0]):
        if data_okezone['date'][j] == unique_date_okezone[i]:
            if data_okezone['class'][j] == 'Positive':
                num_pos_okezone[i] += 1
            if data_okezone['class'][j] == 'Negative':
                num_neg_okezone[i] += 1
            if data_okezone['class'][j] == 'Neutral':
                num_net_okezone[i] += 1

In [28]:
# Note: axis = 1 means it will be merged in column
total = data_okezone['date'].value_counts()
df_number_senti_okezone = pd.concat([pd.DataFrame(unique_date_okezone),pd.DataFrame(num_pos_okezone),
                                     pd.DataFrame(num_neg_okezone),pd.DataFrame(num_net_okezone)],axis=1)
df_number_senti_okezone.columns = ['date','num_pos_okezone','num_neg_okezone','num_net_okezone']

In [29]:
# Daily volume news of Okezone
vol_okezone = pd.DataFrame(data_okezone['date'].value_counts()).reset_index()
vol_okezone.columns = ['date','total_okezone']

In [30]:
# Merge number of sentiment and total news
df_number_senti_okezone = pd.merge(df_number_senti_okezone,vol_okezone,on='date',how='left')
print('Dimension of Okezone data:{}'.format(df_number_senti_okezone.shape))
df_number_senti_okezone.head()

Dimension of Okezone data:(363, 5)


Unnamed: 0,date,num_pos_okezone,num_neg_okezone,num_net_okezone,total_okezone
0,01 Januari 2019,9,1,2,12
1,02 Januari 2019,20,12,1,33
2,03 Januari 2019,23,7,3,33
3,04 Januari 2019,26,8,3,37
4,05 Januari 2019,10,3,1,14


### JCI News - Okezone

In [32]:
# Okezone Data
ihsg_re = re.compile(r'IHSG')
unique_date_okezone = data_okezone['date'].unique() # Vector of unique date
num_pos_okezone_jci = np.repeat(0,len(unique_date_okezone))
num_neg_okezone_jci = np.repeat(0,len(unique_date_okezone))
num_net_okezone_jci = np.repeat(0,len(unique_date_okezone))
for i in range(len(unique_date_okezone)):
    for j in range(data_okezone.shape[0]):
        con = len(ihsg_re.findall(data_okezone.iloc[j]['title']))
        if data_okezone['date'][j] == unique_date_okezone[i] and con:
            if data_okezone['class'][j] == 'Positive':
                num_pos_okezone_jci[i] += 1
            if data_okezone['class'][j] == 'Negative':
                num_neg_okezone_jci[i] += 1
            if data_okezone['class'][j] == 'Neutral':
                num_net_okezone_jci[i] += 1

In [33]:
# Note: axis = 1 means it will be merged in column
total = data_okezone['date'].value_counts()
df_number_senti_okezone_jci = pd.concat([pd.DataFrame(unique_date_okezone),pd.DataFrame(num_pos_okezone_jci),
                                         pd.DataFrame(num_neg_okezone_jci),pd.DataFrame(num_net_okezone_jci)],
                                         axis=1)
df_number_senti_okezone_jci.columns = ['date','num_pos_okezone_jci','num_neg_okezone_jci','num_net_okezone_jci']

In [34]:
# Merge number of sentiment and total news
df_number_senti_okezone_jci['total_okezone_jci'] = df_number_senti_okezone_jci['num_pos_okezone_jci']+\
df_number_senti_okezone_jci['num_neg_okezone_jci']+df_number_senti_okezone_jci['num_net_okezone_jci']
print('Dimension of JCI Okezone data:{}'.format(df_number_senti_okezone_jci.shape))
df_number_senti_okezone_jci.head()

Dimension of JCI Okezone data:(363, 5)


Unnamed: 0,date,num_pos_okezone_jci,num_neg_okezone_jci,num_net_okezone_jci,total_okezone_jci
0,01 Januari 2019,1,0,0,1
1,02 Januari 2019,3,2,0,5
2,03 Januari 2019,3,1,0,4
3,04 Januari 2019,2,1,1,4
4,05 Januari 2019,0,0,0,0


In [35]:
df_okezone = pd.merge(df_number_senti_okezone,df_number_senti_okezone_jci,on='date',how='left')
df_okezone.head(3)

Unnamed: 0,date,num_pos_okezone,num_neg_okezone,num_net_okezone,total_okezone,num_pos_okezone_jci,num_neg_okezone_jci,num_net_okezone_jci,total_okezone_jci
0,01 Januari 2019,9,1,2,12,1,0,0,1
1,02 Januari 2019,20,12,1,33,3,2,0,5
2,03 Januari 2019,23,7,3,33,3,1,0,4


### General News - Kompas

In [36]:
# Kompas Data
unique_date_kompas = data_kompas['date'].unique() # Vector of unique date
num_pos_kompas = np.repeat(0,len(unique_date_kompas))
num_neg_kompas = np.repeat(0,len(unique_date_kompas))
num_net_kompas = np.repeat(0,len(unique_date_kompas))
for i in range(len(unique_date_kompas)):
    for j in range(data_kompas.shape[0]):
        if data_kompas['date'][j] == unique_date_kompas[i]:
            if data_kompas['class'][j] == 'Positive':
                num_pos_kompas[i] += 1
            if data_kompas['class'][j] == 'Negative':
                num_neg_kompas[i] += 1
            if data_kompas['class'][j] == 'Neutral':
                num_net_kompas[i] += 1

In [37]:
# Note: axis = 1 means it will be merged in column
total = data_kompas['date'].value_counts()
df_number_senti_kompas = pd.concat([pd.DataFrame(unique_date_kompas),pd.DataFrame(num_pos_kompas),
                                    pd.DataFrame(num_neg_kompas),pd.DataFrame(num_net_kompas)],axis=1)
# Daily of volume news
df_number_senti_kompas.columns = ['date','num_pos_kompas','num_neg_kompas','num_net_kompas']

In [38]:
# Volume news of Kompas
vol_kompas = pd.DataFrame(data_kompas['date'].value_counts()).reset_index()
vol_kompas.columns = ['date','total_kompas']

In [39]:
# Merge number of sentiment and total news
df_number_senti_kompas = pd.merge(df_number_senti_kompas,vol_kompas,on='date',how='left')
print('Dimension of Kompas data:{}'.format(df_number_senti_kompas.shape))
df_number_senti_kompas.head()

Dimension of Kompas data:(360, 5)


Unnamed: 0,date,num_pos_kompas,num_neg_kompas,num_net_kompas,total_kompas
0,01 Januari 2019,0,1,2,3
1,02 Januari 2019,7,6,1,14
2,03 Januari 2019,14,1,2,17
3,04 Januari 2019,8,2,1,11
4,05 Januari 2019,1,0,1,2


### JCI News - Kompas

In [40]:
# Kompas Data
ihsg_re = re.compile(r'IHSG')
unique_date_kompas = data_kompas['date'].unique() # Vector of unique date
num_pos_kompas_jci = np.repeat(0,len(unique_date_kompas))
num_neg_kompas_jci = np.repeat(0,len(unique_date_kompas))
num_net_kompas_jci = np.repeat(0,len(unique_date_kompas))
for i in range(len(unique_date_kompas)):
    for j in range(data_kompas.shape[0]):
        con = len(ihsg_re.findall(data_kompas.iloc[j]['title']))
        if data_kompas['date'][j] == unique_date_kompas[i] and con:
            if data_kompas['class'][j] == 'Positive':
                num_pos_kompas_jci[i] += 1
            if data_kompas['class'][j] == 'Negative':
                num_neg_kompas_jci[i] += 1
            if data_kompas['class'][j] == 'Neutral':
                num_net_kompas_jci[i] += 1

In [41]:
# Note: axis = 1 means it will be merged in column
total = data_kompas['date'].value_counts()
df_number_senti_kompas_jci = pd.concat([pd.DataFrame(unique_date_kompas),pd.DataFrame(num_pos_kompas_jci),
                                        pd.DataFrame(num_neg_kompas_jci),pd.DataFrame(num_net_kompas_jci)],axis=1)
df_number_senti_kompas_jci.columns = ['date','num_pos_kompas_jci','num_neg_kompas_jci','num_net_kompas_jci']

In [42]:
# Merge number of sentiment and total news
df_number_senti_kompas_jci['total_kompas_jci'] = df_number_senti_kompas_jci['num_pos_kompas_jci']+\
df_number_senti_kompas_jci['num_neg_kompas_jci']+df_number_senti_kompas_jci['num_net_kompas_jci']
print('Dimension of JCI Kompas data:{}'.format(df_number_senti_kompas_jci.shape))
df_number_senti_kompas_jci.head()

Dimension of JCI Kompas data:(360, 5)


Unnamed: 0,date,num_pos_kompas_jci,num_neg_kompas_jci,num_net_kompas_jci,total_kompas_jci
0,01 Januari 2019,0,0,0,0
1,02 Januari 2019,1,0,0,1
2,03 Januari 2019,0,0,0,0
3,04 Januari 2019,0,0,0,0
4,05 Januari 2019,0,0,0,0


In [43]:
df_kompas = pd.merge(df_number_senti_kompas,df_number_senti_kompas_jci,on='date',how='left')
df_kompas.head(3)

Unnamed: 0,date,num_pos_kompas,num_neg_kompas,num_net_kompas,total_kompas,num_pos_kompas_jci,num_neg_kompas_jci,num_net_kompas_jci,total_kompas_jci
0,01 Januari 2019,0,1,2,3,0,0,0,0
1,02 Januari 2019,7,6,1,14,1,0,0,1
2,03 Januari 2019,14,1,2,17,0,0,0,0


### General News - Detik

In [44]:
# Detik Data
unique_date_detik = data_detik['date'].unique() # Vector of unique date
num_pos_detik = np.repeat(0,len(unique_date_detik))
num_neg_detik = np.repeat(0,len(unique_date_detik))
num_net_detik = np.repeat(0,len(unique_date_detik))
for i in range(len(unique_date_detik)):
    for j in range(data_detik.shape[0]):
        if data_detik['date'][j] == unique_date_detik[i]:
            if data_detik['class'][j] == 'Positive':
                num_pos_detik[i] += 1
            if data_detik['class'][j] == 'Negative':
                num_neg_detik[i] += 1
            if data_detik['class'][j] == 'Neutral':
                num_net_detik[i] += 1

In [45]:
# Note: axis = 1 means it will be merged in column
total = data_detik['date'].value_counts()
df_number_senti_detik = pd.concat([pd.DataFrame(unique_date_detik),pd.DataFrame(num_pos_detik),
                                   pd.DataFrame(num_neg_detik),pd.DataFrame(num_net_detik)],axis=1)
# Daily of volume news
df_number_senti_detik.columns = ['date','num_pos_detik','num_neg_detik','num_net_detik']

In [46]:
# Volume news of Kompas
vol_detik = pd.DataFrame(data_detik['date'].value_counts()).reset_index()
vol_detik.columns = ['date','total_detik']

In [47]:
# Merge number of sentiment and total news
df_number_senti_detik = pd.merge(df_number_senti_detik,vol_detik,on='date',how='left')
print('Dimension of Detik data:{}'.format(df_number_senti_detik.shape))
df_number_senti_detik.head()

Dimension of Detik data:(363, 5)


Unnamed: 0,date,num_pos_detik,num_neg_detik,num_net_detik,total_detik
0,01 Januari 2019,3,1,4,8
1,02 Januari 2019,17,10,5,32
2,03 Januari 2019,12,5,6,23
3,04 Januari 2019,22,9,5,36
4,05 Januari 2019,7,3,1,11


### JCI News - Detik

In [48]:
# Detik Data
ihsg_re = re.compile(r'IHSG')
unique_date_detik = data_detik['date'].unique() # Vector of unique date
num_pos_detik_jci = np.repeat(0,len(unique_date_detik))
num_neg_detik_jci = np.repeat(0,len(unique_date_detik))
num_net_detik_jci = np.repeat(0,len(unique_date_detik))
for i in range(len(unique_date_detik)):
    for j in range(data_detik.shape[0]):
        con = len(ihsg_re.findall(data_detik.iloc[j]['title']))
        if data_detik['date'][j] == unique_date_detik[i] and con:
            if data_detik['class'][j] == 'Positive':
                num_pos_detik_jci[i] += 1
            if data_detik['class'][j] == 'Negative':
                num_neg_detik_jci[i] += 1
            if data_detik['class'][j] == 'Neutral':
                num_net_detik_jci[i] += 1

In [75]:
# Note: axis = 1 means it will be merged in column
total = data_detik['date'].value_counts()
df_number_senti_detik_jci = pd.concat([pd.DataFrame(unique_date_detik),pd.DataFrame(num_pos_detik_jci),
                                       pd.DataFrame(num_neg_detik_jci),pd.DataFrame(num_net_detik_jci)],axis=1)
df_number_senti_detik_jci.columns = ['date','num_pos_detik_jci','num_neg_detik_jci','num_net_detik_jci']

In [76]:
# Merge number of sentiment and total news
df_number_senti_detik_jci['total_detik_jci'] = df_number_senti_detik_jci['num_pos_detik_jci']+\
df_number_senti_detik_jci['num_neg_detik_jci']+df_number_senti_detik_jci['num_net_detik_jci']
print('Dimension of JCI Detik data:{}'.format(df_number_senti_detik_jci.shape))
df_number_senti_detik_jci.head()

Dimension of JCI Detik data:(363, 5)


Unnamed: 0,date,num_pos_detik_jci,num_neg_detik_jci,num_net_detik_jci,total_detik_jci
0,01 Januari 2019,0,0,0,0
1,02 Januari 2019,3,2,0,5
2,03 Januari 2019,4,0,0,4
3,04 Januari 2019,3,1,0,4
4,05 Januari 2019,0,0,0,0


In [77]:
df_detik = pd.merge(df_number_senti_detik,df_number_senti_detik_jci,on='date',how='left')
df_detik.head(3)

Unnamed: 0,date,num_pos_detik,num_neg_detik,num_net_detik,total_detik,num_pos_detik_jci,num_neg_detik_jci,num_net_detik_jci,total_detik_jci
0,01 Januari 2019,3,1,4,8,0,0,0,0
1,02 Januari 2019,17,10,5,32,3,2,0,5
2,03 Januari 2019,12,5,6,23,4,0,0,4


In [78]:
# List of date
start = datetime.strptime('2019-01-01','%Y-%m-%d')
end = datetime.strptime('2019-12-31','%Y-%m-%d')
list_date = pd.date_range(start,end).strftime('%d %m %Y').tolist()
# Modify list of date
list_date_new = []
for i in range(len(list_date)):
    c = []
    a = list_date[i].split()[1]
    if a == '01': c = 'Januari'
    if a == '02': c = 'Februari'
    if a == '03': c = 'Maret'
    if a == '04': c = 'April'
    if a == '05': c = 'Mei'
    if a == '06': c = 'Juni'
    if a == '07': c = 'Juli'
    if a == '08': c = 'Agustus'
    if a == '09': c = 'September'
    if a == '10': c = 'Oktober'
    if a == '11': c = 'November'
    if a == '12': c = 'Desember'
    d = list_date[i].split()[0]+' '+c+' '+list_date[i].split()[2]
    list_date_new.append(d)

In [79]:
list_date_new = pd.Series(list_date_new,name='date')

In [80]:
# Merge all financial news
df_news = pd.merge(list_date_new,df_okezone,on='date',how='left')
df_news = pd.merge(df_news,df_kompas,on='date',how='left')
df_news = pd.merge(df_news,df_detik,on='date',how='left')
print('Dimension of financial news data:{}'.format(df_news.shape))
# Cgeck missing value or NaN
print(df_news.isnull().sum())
# Check list which is not included
print('\nOkezone:\n{}'.format(list_date_new[~list_date_new.isin(df_okezone['date'])]))
print('\nKompas:\n{}'.format(list_date_new[~list_date_new.isin(df_kompas['date'])]))
print('\nDetik:\n{}'.format(list_date_new[~list_date_new.isin(df_detik['date'])]))

Dimension of financial news data:(365, 25)
date                   0
num_pos_okezone        2
num_neg_okezone        2
num_net_okezone        2
total_okezone          2
num_pos_okezone_jci    2
num_neg_okezone_jci    2
num_net_okezone_jci    2
total_okezone_jci      2
num_pos_kompas         5
num_neg_kompas         5
num_net_kompas         5
total_kompas           5
num_pos_kompas_jci     5
num_neg_kompas_jci     5
num_net_kompas_jci     5
total_kompas_jci       5
num_pos_detik          2
num_neg_detik          2
num_net_detik          2
total_detik            2
num_pos_detik_jci      2
num_neg_detik_jci      2
num_net_detik_jci      2
total_detik_jci        2
dtype: int64

Okezone:
152        02 Juni 2019
364    31 Desember 2019
Name: date, dtype: object

Kompas:
11      12 Januari 2019
40     10 Februari 2019
166        16 Juni 2019
340    07 Desember 2019
364    31 Desember 2019
Name: date, dtype: object

Detik:
341    08 Desember 2019
364    31 Desember 2019
Name: date, dtype: objec

In [81]:
# Fill NaN in Kompas
a = list_date_new[~list_date_new.isin(df_okezone['date'])]
list_var_okezone_na = ['num_neg_okezone','num_pos_okezone','num_net_okezone','total_okezone',
                       'num_neg_okezone_jci','num_pos_okezone_jci','num_net_okezone_jci','total_okezone_jci']
for i in list_var_okezone_na:
    df_news['{}'.format(i)].fillna(0, inplace=True)
# Check NaN
df_news.iloc[a.index][list_var_okezone_na]

Unnamed: 0,num_neg_okezone,num_pos_okezone,num_net_okezone,total_okezone,num_neg_okezone_jci,num_pos_okezone_jci,num_net_okezone_jci,total_okezone_jci
152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
# Fill NaN in Kompas
b = list_date_new[~list_date_new.isin(df_kompas['date'])]
list_var_kompas_na = ['num_neg_kompas','num_pos_kompas','num_net_kompas','total_kompas',
                      'num_neg_kompas_jci','num_pos_kompas_jci','num_net_kompas_jci','total_kompas_jci']
for i in list_var_kompas_na:
    df_news['{}'.format(i)].fillna(0, inplace=True)
# Check NaN
df_news.iloc[b.index][list_var_kompas_na]

Unnamed: 0,num_neg_kompas,num_pos_kompas,num_net_kompas,total_kompas,num_neg_kompas_jci,num_pos_kompas_jci,num_net_kompas_jci,total_kompas_jci
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
340,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
# Fill NaN in Detik
c = list_date_new[~list_date_new.isin(df_detik['date'])]
list_var_detik_na = ['num_neg_detik','num_pos_detik','num_net_detik','total_detik',
                     'num_neg_detik_jci','num_pos_detik_jci','num_net_detik_jci','total_detik_jci']
for i in list_var_detik_na:
    df_news['{}'.format(i)].fillna(0, inplace=True)
# Check NaN
df_news.iloc[c.index][list_var_detik_na]

Unnamed: 0,num_neg_detik,num_pos_detik,num_net_detik,total_detik,num_neg_detik_jci,num_pos_detik_jci,num_net_detik_jci,total_detik_jci
341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
# Data with no NaN
print(df_news.isnull().sum())
df_news.head(3)

date                   0
num_pos_okezone        0
num_neg_okezone        0
num_net_okezone        0
total_okezone          0
num_pos_okezone_jci    0
num_neg_okezone_jci    0
num_net_okezone_jci    0
total_okezone_jci      0
num_pos_kompas         0
num_neg_kompas         0
num_net_kompas         0
total_kompas           0
num_pos_kompas_jci     0
num_neg_kompas_jci     0
num_net_kompas_jci     0
total_kompas_jci       0
num_pos_detik          0
num_neg_detik          0
num_net_detik          0
total_detik            0
num_pos_detik_jci      0
num_neg_detik_jci      0
num_net_detik_jci      0
total_detik_jci        0
dtype: int64


Unnamed: 0,date,num_pos_okezone,num_neg_okezone,num_net_okezone,total_okezone,num_pos_okezone_jci,num_neg_okezone_jci,num_net_okezone_jci,total_okezone_jci,num_pos_kompas,...,num_net_kompas_jci,total_kompas_jci,num_pos_detik,num_neg_detik,num_net_detik,total_detik,num_pos_detik_jci,num_neg_detik_jci,num_net_detik_jci,total_detik_jci
0,01 Januari 2019,9.0,1.0,2.0,12.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,3.0,1.0,4.0,8.0,0.0,0.0,0.0,0.0
1,02 Januari 2019,20.0,12.0,1.0,33.0,3.0,2.0,0.0,5.0,7.0,...,0.0,1.0,17.0,10.0,5.0,32.0,3.0,2.0,0.0,5.0
2,03 Januari 2019,23.0,7.0,3.0,33.0,3.0,1.0,0.0,4.0,14.0,...,0.0,0.0,12.0,5.0,6.0,23.0,4.0,0.0,0.0,4.0


## 4 Create Day and Month

In [85]:
month_re = re.compile(r'[a-zA-Z]+')
date_re = re.compile(r'^[0-9]{2}')
month_list = []
# Get all months in the data
for i in range(df_news.shape[0]):
    matches = month_re.findall(df_news.iloc[i]['date'])
    month_list.append(matches[0])
np.unique(month_list)

array(['Agustus', 'April', 'Desember', 'Februari', 'Januari', 'Juli',
       'Juni', 'Maret', 'Mei', 'November', 'Oktober', 'September'],
      dtype='<U9')

In [86]:
# Add column month
df_news.insert(loc = 1, column = 'month', value = month_list)
df_news.columns

Index(['date', 'month', 'num_pos_okezone', 'num_neg_okezone',
       'num_net_okezone', 'total_okezone', 'num_pos_okezone_jci',
       'num_neg_okezone_jci', 'num_net_okezone_jci', 'total_okezone_jci',
       'num_pos_kompas', 'num_neg_kompas', 'num_net_kompas', 'total_kompas',
       'num_pos_kompas_jci', 'num_neg_kompas_jci', 'num_net_kompas_jci',
       'total_kompas_jci', 'num_pos_detik', 'num_neg_detik', 'num_net_detik',
       'total_detik', 'num_pos_detik_jci', 'num_neg_detik_jci',
       'num_net_detik_jci', 'total_detik_jci'],
      dtype='object')

In [87]:
# Add date in formal format
dict_month = {'01':'Januari','02':'Februari','03':'Maret','04':'April','05':'Mei','06':'Juni',
              '07':'Juli','08':'Agustus','09':'September','10':'Oktober','11':'November','12':'Desember'}
day_re = re.compile(r'^[0-9]{2}')
year_re = re.compile(r'[0-9]{4}')
date_list = []
for i in range(df_news.shape[0]):
    day   = day_re.findall(df_news.iloc[i]['date'])
    month = month_re.findall(df_news.iloc[i]['date'])
    year  = year_re.findall(df_news.iloc[i]['date'])
    # Convert month in string to numerical value
    for num,month_name in dict_month.items():
        if month_name == month[0]:
            month_num = num
    date_new = day[0] + '-' + month_num + '-' + year[0]
    date_list.append(date_new)
print('Sample of date: {}'.format(date_list[1]))

Sample of date: 02-01-2019


In [88]:
df_news.insert(loc = 1, column = 'date_format', value = date_list)
df_news['date_format'] = pd.to_datetime(df_news['date_format'],format = '%d-%m-%Y')

In [89]:
df_news.tail()

Unnamed: 0,date,date_format,month,num_pos_okezone,num_neg_okezone,num_net_okezone,total_okezone,num_pos_okezone_jci,num_neg_okezone_jci,num_net_okezone_jci,...,num_net_kompas_jci,total_kompas_jci,num_pos_detik,num_neg_detik,num_net_detik,total_detik,num_pos_detik_jci,num_neg_detik_jci,num_net_detik_jci,total_detik_jci
360,27 Desember 2019,2019-12-27,Desember,11.0,8.0,3.0,22.0,4.0,0.0,0.0,...,0.0,0.0,14.0,11.0,14.0,39.0,5.0,0.0,0.0,5.0
361,28 Desember 2019,2019-12-28,Desember,0.0,5.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,3.0,5.0,9.0,17.0,1.0,0.0,1.0,2.0
362,29 Desember 2019,2019-12-29,Desember,2.0,1.0,1.0,4.0,1.0,0.0,0.0,...,0.0,0.0,1.0,3.0,8.0,12.0,0.0,0.0,0.0,0.0
363,30 Desember 2019,2019-12-30,Desember,9.0,4.0,3.0,16.0,2.0,3.0,0.0,...,1.0,2.0,12.0,13.0,12.0,37.0,2.0,6.0,0.0,8.0
364,31 Desember 2019,2019-12-31,Desember,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
df_news.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365 entries, 0 to 364
Data columns (total 27 columns):
date                   365 non-null object
date_format            365 non-null datetime64[ns]
month                  365 non-null object
num_pos_okezone        365 non-null float64
num_neg_okezone        365 non-null float64
num_net_okezone        365 non-null float64
total_okezone          365 non-null float64
num_pos_okezone_jci    365 non-null float64
num_neg_okezone_jci    365 non-null float64
num_net_okezone_jci    365 non-null float64
total_okezone_jci      365 non-null float64
num_pos_kompas         365 non-null float64
num_neg_kompas         365 non-null float64
num_net_kompas         365 non-null float64
total_kompas           365 non-null float64
num_pos_kompas_jci     365 non-null float64
num_neg_kompas_jci     365 non-null float64
num_net_kompas_jci     365 non-null float64
total_kompas_jci       365 non-null float64
num_pos_detik          365 non-null float64
num_neg_de

In [91]:
# Add day
day_name = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
day_news = []
for i in range(df_news.shape[0]):
    c = df_news['date_format'][i].weekday()
    day_news.append(day_name[c])

In [92]:
df_news.insert(loc = 2, column = 'day', value = day_news)

In [93]:
df_news.head(3)

Unnamed: 0,date,date_format,day,month,num_pos_okezone,num_neg_okezone,num_net_okezone,total_okezone,num_pos_okezone_jci,num_neg_okezone_jci,...,num_net_kompas_jci,total_kompas_jci,num_pos_detik,num_neg_detik,num_net_detik,total_detik,num_pos_detik_jci,num_neg_detik_jci,num_net_detik_jci,total_detik_jci
0,01 Januari 2019,2019-01-01,Tuesday,Januari,9.0,1.0,2.0,12.0,1.0,0.0,...,0.0,0.0,3.0,1.0,4.0,8.0,0.0,0.0,0.0,0.0
1,02 Januari 2019,2019-01-02,Wednesday,Januari,20.0,12.0,1.0,33.0,3.0,2.0,...,0.0,1.0,17.0,10.0,5.0,32.0,3.0,2.0,0.0,5.0
2,03 Januari 2019,2019-01-03,Thursday,Januari,23.0,7.0,3.0,33.0,3.0,1.0,...,0.0,0.0,12.0,5.0,6.0,23.0,4.0,0.0,0.0,4.0


## Save the Data

In [94]:
df_news.to_csv('Datasets/interim/Aggregation Data of Financial News.csv',index=False)