<a href="https://colab.research.google.com/github/angel870326/Monthly-Revenue/blob/main/003_data_updating_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

> 2023.03.03 Ssu-Yun Wang<br/>
[Github @angel870326](https://github.com/angel870326)

# **Update Missing Values from TWSE MOPS**

### Contents

1.  Read Data
    *   Original Data
    *   Data to be Updated
2.  Update Missing Values from TWSE MOPS
3.  Concatenate Datasets
4.  Output Data







In [1]:
# sConnect to the Google Drive
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import os
import pandas as pd
import numpy as np

## **1. Read Data**


In [3]:
# Data path
original_data_path = '/content/gdrive/Shareddrives/Me/論文/資料集/001_v1'
updated_data_path = '/content/gdrive/Shareddrives/Me/論文/資料集/002_v1'

### **1.1 Original Data**


In [4]:
# 金融業：中間無缺值與有缺值
data_fin0 = pd.read_excel(os.path.join(original_data_path, '198801-202212上市櫃公司月營收_金融業_中間無缺值.xlsx'), index_col=0)
data_fin1 = pd.read_excel(os.path.join(original_data_path, '198801-202212上市櫃公司月營收_金融業_中間有缺值.xlsx'), index_col=0)
print("Data shape:", data_fin0.shape)
print("Data shape:", data_fin1.shape)

Data shape: (40, 420)
Data shape: (2, 420)


In [5]:
# 非金融業：中間無缺值與有缺值
data_nonfin0 = pd.read_excel(os.path.join(original_data_path, '198801-202212上市櫃公司月營收_非金融業_中間無缺值.xlsx'), index_col=0)
data_nonfin1 = pd.read_excel(os.path.join(original_data_path, '198801-202212上市櫃公司月營收_非金融業_中間有缺值.xlsx'), index_col=0)
print("Data shape:", data_nonfin0.shape)
print("Data shape:", data_nonfin1.shape)

Data shape: (1595, 420)
Data shape: (143, 420)


In [6]:
data_nonfin1.head()

Unnamed: 0_level_0,1988-01,1988-02,1988-03,1988-04,1988-05,1988-06,1988-07,1988-08,1988-09,1988-10,...,2022-03,2022-04,2022-05,2022-06,2022-07,2022-08,2022-09,2022-10,2022-11,2022-12
公司,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1240 茂生農經,,,,,,,,,,,...,274367,290585,292443,292475,262129,281860.0,281078,312492,286961,309199
1256 鮮活果汁-KY,,,,,,,,,,,...,333241,194594,399569,409929,475276,581403.0,482664,142872,183937,340614
1337 再生-KY,,,,,,,,,,,...,30368,53408,45114,48002,53666,60295.0,65786,48082,53325,73922
1338 廣華-KY,,,,,,,,,,,...,722919,417500,509475,685917,705668,617345.0,635751,601039,602967,464981
1339 昭輝,,,,,,,,,,,...,160802,177505,166991,155001,179434,175239.0,151164,170130,177450,154272


### **1.2 Data to be Updated**

From *002_data_missing_v1.ipynb*


In [7]:
# 補值列表
fin_nan_month = pd.read_excel(os.path.join(updated_data_path,'上市櫃公司月營收_金融業_補值.xlsx'))
nonfin_nan_month = pd.read_excel(os.path.join(updated_data_path,'上市櫃公司月營收_非金融業_補值.xlsx'))
print("金融業", fin_nan_month.shape)
print("非金融業", nonfin_nan_month.shape)

金融業 (143, 3)
非金融業 (2643, 3)


In [8]:
nonfin_nan_month.head()

Unnamed: 0,公司,年月,當月營收（千元）
0,1240 茂生農經,2004-10,
1,1240 茂生農經,2004-11,
2,1240 茂生農經,2004-12,
3,1240 茂生農經,2005-01,
4,1240 茂生農經,2005-02,


## **2. Update Missing Values from TWSE MOPS**

根據補值列表更新原始檔案

In [9]:
# 計算原始檔案有多少缺失值（用於後續確認是否正確補值）
fin_nan_count_o = data_fin1.isna().sum().sum()
nonfin_nan_count_o = data_nonfin1.isna().sum().sum()

# 計算需補值的數量（補值表中不是空值的個數）
fin_to_update = fin_nan_month['當月營收（千元）'].notna().sum()
nonfin_to_update = nonfin_nan_month['當月營收（千元）'].notna().sum()

print("   data_fin1 NaN 總數：{}   待補值數：{}\ndata_nonfin1 NaN 總數：{} 待補值數：{}".format(fin_nan_count_o, fin_to_update, nonfin_nan_count_o, nonfin_to_update))

   data_fin1 NaN 總數：431   待補值數：10
data_nonfin1 NaN 總數：32882 待補值數：439


In [10]:
# Update the original data
def updateMissingData(originalD: pd.DataFrame, updatedD: pd.DataFrame):
  for index, row in updatedD.iterrows():
    revenue = row['當月營收（千元）']
    if ~np.isnan(revenue): # if not NaN
      originalD.at[row['公司'], row['年月']] = revenue

updateMissingData(data_fin1, fin_nan_month)
updateMissingData(data_nonfin1, nonfin_nan_month)

In [11]:
# 計算補完值後有多少缺失值
fin_nan_count_new = data_fin1.isna().sum().sum()
nonfin_nan_count_new = data_nonfin1.isna().sum().sum()

# 確認是否正確補值
check_fin = check_nonfin = "error"
if fin_nan_count_new == fin_nan_count_o - fin_to_update:
  check_fin = "correct"
if nonfin_nan_count_new == nonfin_nan_count_o - nonfin_to_update:
  check_nonfin = "correct"

print("   data_fin1 更新後 NaN 總數：{}   ({})\ndata_nonfin1 更新後 NaN 總數：{} ({})".format(fin_nan_count_o, check_fin, nonfin_nan_count_o, check_nonfin))

   data_fin1 更新後 NaN 總數：431   (correct)
data_nonfin1 更新後 NaN 總數：32882 (correct)


## **3. Concatenate Datasets**

data_fin_new & data_nonfin_new


In [12]:
# Sort the dataset by company id after concatenating
data_fin_new = pd.concat([data_fin0, data_fin1]).sort_index(ascending=True)
data_nonfin_new = pd.concat([data_nonfin0, data_nonfin1]).sort_index(ascending=True)

data_fin_new

Unnamed: 0_level_0,1988-01,1988-02,1988-03,1988-04,1988-05,1988-06,1988-07,1988-08,1988-09,1988-10,...,2022-03,2022-04,2022-05,2022-06,2022-07,2022-08,2022-09,2022-10,2022-11,2022-12
公司,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2801 彰銀,2028646.0,1933402.0,2369034.0,2024382.0,2405514.0,2517314.0,2285566.0,2399543.0,2354405.0,2683718.0,...,2822015,2930960,2575378,3025853,3549913,3265931,2360910,2707237,3149417,3143300
2809 京城銀,100860.0,102567.0,105262.0,107897.0,113453.0,103138.0,115815.0,119916.0,122163.0,122763.0,...,-347984,120220,888099,106096,1136280,905008,-818107,525434,1159759,729033
2812 台中銀,198237.0,185151.0,203855.0,198102.0,208537.0,215686.0,216227.0,220798.0,221718.0,229814.0,...,1265168,1080334,1262711,1251588,1436922,1440249,1218436,1186838,1340389,1338751
2816 旺旺保,,,,,,,,,,,...,740703,564882,743451,540176,813502,758209,682127,737689,821500,810501
2820 華票,,,,,,,,,,,...,149300,72703,78783,182393,73896,249065,145673,57005,-18652,201445
2832 台產,,,,,,,,,,,...,491847,476766,444289,593308,563873,521635,502544,497959,454014,516059
2834 臺企銀,,,,,,,,,,,...,2049170,2075720,2131903,2511580,2906096,2563993,2393021,2937436,2285810,2627259
2836 高雄銀,,,,,,,,,,,...,350498,320651,315606,395859,309915,303639,295929,301110,312340,308750
2838 聯邦銀,,,,,,,,,,,...,946213,1063726,1383324,975695,1561893,1617823,1297400,1308647,1530112,1393577
2845 遠東銀,,,,,,,,,,,...,866867,1025486,984628,932458,1028245,1128926,709086,1026802,1163514,1189258


## **4. Output Data**



In [13]:
# Output data path
output_data_path = '/content/gdrive/Shareddrives/Me/論文/資料集'

In [14]:
# 中間有缺值的公司補值後的更新檔
data_fin1.to_excel(os.path.join(output_data_path,'198801-202212上市櫃公司月營收_金融業_中間有缺值_new.xlsx'))
data_nonfin1.to_excel(os.path.join(output_data_path,'198801-202212上市櫃公司月營收_非金融業_中間有缺值_new.xlsx'))
print("Data shape:", data_fin1.shape)
print("Data shape:", data_nonfin1.shape)

Data shape: (2, 420)
Data shape: (143, 420)


In [15]:
# 最終合併檔
data_fin_new.to_excel(os.path.join(output_data_path,'198801-202212上市櫃公司月營收_金融業_final.xlsx'))
data_nonfin_new.to_excel(os.path.join(output_data_path,'198801-202212上市櫃公司月營收_非金融業_final.xlsx'))
print("Data shape:", data_fin_new.shape)
print("Data shape:", data_nonfin_new.shape)

Data shape: (42, 420)
Data shape: (1738, 420)
