## NOTE

#### 100만 개 데이터 기준

1. 몽고디비에서 'plbc.ContainerIoResult', 'plbc.ContainerInOut' 두 컬렉션의 전체 데이터 가져오기
2. plbcContainerIoResult는 'copionSeq'기준, plbcContainerInOut는 '_id' 기준으로 join 진행
3. join 결과 csv파일로 저장

    >>> 각 단계별로 시간 얼마나 걸리는지 체크할 것.

In [1]:
from pymongo import MongoClient
import pymongo
import pandas as pd
import numpy as np
import time
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor
import json

In [2]:
mongodb_URI = "mongodb://splia:splia123!%40%23@211.215.18.231:27018/?authSource=admin&readPreference=primary&appname=MongoDB%20Compass&ssl=false"
client = MongoClient(mongodb_URI)
db = client.plbc

In [3]:
def get_all_data_in_chunks(collection):
    cursor = collection.find().sort('rgstDem', pymongo.DESCENDING).allow_disk_use(True).limit(1000000)
    while True:
        chunk = list(cursor)
        if not chunk:
            break
        yield chunk

def process_chunk(chunk):
    return [{fieldname: entry.get(fieldname, '') for fieldname in fieldnames} for entry in chunk]


---

# 1. 'plbc.ContainerIoResult', 'plbc.ContainerInOut' 전체 데이터 가져오기

## 1-1. plbcContainerIoResult

In [4]:
collection = db.plbcContainerIoResult

sample_document = collection.find_one()
fieldnames = set(sample_document.keys()) if sample_document else set()

start_time = time.time()
chunks = get_all_data_in_chunks(collection)
data = []
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_chunk, chunk) for chunk in chunks]
    for future in futures:
        data.extend(future.result())
end_time = time.time()

loading_time = end_time - start_time
print("plbcContainerIoResult 데이터 로딩하는 데에 걸린 시간:", loading_time, "초")

plbcContainerIoResult 데이터 로딩하는 데에 걸린 시간: 64.57421779632568 초


In [5]:
# 데이터프레임 변환 시간
start_time = time.time()
plbcContainerIoResult = pd.DataFrame(data)
end_time = time.time()
changing_time = end_time - start_time
print("데이터프레임으로 변환하는 데에 걸린 시간:", changing_time, "초")

데이터프레임으로 변환하는 데에 걸린 시간: 1.2933681011199951 초


In [6]:
## 데이터 로딩 + 데이터프레임 변환 시간 합계
IoResult_total = loading_time + changing_time
IoResult_total

65.86758589744568

## 1-2. plbcContainerInOut

In [7]:
collection = db.plbcContainerInOut

sample_document = collection.find_one()
fieldnames = set(sample_document.keys()) if sample_document else set()

start_time = time.time()
chunks = get_all_data_in_chunks(collection)
data_InOut = []
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_chunk, chunk) for chunk in chunks]
    for future in futures:
        data_InOut.extend(future.result())
end_time = time.time()

loading_time = end_time - start_time
print("plbcContainerInOut 데이터 로딩하는 데에 걸린 시간:", loading_time, "초")

plbcContainerInOut 데이터 로딩하는 데에 걸린 시간: 123.83998012542725 초


In [8]:
# 데이터프레임 변환 시간
start_time = time.time()
plbcContainerInOut = pd.DataFrame(data_InOut)
end_time = time.time()
changing_time = end_time - start_time
print("데이터프레임으로 변환하는 데에 걸린 시간:", changing_time, "초")

데이터프레임으로 변환하는 데에 걸린 시간: 1.7459089756011963 초


In [9]:
## 데이터 로딩 + 데이터프레임 변환 시간 합계
InOut_total = loading_time + changing_time
InOut_total

125.58588910102844

---

# 2. 데이터 JOIN

#### plbcContainerIoResult는 'copionSeq'기준, plbcContainerInOut는 '_id' 기준으로 join 진행

#### [데이터정보]

1. plbc.ContainerIoResult
      - 전체 데이터 1,000,000개
      - 칼럼 13개
---
2. plbc.ContainerInOut
      - 전체 데이터 1,000,000개
      - 칼럼 24개

In [13]:
## 데이터 join

start_time = time.time()
merged_df = pd.merge(plbcContainerIoResult, plbcContainerInOut, left_on='copionSeq', right_on='_id')
end_time = time.time()

joining_time = end_time - start_time
print("데이터 join시간:", joining_time, "초")

데이터 join시간: 0.8037681579589844 초


In [14]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 37 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   ctiorReusltSts2    1000000 non-null  object        
 1   remark_x           1000000 non-null  object        
 2   rgstId_x           1000000 non-null  object        
 3   chgId_x            1000000 non-null  object        
 4   copionSeq          1000000 non-null  object        
 5   ctiorReusltSts3    1000000 non-null  object        
 6   chgDtm             1000000 non-null  datetime64[ns]
 7   ctiorReusltSts     1000000 non-null  object        
 8   rgstDem_x          1000000 non-null  datetime64[ns]
 9   ctiorReusltSts5    1000000 non-null  object        
 10  _id_x              1000000 non-null  object        
 11  ctiorReusltSts4    1000000 non-null  object        
 12  ctiorResultDhms    1000000 non-null  object        
 13  ctioIoFlag         1000000 n

#### [데이터정보]

1. merged_df
    - 데이터 총 1,000,000개
    - 칼럼 37개

---

# 3. join 결과 csv파일로 저장

In [15]:
start_time = time.time()
merged_df.to_csv('merged_df.csv', index=False)
end_time = time.time()

saving_time = end_time - start_time
print("CSV파일 저장시간:", saving_time, "초")

CSV파일 저장시간: 6.44290828704834 초


In [17]:
pd.read_csv('merged_df.csv')

  pd.read_csv('merged_df.csv')


Unnamed: 0,ctiorReusltSts2,remark_x,rgstId_x,chgId_x,copionSeq,ctiorReusltSts3,chgDtm,ctiorReusltSts,rgstDem_x,ctiorReusltSts5,...,ctioCntrNo1,carrierId,ctioCntr1p2Iso,ctioCntrNo1FmFlag,ctioReserved3,chgId_y,ctioReserved1,ctioFaultDesc,tmnlId,remark_y
0,,,SYSTEM,SYSTEM,bbbef37a562eef7d6104c00e56eafb43,,2024-02-06 15:00:17.101,OK,2024-02-06 15:00:17.101,,...,GAOU6602217,,,5.0,,SYSTEM,,,HPNTC010,
1,,,SYSTEM,SYSTEM,bbb25a24cf25fc9e090c683532bc7176,,2024-02-06 15:00:17.101,OK,2024-02-06 15:00:17.101,,...,,,,4.0,,SYSTEM,,,HPNTC010,
2,,,SYSTEM,SYSTEM,de7de91a1187b4363d16fd9957ac8f9f,,2024-02-06 15:00:17.101,OK,2024-02-06 15:00:17.101,,...,,,,4.0,,SYSTEM,,,HPNTC010,
3,,,SYSTEM,SYSTEM,4c660f46d77c02779365cb2b65e935a6,,2024-02-06 15:00:17.101,OK,2024-02-06 15:00:17.101,,...,BMOU9804188,,,5.0,,SYSTEM,,,HPNTC010,
4,,,SYSTEM,SYSTEM,d8db7a4b2521842f438f4e4aa0dddcb0,,2024-02-06 15:00:17.101,OK,2024-02-06 15:00:17.101,,...,,,,4.0,,SYSTEM,,,HPNTC010,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,,,SYSTEM,SYSTEM,1a54f1e683365dc8d93e7e2c1801a3bc,,2023-11-05 15:00:15.455,OK,2023-11-05 15:00:15.455,,...,TGBU4627859,,,5.0,,SYSTEM,,,PNCOC010,
999996,,,SYSTEM,SYSTEM,bce6383a7c311234136d82ae372dc66d,,2023-11-05 15:00:15.455,OK,2023-11-05 15:00:15.455,,...,MSMU6423243,,,5.0,,SYSTEM,,,PNCOC010,
999997,,,SYSTEM,SYSTEM,bf02c9f7b76c4d6804fdcd5b6031be59,,2023-11-05 15:00:15.455,OK,2023-11-05 15:00:15.455,,...,MRKU6031050,,,5.0,,SYSTEM,,,PNCOC010,
999998,,,SYSTEM,SYSTEM,d28605d392dcc1b7cd0a83b9096fbec9,,2023-11-05 15:00:15.455,OK,2023-11-05 15:00:15.455,,...,TCNU3597723,,,5.0,,SYSTEM,,,PNCOC010,


---

# 04. join 결과 JSON 파일 저장

In [16]:
start_time = time.time()
merged_df.to_json('merged_df.json', orient='records')
end_time = time.time()

saving_time_json = end_time - start_time

print("JSON파일 저장 시간:", saving_time, "초")

JSON파일 저장 시간: 6.44290828704834 초


In [23]:
pd.read_json('merged_df.json')

Unnamed: 0,ctiorReusltSts2,remark_x,rgstId_x,chgId_x,copionSeq,ctiorReusltSts3,chgDtm,ctiorReusltSts,rgstDem_x,ctiorReusltSts5,...,ctioCntrNo1,carrierId,ctioCntr1p2Iso,ctioCntrNo1FmFlag,ctioReserved3,chgId_y,ctioReserved1,ctioFaultDesc,tmnlId,remark_y
0,,,SYSTEM,SYSTEM,bbbef37a562eef7d6104c00e56eafb43,,1707231617101,OK,1707231617101,,...,GAOU6602217,,,5,,SYSTEM,,,HPNTC010,
1,,,SYSTEM,SYSTEM,bbb25a24cf25fc9e090c683532bc7176,,1707231617101,OK,1707231617101,,...,,,,4,,SYSTEM,,,HPNTC010,
2,,,SYSTEM,SYSTEM,de7de91a1187b4363d16fd9957ac8f9f,,1707231617101,OK,1707231617101,,...,,,,4,,SYSTEM,,,HPNTC010,
3,,,SYSTEM,SYSTEM,4c660f46d77c02779365cb2b65e935a6,,1707231617101,OK,1707231617101,,...,BMOU9804188,,,5,,SYSTEM,,,HPNTC010,
4,,,SYSTEM,SYSTEM,d8db7a4b2521842f438f4e4aa0dddcb0,,1707231617101,OK,1707231617101,,...,,,,4,,SYSTEM,,,HPNTC010,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,,,SYSTEM,SYSTEM,1a54f1e683365dc8d93e7e2c1801a3bc,,1699196415455,OK,1699196415455,,...,TGBU4627859,,,5,,SYSTEM,,,PNCOC010,
999996,,,SYSTEM,SYSTEM,bce6383a7c311234136d82ae372dc66d,,1699196415455,OK,1699196415455,,...,MSMU6423243,,,5,,SYSTEM,,,PNCOC010,
999997,,,SYSTEM,SYSTEM,bf02c9f7b76c4d6804fdcd5b6031be59,,1699196415455,OK,1699196415455,,...,MRKU6031050,,,5,,SYSTEM,,,PNCOC010,
999998,,,SYSTEM,SYSTEM,d28605d392dcc1b7cd0a83b9096fbec9,,1699196415455,OK,1699196415455,,...,TCNU3597723,,,5,,SYSTEM,,,PNCOC010,


---

## 04. 전체시간

In [19]:
IoResult_total + InOut_total + joining_time + saving_time

198.70015144348145

In [1]:
198.70015144348145/60

3.311669190724691

- MongoDB에서 Join 후에 가져오는 방식 >> 298초 소요됨.