#### MongoDB에 데이터 insert

필요 라이브러리 사전 설치

In [None]:
#%pip install wheel
#%pip install pipwin
#%pip install geopandas
#%pip install openpyxl
#%pip install python-dotenv

In [1]:
import os
import geopandas as gpd
import pandas as pd
from fiona.crs import from_epsg
from shapely.geometry import Point
from pymongo import MongoClient
from dotenv import load_dotenv

In [2]:
# Load environment variables (python-dotenv) before any os.getenv() call below.
load_dotenv()
# Move the working directory one level up; later cells build data paths from os.getcwd().
# NOTE(review): the path is relative, so re-running this cell moves up one more level each time.
os.chdir('../')
# Trailing expression: the notebook displays the resulting working directory.
os.getcwd()

'c:\\Users\\11\\Desktop\\LCG\\Programming\\project\\wedrive'

In [3]:
# Census data (SIGUNGU boundary shapefile)
location_label_path = f"{os.getcwd()}/data/행정구역구분/BND_SIGUNGU_PG.shp"

# Column names assigned to the trip data frames
column_name = [
    'uuid',
    'time_begin',
    'time_end',
    'origin_lat',
    'origin_lng',
    'destination_lat',
    'destination_lng',
    'distance',
    'heart',
    'rank',
]

In [None]:
# Read the administrative boundary shapefile (EUC-KR encoded attributes)
# and reproject it to EPSG:4326 in one chained expression.
location_label = gpd.read_file(location_label_path, encoding='euc-kr').to_crs(epsg=4326)

#ts=location_label['SIGUNGU_NM'].value_counts().head(10)
# Trailing expression: display rows 50..99 in the notebook.
location_label.iloc[50:100]

In [5]:
# Preprocessing for the special city and metropolitan cities:
# first two digits of SIGUNGU_CD -> short city name.
metropolitan_city = {
    '11': '서울',
    '21': '부산',
    '22': '대구',
    '23': '인천',
    '24': '광주',
    '25': '대전',
    '26': '울산',
}


def update_dong(row):
    """Return the district name, prefixed with its city when the code is metropolitan.

    Args:
        row: Mapping-like record providing 'SIGUNGU_CD' and 'SIGUNGU_NM'.

    Returns:
        "<city> <SIGUNGU_NM>" when the first two characters of SIGUNGU_CD are a
        key of metropolitan_city, otherwise SIGUNGU_NM unchanged.
    """
    city = metropolitan_city.get(row['SIGUNGU_CD'][:2])
    district = row['SIGUNGU_NM']
    return f"{city} {district}" if city else district

# Load the census region-code table
census_col = [
    '시도코드', '시도명칭',
    '시군구코드', '시군구명칭',
    '읍면동코드', '읍면동명칭',
]
raw_census = pd.read_excel(f"{os.getcwd()}/data/행정구역구분/센서스_공간정보_지역_코드.xlsx")


def make_census_dict(raw_census):
    """Build a {시도코드: 시도명칭} dictionary from the raw census sheet.

    The row labelled 0 is discarded, the remaining cells are relabelled with
    census_col, and repeated 시도명칭 values are collapsed before conversion.

    Args:
        raw_census: DataFrame as returned by pd.read_excel for the code table.

    Returns:
        dict mapping each 시도코드 to its 시도명칭.
    """
    body_rows = raw_census.drop(index=0).values.tolist()
    relabelled = pd.DataFrame(body_rows, columns=census_col)
    names_by_code = relabelled.set_index('시도코드')['시도명칭']
    return names_by_code.drop_duplicates().to_dict()


census_dict = make_census_dict(raw_census)

# Add the region classification column
def Region_col_add(row):
    """Look up the region name for a row's SIGUNGU_CD.

    Args:
        row: Mapping-like record providing 'SIGUNGU_CD'.

    Returns:
        The census_dict entry keyed by the integer value of the first two
        characters of SIGUNGU_CD, or None when there is no such key.
    """
    sido_code = int(row['SIGUNGU_CD'][:2])
    return census_dict.get(sido_code)



In [None]:
# Rewrite SIGUNGU_NM row by row with update_dong (adds the city prefix for metropolitan codes).
# NOTE(review): not idempotent — re-running this cell prefixes already-prefixed names again.
location_label['SIGUNGU_NM'] = location_label.apply(update_dong, axis=1)
# Add the 'Region' column computed row by row with Region_col_add.
location_label['Region'] = location_label.apply(Region_col_add, axis=1)
# Trailing expression: display the updated frame in the notebook.
location_label

In [4]:
# Attach column names to the raw data
def Make_column(csv_file):
    """Read a CSV and return its rows as a DataFrame labelled with column_name.

    pd.read_csv is called with its defaults, so the file's first line is
    consumed as a header; those labels are then replaced by column_name.

    Args:
        csv_file: Path or buffer accepted by pd.read_csv.

    Returns:
        DataFrame holding the CSV rows under the column_name labels.
    """
    rows = pd.read_csv(csv_file).values.tolist()
    return pd.DataFrame(rows, columns=column_name)

# Add the destination area columns to the data frame
def Make_column_destination_loc(csv_file):
    """Load one trip CSV and label every destination with its region and district.

    Args:
        csv_file: CSV path, passed straight to Make_column.

    Returns:
        DataFrame with the columns uuid, destination_lat, destination_lng,
        region and destination_area. Destinations that the spatial join does
        not match to any polygon of location_label end up with missing values
        in the last two columns.
    """
    trips = Make_column(csv_file)

    # Destination points (x = longitude, y = latitude) in the same CRS as location_label.
    destination_points = gpd.GeoDataFrame(
        trips[['destination_lat', 'destination_lng']],
        geometry=gpd.points_from_xy(trips['destination_lng'], trips['destination_lat']),
        crs='EPSG:4326',
    )

    # Map each point to the polygon(s) it falls in.
    matched = gpd.sjoin(destination_points, location_label)
    # A point matching more than one polygon produces several rows that share
    # one index label; keep the first so the column assignments below can
    # align on the index instead of failing on duplicate labels.
    matched = matched[~matched.index.duplicated(keep='first')]

    # Build the final frame; assignment aligns on the original row index.
    trips['destination_area'] = matched['SIGUNGU_NM']
    trips['region'] = matched['Region']
    return trips[['uuid', 'destination_lat', 'destination_lng', 'region', 'destination_area']]


In [None]:
# Single-file check: run Make_column_destination_loc on one CSV and display the result.
filename=os.getcwd()+'/data/od_uuid/2024_1/00a152a698f34dc4b75112faef00b879.csv'
test_dataframe=Make_column_destination_loc(filename)
# Trailing expression: the notebook renders the resulting frame.
test_dataframe

In [5]:
# Create the MongoDB client; address, credentials and auth settings are read
# from environment variables.
client = MongoClient(os.getenv('DB_ADR'),
          username=os.getenv('DB_USER'),
          password=os.getenv('DB_PASSWORD'),
          authSource=os.getenv('DB_AuthSource'),
          authMechanism=os.getenv('DB_AuthMechanism'))

# NOTE(review): the database name is read from the 'DB_Collection' variable —
# confirm that this variable really holds a database name.
db = client.get_database(os.getenv('DB_Collection'))
# Default target collection for the destination-area documents.
cl = db.get_collection('user_coordinate_area_2')

# Convert the data frame rows to dictionaries and insert them into MongoDB
def dataframe_to_dict(dataframe, collection):
    """Insert every row of dataframe into collection as one document per row.

    Args:
        dataframe: DataFrame whose rows are converted with to_dict('records').
        collection: Object exposing insert_many (e.g. a pymongo collection).

    Returns:
        None. Nothing is inserted when the frame has no rows.
    """
    if len(dataframe) == 0:
        return
    records = dataframe.to_dict('records')
    collection.insert_many(records)

In [6]:
# Insert the unmodified contents of every CSV file in a directory into the database
def insert_to_database_raw_data(dir_path, collection):
    """Load each '.csv' file in dir_path with Make_column and insert its rows.

    Args:
        dir_path: Directory to scan (non-recursive, via os.listdir).
        collection: Target collection handed to dataframe_to_dict.
    """
    csv_names = (name for name in os.listdir(dir_path) if name.endswith(".csv"))
    for name in csv_names:
        raw_frame = Make_column(os.path.join(dir_path, name))
        dataframe_to_dict(raw_frame, collection)

# Insert every CSV file in a directory into the database
def insert_to_database(dir_path,collection):
    """Label each '.csv' file in dir_path with destination areas and insert the rows.

    Args:
        dir_path: Directory to scan (non-recursive, via os.listdir).
        collection: Target collection handed to dataframe_to_dict.
    """
    for entry in os.listdir(dir_path):
        if not entry.endswith(".csv"):
            continue
        labelled = Make_column_destination_loc(os.path.join(dir_path, entry))
        dataframe_to_dict(labelled, collection)


In [7]:
# Directory paths
path_directory_2020_1 = f"{os.getcwd()}/data/od_uuid/2020_1/"
path_directory_2020_2 = f"{os.getcwd()}/data/od_uuid/2020_2/"
path_directory_2021_1 = f"{os.getcwd()}/data/od_uuid/2021_1/"
path_directory_2021_2 = f"{os.getcwd()}/data/od_uuid/2021_2/"

# DB collections for the raw data, one per year
cl_raw_2020 = db.get_collection('raw_od_uuid_2020')
cl_raw_2021 = db.get_collection('raw_od_uuid_2021')

# Insert the raw data into the DB, in the same order as the directories are listed
for _raw_dir, _raw_collection in (
    (path_directory_2020_1, cl_raw_2020),
    (path_directory_2020_2, cl_raw_2020),
    (path_directory_2021_1, cl_raw_2021),
    (path_directory_2021_2, cl_raw_2021),
):
    insert_to_database_raw_data(_raw_dir, _raw_collection)


In [None]:
# 도착지 데이터 인서트
#insert_to_database(path_directory)
#insert_to_database(path_directory2)

In [21]:
# Per-user visit frequency survey.
# Rebinds the module-level name `cl` to the collection used for the frequency documents.
cl = db.get_collection('user_area_frequency_2')

def frequency(dataframe):
    """Count how often each (region, destination_area) pair occurs for one user.

    Args:
        dataframe: Frame with the columns 'uuid', 'region' and
            'destination_area'. The uuid of its first row is used for every
            output row.

    Returns:
        DataFrame with the columns uuid, wide-area, basic-unit and frequency,
        one row per distinct (region, destination_area) pair as produced by
        DataFrame.value_counts. An input without rows yields an empty frame
        with those columns.
    """
    result_columns = ['uuid', 'wide-area', 'basic-unit', 'frequency']
    if len(dataframe) == 0:
        return pd.DataFrame(columns=result_columns)

    freq = dataframe[['region', 'destination_area']].value_counts().reset_index()
    freq.columns = ['wide-area', 'basic-unit', 'frequency']
    # Positional access: the first row's index label is not necessarily 0.
    freq['uuid'] = dataframe['uuid'].iloc[0]
    return freq[result_columns]

def insert_freq_to_database(dir_path, collection=None):
    """Insert the per-user area frequencies of every '.csv' file in dir_path.

    Each file is labelled with Make_column_destination_loc, summarised with
    frequency, and the summary rows are inserted through dataframe_to_dict.

    Args:
        dir_path: Directory to scan (non-recursive, via os.listdir).
        collection: Target collection. When omitted, the module-level
            collection `cl` is used.
    """
    for entry in os.listdir(dir_path):
        if not entry.endswith(".csv"):
            continue
        result_df = Make_column_destination_loc(os.path.join(dir_path, entry))
        if len(result_df) == 0:
            continue
        # dataframe_to_dict requires the target collection as second argument;
        # resolve the fallback only when there is something to insert.
        target = cl if collection is None else collection
        dataframe_to_dict(frequency(result_df), target)

In [None]:
#insert_freq_to_database(path_directory)
#insert_freq_to_database(path_directory2)