In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# MySQL connection settings.
# Every value can be overridden through an environment variable so that real
# credentials do not have to be committed with the notebook; the fallbacks are
# the original local-development values.
username = os.environ.get("OTT_DB_USER", "root")        # MySQL user name
password = os.environ.get("OTT_DB_PASSWORD", "1234")    # MySQL password
host = os.environ.get("OTT_DB_HOST", "127.0.0.1")       # local host by default
port = int(os.environ.get("OTT_DB_PORT", "3306"))       # MySQL port
database = os.environ.get("OTT_DB_NAME", "ott_db")      # database name

# Create the MySQL engine. User name and password are percent-encoded so that
# characters such as "@", ":" or "/" cannot corrupt the connection URL.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}@{host}:{port}/{database}"
)

In [3]:
# Folder holding the raw survey CSV exports (a relative path).
folder_path = r"방송통신위원회_해외 OTT 이용행태조사 원시데이터_20231231"
file_2022, file_2023 = (
    os.path.join(folder_path, f"{year}해외OTT이용행태조사_국가통합(20240403).csv")
    for year in (2022, 2023)
)

# Read both survey years with identical reader options.
df_2022, df_2023 = (
    pd.read_csv(csv_path, encoding='utf-8-sig', low_memory=False)
    for csv_path in (file_2022, file_2023)
)

# Report the (rows, columns) shape of each year.
for label, frame in (("2022 데이터:", df_2022), ("2023 데이터:", df_2023)):
    print(label, frame.shape)

2022 데이터: (4536, 1903)
2023 데이터: (6326, 929)


## 3. contents
### (1) user_id

In [4]:
# Build a year-prefixed integer key for every respondent: "<year><respondent id>".
# The 2022 file keeps its respondent id in RESPID, the 2023 file in id.
for survey_frame, year_prefix, id_column in (
    (df_2022, '2022', 'RESPID'),
    (df_2023, '2023', 'id'),
):
    survey_frame['user_id'] = (year_prefix + survey_frame[id_column].astype(str)).astype(int)

# Keep only the key column of each year, then stack 2022 on top of 2023.
df_user_id_2022 = df_2022.loc[:, ['user_id']].copy()
df_user_id_2023 = df_2023.loc[:, ['user_id']].copy()
df_user_id = pd.concat([df_user_id_2022, df_user_id_2023], ignore_index=True)

# Seed the contents table with user_id only, replacing any existing table.
df_user_id.to_sql("contents", con=engine, if_exists="replace", index=False)

10862

In [5]:
def _recode_survey_column(df_year, source_col, new_col_name, mapping, default_value):
    """Return a (user_id, new_col_name) frame with one survey year's answers recoded."""
    recoded = df_year[['user_id', source_col]].rename(columns={source_col: new_col_name})
    # Anything that is not a number (e.g. a blank string) becomes NaN here and is
    # replaced by default_value below.
    recoded[new_col_name] = pd.to_numeric(recoded[new_col_name], errors='coerce')
    if mapping:
        # Codes that are missing from the mapping also become NaN -> default_value.
        recoded[new_col_name] = recoded[new_col_name].map(mapping)
    recoded[new_col_name] = recoded[new_col_name].fillna(default_value)
    return recoded


def add_column_to_contents(df_contents, df_2022, df_2023, col_2022, col_2023, new_col_name,
                           mapping_2022=None, mapping_2023=None, default_value='이용안함'):
    """
    Add one recoded survey column to the contents frame.

    The 2022 and 2023 source columns are converted to numbers, optionally
    translated through a code -> label mapping, stacked, and left-joined onto
    ``df_contents`` by ``user_id``.

    Parameters
    ----------
    df_contents : DataFrame with a ``user_id`` column; it is not modified.
    df_2022, df_2023 : survey frames, each with ``user_id`` and its source column.
    col_2022, col_2023 : source column name in the respective survey frame.
    new_col_name : name of the column added to the result.
    mapping_2022, mapping_2023 : optional dict of numeric code -> label. A falsy
        value (``None`` or an empty dict) keeps the numeric codes unchanged.
    default_value : replacement for non-numeric, missing or unmapped answers.

    Returns
    -------
    A new DataFrame equal to ``df_contents`` plus ``new_col_name``. If
    ``df_contents`` already contains ``new_col_name`` (for example because the
    calling cell was run twice) the old column is replaced instead of producing
    ``_x`` / ``_y`` suffixed duplicates.
    """
    df_new_col = pd.concat(
        [
            _recode_survey_column(df_2022, col_2022, new_col_name, mapping_2022, default_value),
            _recode_survey_column(df_2023, col_2023, new_col_name, mapping_2023, default_value),
        ],
        ignore_index=True,
    )

    # Remove a stale copy of the column so the merge stays idempotent on re-run.
    df_contents = df_contents.drop(columns=[new_col_name], errors='ignore')
    return df_contents.merge(df_new_col, on='user_id', how='left')

In [6]:
# Load the current contents table back from MySQL as the frame that the
# following cells extend column by column.
df_contents = pd.read_sql(sql="SELECT * FROM contents", con=engine)

### (1) frequency_series

In [7]:
# Frequency answer codes -> labels; reused by every frequency_* column.
# Codes outside 1-6 are not in the map and therefore receive the default label
# inside add_column_to_contents.
frequency_map = {
    1: "매일",
    2: "1주일에 5~6일",
    3: "1주일에 3~4일",
    4: "1주일에 1~2일",
    5: "한 달에 1~3일",
    6: "2~3달에 1~2일 이하",
}

# frequency_series: D1_1 (2022) / C1_1 (2023) translated through frequency_map.
df_contents = add_column_to_contents(
    df_contents,
    df_2022,
    df_2023,
    new_col_name="frequency_series",
    col_2022="D1_1",
    col_2023="C1_1",
    mapping_2022=frequency_map,
    mapping_2023=frequency_map,
)

### (2) frequency_movie

In [8]:
# frequency_movie: D1_2 (2022) / C1_2 (2023) translated through frequency_map.
df_contents = add_column_to_contents(
    df_contents,
    df_2022,
    df_2023,
    new_col_name="frequency_movie",
    col_2022="D1_2",
    col_2023="C1_2",
    mapping_2022=frequency_map,
    mapping_2023=frequency_map,
)

### (3) frequency_variety

In [9]:
# frequency_variety: D1_3 (2022) / C1_3 (2023) translated through frequency_map.
df_contents = add_column_to_contents(
    df_contents,
    df_2022,
    df_2023,
    new_col_name="frequency_variety",
    col_2022="D1_3",
    col_2023="C1_3",
    mapping_2022=frequency_map,
    mapping_2023=frequency_map,
)

In [10]:
def add_genre_column_2022(df_contents, df_2022, col_2022, new_col_name, target_code, convert_value=1):
    """
    Add a flag column derived from a single 2022 survey column.

    For every 2022 respondent the flag is ``convert_value`` when ``col_2022``
    equals ``target_code``, ``0`` for any other numeric answer, and ``<NA>``
    when the answer is blank, missing or not numeric.

    Parameters
    ----------
    df_contents : DataFrame with a ``user_id`` column; it is not modified.
    df_2022 : 2022 survey frame containing ``user_id`` and ``col_2022``.
    col_2022 : name of the source column.
    new_col_name : name of the flag column added to the result.
    target_code : answer code that counts as a hit.
    convert_value : integer stored for a hit (default 1).

    Returns
    -------
    A new DataFrame: ``df_contents`` left-joined with the nullable-integer flag
    column. Rows without a 2022 match get ``<NA>``. An existing column called
    ``new_col_name`` is replaced rather than duplicated with ``_x`` / ``_y``.
    """
    # Blank strings and other non-numeric text become NaN instead of failing a cast.
    codes = pd.to_numeric(df_2022[col_2022], errors='coerce')

    def _to_flag(code):
        # Missing must be tested first: comparing pd.NA with a number yields pd.NA,
        # whose truth value is ambiguous and raises TypeError.
        if pd.isna(code):
            return pd.NA
        return convert_value if code == target_code else 0

    df_2022_part = pd.DataFrame({
        'user_id': df_2022['user_id'],
        new_col_name: codes.map(_to_flag).astype("Int64"),
    })

    # Remove a stale copy of the column so the merge stays idempotent on re-run.
    df_contents = df_contents.drop(columns=[new_col_name], errors='ignore')
    return df_contents.merge(df_2022_part, on='user_id', how='left')


In [11]:
def add_genre_column_2023(df_contents, df_2023, cols_2023, new_col_name, target_code, convert_value=1):
    """
    Add a flag column derived from several 2023 survey columns.

    For every 2023 respondent the flag is ``convert_value`` when ``target_code``
    occurs in at least one of ``cols_2023`` and ``0`` otherwise (including when
    every listed column is blank or non-numeric).

    Parameters
    ----------
    df_contents : DataFrame with a ``user_id`` column; it is not modified.
    df_2023 : 2023 survey frame containing ``user_id`` and all ``cols_2023``.
    cols_2023 : sequence of source column names to search.
    new_col_name : name of the flag column added to the result.
    target_code : answer code that counts as a hit.
    convert_value : integer stored for a hit (default 1).

    Returns
    -------
    A new DataFrame: ``df_contents`` left-joined with the nullable-integer flag
    column. Rows without a 2023 match get ``<NA>``. An existing column called
    ``new_col_name`` is replaced rather than duplicated with ``_x`` / ``_y``.
    """
    # Column-wise numeric conversion; blanks and other text become NaN.
    answers = df_2023[list(cols_2023)].apply(pd.to_numeric, errors='coerce')

    # NaN never equals target_code, so missing answers cannot produce a hit.
    selected = answers.eq(target_code).any(axis=1)
    flags = selected.map(lambda hit: convert_value if hit else 0).astype("Int64")

    df_2023_part = pd.DataFrame({'user_id': df_2023['user_id'], new_col_name: flags})

    # Remove a stale copy of the column so the merge stays idempotent on re-run.
    df_contents = df_contents.drop(columns=[new_col_name], errors='ignore')
    return df_contents.merge(df_2023_part, on='user_id', how='left')

### (4) genre_series_movie_rom_2022, genre_series_movie_rom_2023

In [12]:
# 2022 flag: D2_1 holds code 1.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=1,
    new_col_name="genre_series_movie_rom_2022",
    col_2022="D2_1",
)

# 2023 flag: code 1 appears in any of C3, C3_m2 ... C3_m13.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=1,
    new_col_name="genre_series_movie_rom_2023",
    cols_2023=["C3"] + [f"C3_m{slot}" for slot in range(2, 14)],
)

### (5) genre_series_movie_fant_2022, genre_series_movie_fant_2023

In [13]:
# 2022 flag: D2_2 holds code 2.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=2,
    new_col_name="genre_series_movie_fant_2022",
    col_2022="D2_2",
)

# 2023 flag: code 2 appears in any of C3, C3_m2 ... C3_m13.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=2,
    new_col_name="genre_series_movie_fant_2023",
    cols_2023=["C3"] + [f"C3_m{slot}" for slot in range(2, 14)],
)

### (6) genre_series_movie_action_2022, genre_series_movie_action_2023

In [14]:
# 2022 flag: D2_3 holds code 3.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=3,
    new_col_name="genre_series_movie_action_2022",
    col_2022="D2_3",
)

# 2023 flag: code 3 appears in any of C3, C3_m2 ... C3_m13.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=3,
    new_col_name="genre_series_movie_action_2023",
    cols_2023=["C3"] + [f"C3_m{slot}" for slot in range(2, 14)],
)

### (7) genre_series_movie_horr_2022, genre_series_movie_horr_2023

In [15]:
# 2022 flag: D2_4 holds code 4.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=4,
    new_col_name="genre_series_movie_horr_2022",
    col_2022="D2_4",
)

# 2023 flag: code 4 appears in any of C3, C3_m2 ... C3_m13.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=4,
    new_col_name="genre_series_movie_horr_2023",
    cols_2023=["C3"] + [f"C3_m{slot}" for slot in range(2, 14)],
)

### (8) genre_series_movie_comedy_2023

In [16]:
# 2023-only flag: code 5 appears in any of C3, C3_m2 ... C3_m13.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=5,
    new_col_name="genre_series_movie_comedy_2023",
    cols_2023=["C3"] + [f"C3_m{slot}" for slot in range(2, 14)],
)

### (9) genre_series_movie_history_2022, genre_series_movie_history_2023

In [17]:
# Note: the two years use different codes for this column (5 in 2022, 6 in 2023).
# 2022 flag: D2_5 holds code 5.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=5,
    new_col_name="genre_series_movie_history_2022",
    col_2022="D2_5",
)

# 2023 flag: code 6 appears in any of C3, C3_m2 ... C3_m13.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=6,
    new_col_name="genre_series_movie_history_2023",
    cols_2023=["C3"] + [f"C3_m{slot}" for slot in range(2, 14)],
)

### (10) genre_series_movie_human_2022, genre_series_movie_human_2023

In [18]:
# Note: the two years use different codes for this column (6 in 2022, 7 in 2023).
# 2022 flag: D2_6 holds code 6.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=6,
    new_col_name="genre_series_movie_human_2022",
    col_2022="D2_6",
)

# 2023 flag: code 7 appears in any of C3, C3_m2 ... C3_m13.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=7,
    new_col_name="genre_series_movie_human_2023",
    cols_2023=["C3"] + [f"C3_m{slot}" for slot in range(2, 14)],
)

### (11) genre_series_movie_docu_2022, genre_series_movie_docu_2023

In [19]:
# Note: the two years use different codes for this column (7 in 2022, 8 in 2023).
# 2022 flag: D2_7 holds code 7.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=7,
    new_col_name="genre_series_movie_docu_2022",
    col_2022="D2_7",
)

# 2023 flag: code 8 appears in any of C3, C3_m2 ... C3_m13.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=8,
    new_col_name="genre_series_movie_docu_2023",
    cols_2023=["C3"] + [f"C3_m{slot}" for slot in range(2, 14)],
)

### (12) genre_series_movie_musical_2022, genre_series_movie_musical_2023

In [20]:
# Note: the two years use different codes for this column (8 in 2022, 9 in 2023).
# 2022 flag: D2_8 holds code 8.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=8,
    new_col_name="genre_series_movie_musical_2022",
    col_2022="D2_8",
)

# 2023 flag: code 9 appears in any of C3, C3_m2 ... C3_m13.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=9,
    new_col_name="genre_series_movie_musical_2023",
    cols_2023=["C3"] + [f"C3_m{slot}" for slot in range(2, 14)],
)

### (13) genre_series_movie_ani_2023

In [21]:
# 2023-only flag: code 10 appears in any of C3, C3_m2 ... C3_m13.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=10,
    new_col_name="genre_series_movie_ani_2023",
    cols_2023=["C3"] + [f"C3_m{slot}" for slot in range(2, 14)],
)

### (14) genre_series_movie_kids_2023

In [22]:
# 2023-only flag: code 11 appears in any of C3, C3_m2 ... C3_m13.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=11,
    new_col_name="genre_series_movie_kids_2023",
    cols_2023=["C3"] + [f"C3_m{slot}" for slot in range(2, 14)],
)

### (15) genre_series_movie_teleno_2023

In [23]:
# 2023-only flag: code 12 appears in any of C3, C3_m2 ... C3_m13.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=12,
    new_col_name="genre_series_movie_teleno_2023",
    cols_2023=["C3"] + [f"C3_m{slot}" for slot in range(2, 14)],
)

### (16) genre_series_movie_etc_2022, genre_series_movie_etc_2023

In [24]:
# Note: the two years use different codes for this column (9 in 2022, 13 in 2023).
# 2022 flag: D2_9 holds code 9.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=9,
    new_col_name="genre_series_movie_etc_2022",
    col_2022="D2_9",
)

# 2023 flag: code 13 appears in any of C3, C3_m2 ... C3_m13.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=13,
    new_col_name="genre_series_movie_etc_2023",
    cols_2023=["C3"] + [f"C3_m{slot}" for slot in range(2, 14)],
)

### (17) genre_variety_game_2022, genre_variety_game_2023

In [25]:
# 2022 flag: D3_1 holds code 1.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=1,
    new_col_name="genre_variety_game_2022",
    col_2022="D3_1",
)

# 2023 flag: code 1 appears in any of C6, C6_m2 ... C6_m14.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=1,
    new_col_name="genre_variety_game_2023",
    cols_2023=["C6"] + [f"C6_m{slot}" for slot in range(2, 15)],
)

### (18) genre_variety_travel_2022, genre_variety_travel_2023

In [26]:
# 2022 flag: D3_2 holds code 2.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=2,
    new_col_name="genre_variety_travel_2022",
    col_2022="D3_2",
)

# 2023 flag: code 2 appears in any of C6, C6_m2 ... C6_m14.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=2,
    new_col_name="genre_variety_travel_2023",
    cols_2023=["C6"] + [f"C6_m{slot}" for slot in range(2, 15)],
)

### (19) genre_variety_observe_2022, genre_variety_observe_2023

In [27]:
# 2022 flag: D3_3 holds code 3.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=3,
    new_col_name="genre_variety_observe_2022",
    col_2022="D3_3",
)

# 2023 flag: code 3 appears in any of C6, C6_m2 ... C6_m14.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=3,
    new_col_name="genre_variety_observe_2023",
    cols_2023=["C6"] + [f"C6_m{slot}" for slot in range(2, 15)],
)

### (20) genre_variety_reality_2022, genre_variety_reality_2023

In [28]:
# 2022 flag: D3_4 holds code 4.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=4,
    new_col_name="genre_variety_reality_2022",
    col_2022="D3_4",
)

# 2023 flag: code 4 appears in any of C6, C6_m2 ... C6_m14.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=4,
    new_col_name="genre_variety_reality_2023",
    cols_2023=["C6"] + [f"C6_m{slot}" for slot in range(2, 15)],
)

### (21) genre_variety_audition_2023

In [29]:
# 2023-only flag: code 5 appears in any of C6, C6_m2 ... C6_m14.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=5,
    new_col_name="genre_variety_audition_2023",
    cols_2023=["C6"] + [f"C6_m{slot}" for slot in range(2, 15)],
)

### (22) genre_variety_survival_2022, genre_variety_survival_2023

In [30]:
# Note: the two years use different codes for this column (5 in 2022, 6 in 2023).
# 2022 flag: D3_5 holds code 5.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=5,
    new_col_name="genre_variety_survival_2022",
    col_2022="D3_5",
)

# 2023 flag: code 6 appears in any of C6, C6_m2 ... C6_m14.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=6,
    new_col_name="genre_variety_survival_2023",
    cols_2023=["C6"] + [f"C6_m{slot}" for slot in range(2, 15)],
)

### (23) genre_variety_culture_2022, genre_variety_culture_2023

In [31]:
# Note: the two years use different codes for this column (6 in 2022, 7 in 2023).
# 2022 flag: D3_6 holds code 6.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=6,
    new_col_name="genre_variety_culture_2022",
    col_2022="D3_6",
)

# 2023 flag: code 7 appears in any of C6, C6_m2 ... C6_m14.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=7,
    new_col_name="genre_variety_culture_2023",
    cols_2023=["C6"] + [f"C6_m{slot}" for slot in range(2, 15)],
)

### (24) genre_variety_counsel_2023

In [32]:
# 2023-only flag: code 8 appears in any of C6, C6_m2 ... C6_m14.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=8,
    new_col_name="genre_variety_counsel_2023",
    cols_2023=["C6"] + [f"C6_m{slot}" for slot in range(2, 15)],
)

### (25) genre_variety_edu_2023

In [33]:
# 2023-only flag: code 9 appears in any of C6, C6_m2 ... C6_m14.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=9,
    new_col_name="genre_variety_edu_2023",
    cols_2023=["C6"] + [f"C6_m{slot}" for slot in range(2, 15)],
)

### (26) genre_variety_talkshow_2023

In [34]:
# 2023-only flag: code 10 appears in any of C6, C6_m2 ... C6_m14.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=10,
    new_col_name="genre_variety_talkshow_2023",
    cols_2023=["C6"] + [f"C6_m{slot}" for slot in range(2, 15)],
)

### (27) genre_variety_quizshow_2023

In [35]:
# 2023-only flag: code 11 appears in any of C6, C6_m2 ... C6_m14.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=11,
    new_col_name="genre_variety_quizshow_2023",
    cols_2023=["C6"] + [f"C6_m{slot}" for slot in range(2, 15)],
)

### (28) genre_variety_varshow_2023

In [36]:
# 2023-only flag: code 12 appears in any of C6, C6_m2 ... C6_m14.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=12,
    new_col_name="genre_variety_varshow_2023",
    cols_2023=["C6"] + [f"C6_m{slot}" for slot in range(2, 15)],
)

### (29) genre_variety_married_2023

In [37]:
# 2023-only flag: code 13 appears in any of C6, C6_m2 ... C6_m14.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=13,
    new_col_name="genre_variety_married_2023",
    cols_2023=["C6"] + [f"C6_m{slot}" for slot in range(2, 15)],
)

### (30) genre_variety_etc_2022, genre_variety_etc_2023

In [38]:
# Note: the two years use different codes for this column (7 in 2022, 14 in 2023).
# 2022 flag: D3_7 holds code 7.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=7,
    new_col_name="genre_variety_etc_2022",
    col_2022="D3_7",
)

# 2023 flag: code 14 appears in any of C6, C6_m2 ... C6_m14.
df_contents = add_genre_column_2023(
    df_contents,
    df_2023,
    target_code=14,
    new_col_name="genre_variety_etc_2023",
    cols_2023=["C6"] + [f"C6_m{slot}" for slot in range(2, 15)],
)

### (31) recog_route_recom_2022

In [39]:
# 2022 flag: D5_1 holds code 1.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=1,
    new_col_name="recog_route_recom_2022",
    col_2022="D5_1",
)

### (32) recog_route_media_2022

In [40]:
# 2022 flag: D5_2 holds code 2.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=2,
    new_col_name="recog_route_media_2022",
    col_2022="D5_2",
)

### (33) recog_route_fav_actor_2022

In [41]:
# 2022 flag: D5_3 holds code 3.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=3,
    new_col_name="recog_route_fav_actor_2022",
    col_2022="D5_3",
)

### (34) recog_route_fav_singer_2022

In [42]:
# 2022 flag: D5_4 holds code 4.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=4,
    new_col_name="recog_route_fav_singer_2022",
    col_2022="D5_4",
)

### (35) recog_route_celeb_recom_2022

In [43]:
# 2022 flag: D5_5 holds code 5.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=5,
    new_col_name="recog_route_celeb_recom_2022",
    col_2022="D5_5",
)

### (36) recog_route_algorithm_2022

In [44]:
# 2022 flag: D5_6 holds code 6.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=6,
    new_col_name="recog_route_algorithm_2022",
    col_2022="D5_6",
)

### (37) recog_route_search_2022

In [45]:
# 2022 flag: D5_7 holds code 7.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=7,
    new_col_name="recog_route_search_2022",
    col_2022="D5_7",
)

### (38) recog_route_etc_2022

In [46]:
# 2022 flag: D5_8 holds code 8.
df_contents = add_genre_column_2022(
    df_contents,
    df_2022,
    target_code=8,
    new_col_name="recog_route_etc_2022",
    col_2022="D5_8",
)

### (39) recog_route_2023

In [47]:
# Mapping from the 2023 C8 answer code to its label.
recog_map_2023 = {
    1: '주변 지인의 추천/후기',
    2: '기사, 광고, SNS 등 미디어 노출',
    3: '좋아하는 배우가 출연한 콘텐츠',
    4: '좋아하는 가수의 배우 데뷔',
    5: '좋아하는 연예인의 추천',
    6: 'OTT 서비스 알고리즘 추천',
    7: '장르, 출연진, 연출진 등을 검색하다가',
    8: '기타'
}

# Build (user_id, recog_route_2023) from C8. Blank or non-numeric answers are
# coerced to NaN, and codes missing from the mapping also become NaN.
df_recog = pd.DataFrame({
    'user_id': df_2023['user_id'],
    'recog_route_2023': pd.to_numeric(df_2023['C8'], errors='coerce').map(recog_map_2023),
})

# Left-join onto contents. A copy of the column left behind by an earlier run of
# this cell is dropped first; otherwise merge would emit recog_route_2023_x / _y.
df_contents = (
    df_contents
    .drop(columns=['recog_route_2023'], errors='ignore')
    .merge(df_recog, on='user_id', how='left')
)

### (40) watch_recommendation

In [48]:
def _yesno_to_binary(df_year, source_col, new_col_name):
    """Return a (user_id, new_col_name) frame with answer code 1 -> 1 and 2 -> 0."""
    # Blank strings and other non-numeric text are coerced to NaN.
    codes = pd.to_numeric(df_year[source_col], errors='coerce')
    return pd.DataFrame({
        'user_id': df_year['user_id'],
        # Any code other than 1 or 2 is absent from the dict and therefore NaN.
        new_col_name: codes.map({1: 1, 2: 0}),
    })


def add_binary_column_from_yesno(df_contents, df_2022, df_2023,
                                 col_2022, col_2023, new_col_name):
    """
    Add a column recoded as yes = 1 / no = 0 to the contents frame.

    In both survey years answer code 1 is stored as 1 and code 2 as 0; blank,
    non-numeric or any other code is stored as missing.

    - df_contents: DataFrame with a ``user_id`` column; it is not modified.
    - df_2022, df_2023: survey frames with ``user_id`` and their source column.
    - col_2022: source column name in the 2022 frame.
    - col_2023: source column name in the 2023 frame.
    - new_col_name: name of the column added to the result.

    Returns a new DataFrame: ``df_contents`` left-joined with the stacked
    2022 + 2023 values. An existing column called ``new_col_name`` is replaced
    rather than duplicated with ``_x`` / ``_y`` suffixes, so the calling cell
    can be re-run safely.
    """
    df_merge = pd.concat(
        [
            _yesno_to_binary(df_2022, col_2022, new_col_name),
            _yesno_to_binary(df_2023, col_2023, new_col_name),
        ],
        ignore_index=True,
    )

    # Remove a stale copy of the column so the merge stays idempotent on re-run.
    df_contents = df_contents.drop(columns=[new_col_name], errors='ignore')
    return df_contents.merge(df_merge, on='user_id', how='left')

In [49]:
# watch_recommendation: D6_1 (2022) / C9 (2023), answer 1 -> 1 and 2 -> 0.
df_contents = add_binary_column_from_yesno(
    df_contents, df_2022, df_2023,
    new_col_name="watch_recommendation",
    col_2022="D6_1",
    col_2023="C9",
)

### (41) watch_search

In [50]:
# watch_search: D6_2 (2022) / C9_n2 (2023), answer 1 -> 1 and 2 -> 0.
df_contents = add_binary_column_from_yesno(
    df_contents, df_2022, df_2023,
    new_col_name="watch_search",
    col_2022="D6_2",
    col_2023="C9_n2",
)

### (42) watch_original

In [51]:
# watch_original: D6_3 (2022) / C9_n3 (2023), answer 1 -> 1 and 2 -> 0.
df_contents = add_binary_column_from_yesno(
    df_contents, df_2022, df_2023,
    new_col_name="watch_original",
    col_2022="D6_3",
    col_2023="C9_n3",
)

### (43) watch_full_single

In [52]:
# watch_full_single: D6_4 (2022) / C9_n4 (2023), answer 1 -> 1 and 2 -> 0.
df_contents = add_binary_column_from_yesno(
    df_contents, df_2022, df_2023,
    new_col_name="watch_full_single",
    col_2022="D6_4",
    col_2023="C9_n4",
)

### (44) watch_binge

In [53]:
# watch_binge: D6_5 (2022) / C9_n5 (2023), answer 1 -> 1 and 2 -> 0.
df_contents = add_binary_column_from_yesno(
    df_contents, df_2022, df_2023,
    new_col_name="watch_binge",
    col_2022="D6_5",
    col_2023="C9_n5",
)

### (45) watch_unsubscribe

In [54]:
# watch_unsubscribe: D6_6 (2022) / C9_n6 (2023), answer 1 -> 1 and 2 -> 0.
df_contents = add_binary_column_from_yesno(
    df_contents, df_2022, df_2023,
    new_col_name="watch_unsubscribe",
    col_2022="D6_6",
    col_2023="C9_n6",
)

### (46) ott_live

In [55]:
# ott_live: D7 (2022) / C10 (2023). No mapping is passed, so numeric codes are
# kept as they are and missing / non-numeric answers get the default label.
df_contents = add_column_to_contents(
    df_contents, df_2022, df_2023,
    new_col_name="ott_live",
    col_2022="D7",
    col_2023="C10",
)

### (47) frequency_music

In [56]:
# frequency_music: D10_4 (2022) / C13_1 (2023) translated through frequency_map.
df_contents = add_column_to_contents(
    df_contents, df_2022, df_2023,
    new_col_name="frequency_music",
    col_2022="D10_4",
    col_2023="C13_1",
    mapping_2022=frequency_map,
    mapping_2023=frequency_map,
)

### (48) frequency_webtoon

In [57]:
# frequency_webtoon: D10_7 (2022) / C13_2 (2023) translated through frequency_map.
df_contents = add_column_to_contents(
    df_contents, df_2022, df_2023,
    new_col_name="frequency_webtoon",
    col_2022="D10_7",
    col_2023="C13_2",
    mapping_2022=frequency_map,
    mapping_2023=frequency_map,
)

### (49) frequency_webnovel

In [58]:
# frequency_webnovel: D10_6 (2022) / C13_3 (2023) translated through frequency_map.
df_contents = add_column_to_contents(
    df_contents, df_2022, df_2023,
    new_col_name="frequency_webnovel",
    col_2022="D10_6",
    col_2023="C13_3",
    mapping_2022=frequency_map,
    mapping_2023=frequency_map,
)

### (50) viewed_series

In [59]:
def add_viewed_column(df_contents, df_2022, df_2023,
                      col_2022, cols_2023, new_col_name,
                      target_code, convert_value=1):
    """
    Add a "viewed" flag column built from both survey years.

    2022: the flag is ``convert_value`` when ``col_2022`` equals ``target_code``,
    ``0`` for any other numeric answer, and ``<NA>`` when the answer is blank,
    missing or not numeric.
    2023: the flag is ``convert_value`` when ``target_code`` occurs in at least
    one of ``cols_2023`` and ``0`` otherwise.

    Parameters
    ----------
    df_contents : DataFrame with a ``user_id`` column; it is not modified.
    df_2022 : 2022 survey frame containing ``user_id`` and ``col_2022``.
    df_2023 : 2023 survey frame containing ``user_id`` and all ``cols_2023``.
    col_2022 : name of the 2022 source column.
    cols_2023 : sequence of 2023 source column names to search.
    new_col_name : name of the flag column added to the result.
    target_code : answer code that counts as a hit.
    convert_value : integer stored for a hit (default 1).

    Returns
    -------
    A new DataFrame: ``df_contents`` left-joined with the stacked flags as a
    nullable-integer column. An existing column called ``new_col_name`` is
    replaced rather than duplicated with ``_x`` / ``_y`` suffixes.
    """
    # --- 2022: single source column ------------------------------------------
    # Convert the whole column at once; blanks and other text become NaN.
    codes_2022 = pd.to_numeric(df_2022[col_2022], errors='coerce')

    def _to_flag(code):
        # Test for missing first so that NaN / NA never reaches the comparison.
        if pd.isna(code):
            return pd.NA
        return convert_value if code == target_code else 0

    df_2022_part = pd.DataFrame({
        'user_id': df_2022['user_id'],
        new_col_name: codes_2022.map(_to_flag).astype("Int64"),
    })

    # --- 2023: several source columns -----------------------------------------
    answers_2023 = df_2023[list(cols_2023)].apply(pd.to_numeric, errors='coerce')
    # NaN never equals target_code, so missing answers cannot produce a hit.
    selected = answers_2023.eq(target_code).any(axis=1)
    df_2023_part = pd.DataFrame({
        'user_id': df_2023['user_id'],
        new_col_name: selected.map(lambda hit: convert_value if hit else 0).astype("Int64"),
    })

    # --- combine and join ------------------------------------------------------
    df_merge = pd.concat([df_2022_part, df_2023_part], ignore_index=True)
    # Remove a stale copy of the column so the merge stays idempotent on re-run.
    df_contents = df_contents.drop(columns=[new_col_name], errors='ignore')
    return df_contents.merge(df_merge, on='user_id', how='left')


In [60]:
# viewed_series: 2022 SQ7_1 holds code 1 / 2023 code 1 appears in SQ7, SQ7_m2 ... SQ7_m8.
df_contents = add_viewed_column(
    df_contents, df_2022, df_2023,
    target_code=1,
    new_col_name="viewed_series",
    col_2022="SQ7_1",
    cols_2023=["SQ7"] + [f"SQ7_m{slot}" for slot in range(2, 9)],
)

### (51) viewed_movie

In [61]:
# viewed_movie: 2022 SQ7_2 holds code 2 / 2023 code 2 appears in SQ7, SQ7_m2 ... SQ7_m8.
df_contents = add_viewed_column(
    df_contents, df_2022, df_2023,
    target_code=2,
    new_col_name="viewed_movie",
    col_2022="SQ7_2",
    cols_2023=["SQ7"] + [f"SQ7_m{slot}" for slot in range(2, 9)],
)

### (52) viewed_variety

In [62]:
# viewed_variety: 2022 SQ7_3 holds code 3 / 2023 code 3 appears in SQ7, SQ7_m2 ... SQ7_m8.
df_contents = add_viewed_column(
    df_contents, df_2022, df_2023,
    target_code=3,
    new_col_name="viewed_variety",
    col_2022="SQ7_3",
    cols_2023=["SQ7"] + [f"SQ7_m{slot}" for slot in range(2, 9)],
)

### (53) viewed_music

In [63]:
# viewed_music: 2022 SQ7_4 holds code 4 / 2023 code 4 appears in SQ7, SQ7_m2 ... SQ7_m8.
df_contents = add_viewed_column(
    df_contents, df_2022, df_2023,
    target_code=4,
    new_col_name="viewed_music",
    col_2022="SQ7_4",
    cols_2023=["SQ7"] + [f"SQ7_m{slot}" for slot in range(2, 9)],
)

### (54) viewed_animation

In [64]:
# viewed_animation: 2022 SQ7_5 holds code 5 / 2023 code 5 appears in SQ7, SQ7_m2 ... SQ7_m8.
df_contents = add_viewed_column(
    df_contents, df_2022, df_2023,
    target_code=5,
    new_col_name="viewed_animation",
    col_2022="SQ7_5",
    cols_2023=["SQ7"] + [f"SQ7_m{slot}" for slot in range(2, 9)],
)

### (55) viewed_webtoon

In [65]:
# viewed_webtoon: 2022 SQ7_6 holds code 6 / 2023 code 6 appears in SQ7, SQ7_m2 ... SQ7_m8.
df_contents = add_viewed_column(
    df_contents, df_2022, df_2023,
    target_code=6,
    new_col_name="viewed_webtoon",
    col_2022="SQ7_6",
    cols_2023=["SQ7"] + [f"SQ7_m{slot}" for slot in range(2, 9)],
)

### (56) viewed_webnovel

In [66]:
# viewed_webnovel: 2022 SQ7_7 holds code 7 / 2023 code 7 appears in SQ7, SQ7_m2 ... SQ7_m8.
df_contents = add_viewed_column(
    df_contents, df_2022, df_2023,
    target_code=7,
    new_col_name="viewed_webnovel",
    col_2022="SQ7_7",
    cols_2023=["SQ7"] + [f"SQ7_m{slot}" for slot in range(2, 9)],
)

### (57) viewed_game

In [67]:
# viewed_game: 2022 SQ7_8 holds code 8 / 2023 code 8 appears in SQ7, SQ7_m2 ... SQ7_m8.
df_contents = add_viewed_column(
    df_contents, df_2022, df_2023,
    target_code=8,
    new_col_name="viewed_game",
    col_2022="SQ7_8",
    cols_2023=["SQ7"] + [f"SQ7_m{slot}" for slot in range(2, 9)],
)

In [68]:
# Preview the assembled contents frame; the notebook renders a truncated view
# (first/last rows and columns) rather than the whole table.
df_contents

Unnamed: 0,user_id,frequency_series,frequency_movie,frequency_variety,genre_series_movie_rom_2022,genre_series_movie_rom_2023,genre_series_movie_fant_2022,genre_series_movie_fant_2023,genre_series_movie_action_2022,genre_series_movie_action_2023,...,frequency_webtoon,frequency_webnovel,viewed_series,viewed_movie,viewed_variety,viewed_music,viewed_animation,viewed_webtoon,viewed_webnovel,viewed_game
0,20228,1주일에 3~4일,1주일에 3~4일,이용안함,0,,0,,1,,...,이용안함,이용안함,1,1,0,0,0,0,0,0
1,202211,1주일에 3~4일,1주일에 3~4일,1주일에 1~2일,1,,0,,1,,...,이용안함,이용안함,1,1,1,1,0,0,0,0
2,202212,매일,1주일에 3~4일,매일,1,,0,,0,,...,이용안함,이용안함,1,1,1,1,1,0,0,0
3,202213,1주일에 1~2일,1주일에 1~2일,2~3달에 1~2일 이하,1,,1,,1,,...,이용안함,이용안함,1,1,1,0,0,0,0,0
4,202215,1주일에 3~4일,1주일에 1~2일,1주일에 5~6일,1,,0,,1,,...,1주일에 3~4일,1주일에 1~2일,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10857,202310298,매일,매일,매일,,0,,1,,0,...,1주일에 5~6일,매일,1,1,1,1,1,1,1,0
10858,202310299,한 달에 1~3일,1주일에 1~2일,이용안함,,0,,0,,0,...,이용안함,이용안함,1,1,0,1,0,0,0,0
10859,202310300,매일,1주일에 5~6일,1주일에 5~6일,,1,,1,,0,...,1주일에 3~4일,1주일에 5~6일,1,1,1,1,1,1,1,0
10860,202310301,1주일에 1~2일,1주일에 1~2일,이용안함,,1,,0,,1,...,이용안함,이용안함,1,1,0,1,0,0,0,0


In [69]:
# Final upload: overwrite the contents table in MySQL with the assembled frame
# (the DataFrame index is not written).
df_contents.to_sql(
    "contents",
    con=engine,
    if_exists="replace",
    index=False,
)
print("contents 테이블 최종 업데이트 완료!")

contents 테이블 최종 업데이트 완료!
