In [1]:
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# MySQL connection settings.
# Every value can be overridden through an environment variable so that real
# credentials do not have to be stored in the notebook; the fallbacks are the
# original local-development values.
username = os.environ.get("MYSQL_USER", "root")         # MySQL user name
password = os.environ.get("MYSQL_PASSWORD", "1234")     # MySQL password
host = os.environ.get("MYSQL_HOST", "127.0.0.1")        # server address (localhost by default)
port = int(os.environ.get("MYSQL_PORT", "3306"))        # server port
database = os.environ.get("MYSQL_DATABASE", "ott_db")   # database name

# Create the SQLAlchemy engine (mysql+pymysql URL).
# User name and password are percent-encoded because characters such as
# '@', '/' or ':' would otherwise be parsed as URL delimiters.
engine = create_engine(
    f"mysql+pymysql://{quote(username, safe='')}:{quote(password, safe='')}"
    f"@{host}:{port}/{database}"
)

In [3]:
# Paths of the two raw survey exports (folder path is relative).
folder_path = r"방송통신위원회_해외 OTT 이용행태조사 원시데이터_20231231"
file_2022, file_2023 = (
    os.path.join(folder_path, csv_name)
    for csv_name in (
        "2022해외OTT이용행태조사_국가통합(20240403).csv",
        "2023해외OTT이용행태조사_국가통합(20240403).csv",
    )
)

# Read both CSV files with identical options (BOM-aware UTF-8, single-pass dtype inference).
df_2022, df_2023 = (
    pd.read_csv(csv_path, encoding='utf-8-sig', low_memory=False)
    for csv_path in (file_2022, file_2023)
)

# Report (rows, columns) of each year's frame.
print("2022 데이터:", df_2022.shape)
print("2023 데이터:", df_2023.shape)

2022 데이터: (4536, 1903)
2023 데이터: (6326, 929)


## 1. User
### (1) user_id

In [4]:
# Steps 1-2: user_id = survey year followed by that year's respondent id
# column ('RESPID' in 2022, 'id' in 2023), stored as an integer.
# The column is written onto the source frames because later cells read it from there.
df_2022['user_id'] = ('2022' + df_2022['RESPID'].astype(str)).astype(int)
df_2023['user_id'] = ('2023' + df_2023['id'].astype(str)).astype(int)

# Step 3: stack the ids of both years into a single one-column frame.
df_user_id_2022 = df_2022.loc[:, ['user_id']].copy()
df_user_id_2023 = df_2023.loc[:, ['user_id']].copy()
df_user_id = pd.concat((df_user_id_2022, df_user_id_2023), ignore_index=True)

# Step 4: (re)create the `user` table from those ids; the call is the last
# expression, so its return value is shown as the cell output.
df_user_id.to_sql(
    name="user",
    con=engine,
    if_exists="replace",
    index=False,
)

10862

In [5]:
def add_column_to_user(df_user, df_2022, df_2023, col_2022, col_2023, new_col_name,
                       mapping_2022=None, mapping_2023=None):
    """
    Attach one survey-derived column to the user table.

    The source column is taken from each survey year, renamed to
    ``new_col_name``, optionally recoded through that year's mapping, stacked,
    and left-joined onto ``df_user`` by ``user_id``.

    If ``df_user`` already contains ``new_col_name`` (for example because the
    calling cell is run a second time), the old column is replaced rather than
    duplicated as ``<name>_x`` / ``<name>_y``.

    :param df_user: current user table DataFrame; must contain ``user_id``
    :param df_2022: 2022 survey DataFrame containing ``user_id`` and ``col_2022``
    :param df_2023: 2023 survey DataFrame containing ``user_id`` and ``col_2023``
    :param col_2022: name of the source column in the 2022 data
    :param col_2023: name of the source column in the 2023 data
    :param new_col_name: name of the column to add to the user table
    :param mapping_2022: mapping handed to ``Series.map`` for 2022 values;
        ``None`` (or an empty dict) keeps the raw values
    :param mapping_2023: same as ``mapping_2022`` but for 2023 values
    :return: new DataFrame equal to ``df_user`` plus ``new_col_name``;
        ``df_user`` itself is not modified
    """

    def _year_slice(frame, source_col, mapping):
        # Two-column frame (user_id, new_col_name) for one survey year.
        part = frame[['user_id', source_col]].copy()
        part = part.rename(columns={source_col: new_col_name})
        if mapping:
            # Series.map turns any value without a mapping entry into NaN.
            part[new_col_name] = part[new_col_name].map(mapping)
        return part

    # Stack both years into one lookup table keyed by user_id.
    df_new_col = pd.concat(
        [
            _year_slice(df_2022, col_2022, mapping_2022),
            _year_slice(df_2023, col_2023, mapping_2023),
        ],
        ignore_index=True,
    )

    # Remove a stale copy of the column first so the merge stays idempotent.
    df_base = df_user.drop(columns=[new_col_name], errors='ignore')

    # Left join keeps every row of the user table, matched or not.
    return df_base.merge(df_new_col, on='user_id', how='left')

In [6]:
# Load the current contents of the user table from the database.
df_user = pd.read_sql(
    sql="SELECT * FROM user",
    con=engine,
)

### (2) nation

In [7]:
# Country code -> country name; the code lists differ between the two survey years.
nation_map_2022 = {
    1: '미국',
    2: '대만',
    3: '인도네시아',
}
nation_map_2023 = {
    1: '브라질',
    2: '멕시코',
    3: '영국',
    4: '일본',
}

# Source columns: 'nation' in the 2022 data, 'SQ0' in the 2023 data.
df_user = add_column_to_user(
    df_user, df_2022, df_2023,
    'nation', 'SQ0', 'nation',
    nation_map_2022, nation_map_2023,
)

### (3) age

In [8]:
# Age-bracket code -> label; codes run 1..9 in list order and are shared by both years.
age_map = dict(enumerate(
    [
        '15-19세', '20-24세', '25-29세', '30-34세',
        '35-39세', '40-44세', '45-49세', '50-54세',
        '그 외',
    ],
    start=1,
))

# 'SQ1' is the source column in both survey years.
df_user = add_column_to_user(
    df_user, df_2022, df_2023,
    new_col_name='age',
    col_2022='SQ1', col_2023='SQ1',
    mapping_2022=age_map, mapping_2023=age_map,
)

### (4) gender

In [9]:
# Gender code -> label. The '기타' (other) code differs by year: 98 in 2022, 3 in 2023.
gender_map_2022 = {
    1: '남성',
    2: '여성',
    98: '기타',
}
gender_map_2023 = {
    1: '남성',
    2: '여성',
    3: '기타',
}

# 'SQ2' is the source column in both survey years.
df_user = add_column_to_user(
    df_user, df_2022, df_2023,
    'SQ2', 'SQ2', 'gender',
    gender_map_2022, gender_map_2023,
)

### (5) region

In [10]:
# 2022 region mapping: region code -> region name (without the country prefix).
region_map_2022 = {
    # codes starting with 1
    101: '서부',
    102: '중부',
    103: '남부',
    104: '북동부',
    # codes starting with 2
    201: '서부',
    202: '동부',
    # codes starting with 3
    301: '자바',
    302: '수마트라 위주',
}


def map_region_2022(code):
    """
    Convert a 2022 region code into a "<country> <region>" label.

    Returns None for missing values and for codes that are not in
    ``region_map_2022``. The country prefix is chosen from the first
    character of the integer code ('1', '2' or '3'); any other leading
    character leaves the region name without a prefix.
    """
    if pd.isna(code):  # missing value
        return None

    region_code = int(code)
    region_name = region_map_2022.get(region_code)
    if not region_name:
        return None

    country_prefixes = {'1': '미국 ', '2': '대만 ', '3': '인도네시아 '}
    return country_prefixes.get(str(region_code)[0], '') + region_name

In [11]:
# 2023 region mapping: region code -> region name (without the country prefix).
region_map_2023 = {
    # codes 1-11
    1: '상파울루', 2: '리우데자네이루', 3: '살바도르', 4: '포르탈레자',
    5: '벨루오리존치', 6: '브라질리아', 7: '쿠리치바', 8: '마나우스',
    9: '헤시피', 10: '벨렝', 11: '그 외',
    # codes 12-22
    12: '멕시코시티', 13: '이스타팔라파', 14: '에카테펙데모렐로스', 15: '과달라하라',
    16: '푸에블라', 17: '시우다드후아레스', 18: '티후아나', 19: '레온델로스알다마',
    20: '구스타보아돌포마데로', 21: '사포판', 22: '그 외',
    # codes 23-35
    23: '런던', 24: '버밍엄', 25: '리버풀', 26: '셰필드', 27: '브리스톨',
    28: '레스터', 29: '리즈', 30: '글래스고', 31: '에든버러', 32: '카디프',
    33: '스완지', 34: '뉴포트', 35: '그 외',
    # codes 36-46
    36: '도쿄', 37: '요코하마', 38: '오사카', 39: '나고야', 40: '삿포로',
    41: '고베', 42: '교토', 43: '후쿠오카', 44: '가와사키', 45: '사이타마',
    46: '그 외',
}


def map_region_2023(code):
    """
    Convert a 2023 region code into a "<country> <region>" label.

    Returns None for missing values and for codes that are not in
    ``region_map_2023``. The country prefix is chosen by the code range the
    integer code falls into (1-11, 12-22, 23-35, 36-46).
    """
    if pd.isna(code):
        return None

    region_code = int(code)
    region_name = region_map_2023.get(region_code)
    if not region_name:
        return None

    # (first code, last code, country prefix), both bounds inclusive.
    country_ranges = (
        (1, 11, '브라질 '),
        (12, 22, '멕시코 '),
        (23, 35, '영국 '),
        (36, 46, '일본 '),
    )
    for first_code, last_code, country_prefix in country_ranges:
        if first_code <= region_code <= last_code:
            return country_prefix + region_name
    return region_name

In [12]:
# 2022: take 'SQ3', rename it to 'region' and label it with map_region_2022.
df_region_2022 = (
    df_2022[['user_id', 'SQ3']]
    .rename(columns={'SQ3': 'region'})
    .assign(region=lambda part: part['region'].apply(map_region_2022))
)

# 2023: same steps with map_region_2023.
df_region_2023 = (
    df_2023[['user_id', 'SQ3']]
    .rename(columns={'SQ3': 'region'})
    .assign(region=lambda part: part['region'].apply(map_region_2023))
)

# Stack both years, then left-join the labels onto the user table by user_id.
df_region = pd.concat((df_region_2022, df_region_2023), ignore_index=True)
df_user = df_user.merge(df_region, how='left', on='user_id')

### (6) job

In [13]:
# Job code -> label for 2022.
job_map_2022 = {
    1: '관리/경영직',
    2: '자영업',
    3: '농/임/어/축산업',
    4: '전문직',
    5: '사무직',
    6: '서비스직',
    7: '판매/영업직',
    8: '기술직',
    9: '단순노무직',
    10: '전업주부',
    11: '학생(초/중/고)',
    12: '대학(원)생',
    13: '무직/퇴직',
    98: '기타',
}
# 2023 uses the same labels; only the '기타' code differs (14 instead of 98).
job_map_2023 = {
    (14 if code == 98 else code): label
    for code, label in job_map_2022.items()
}

# 'DQ1' is the source column in both survey years.
df_user = add_column_to_user(
    df_user, df_2022, df_2023,
    'DQ1', 'DQ1', 'job',
    job_map_2022, job_map_2023,
)

### (7) edu

In [14]:
# Education code -> label; codes 1..5 in tuple order, shared by both years.
edu_map = dict(zip(
    range(1, 6),
    (
        '초/중/고/대학생(재학 중)',
        '초졸/중졸/고졸',
        '대졸',
        '대학원 석사과정 또는 졸업(석사)',
        '대학원 박사과정 또는 졸업(박사)',
    ),
))

# 'DQ2' is the source column in both survey years.
df_user = add_column_to_user(
    df_user, df_2022, df_2023,
    new_col_name='edu',
    col_2022='DQ2', col_2023='DQ2',
    mapping_2022=edu_map, mapping_2023=edu_map,
)

### (8) resident_children

In [15]:
# Recode answer 1 -> 1 and answer 2 -> 0
# (presumably yes / no; confirm against the survey codebook).
children_map = {
    1: 1,
    2: 0,
}

# 'SQ9' is the source column in both survey years.
df_user = add_column_to_user(
    df_user, df_2022, df_2023,
    'SQ9', 'SQ9', 'resident_children',
    children_map, children_map,
)

In [16]:
def add_child_age_column(df_user, df_2022, df_2023, new_col_name, col_2022, cols_2023, target_value, convert_value=1):
    """
    Add a child-age indicator column to the user table.

    For each respondent the new column holds ``convert_value`` when the
    age bracket identified by ``target_value`` was selected, otherwise 0.

    * 2022: a single column ``col_2022`` is inspected. A cell counts as
      selected when it holds ``target_value`` or already holds
      ``convert_value`` (kept from the original recode-then-compare logic;
      NOTE(review): confirm which encoding the 2022 export actually uses).
    * 2023: several columns ``cols_2023`` are inspected; the bracket counts
      as selected when any of them holds ``target_value``.

    Single-space strings and NaN are treated as 0 before comparison.

    If ``df_user`` already contains ``new_col_name`` (e.g. the calling cell is
    re-run), the old column is replaced instead of producing ``_x`` / ``_y``
    duplicates.

    :param df_user: current user table DataFrame; must contain ``user_id``
    :param df_2022: 2022 survey DataFrame with ``user_id`` and ``col_2022``
    :param df_2023: 2023 survey DataFrame with ``user_id`` and ``cols_2023``
    :param new_col_name: name of the indicator column to add
    :param col_2022: 2022 source column name
    :param cols_2023: list of 2023 source column names
    :param target_value: answer code that identifies the age bracket
    :param convert_value: value written when the bracket is selected (default 1)
    :return: new DataFrame equal to ``df_user`` plus ``new_col_name``
    """

    def _to_int_codes(values):
        # Blank (' ') and missing cells become 0, then everything is cast to int.
        return values.replace(' ', 0).fillna(0).astype(int)

    def _to_flag(selected):
        # Boolean mask -> convert_value where True, 0 where False.
        return selected.map({True: convert_value, False: 0})

    # 2022: one source column.
    df_2022_part = df_2022[['user_id', col_2022]].copy()
    df_2022_part = df_2022_part.rename(columns={col_2022: new_col_name})
    codes_2022 = _to_int_codes(df_2022_part[new_col_name])
    df_2022_part[new_col_name] = _to_flag(codes_2022.isin([target_value, convert_value]))

    # 2023: several source columns; vectorized "any column equals target_value".
    codes_2023 = _to_int_codes(df_2023[list(cols_2023)])
    df_2023_part = df_2023[['user_id']].copy()
    # Uses convert_value (not a hard-coded 1) so both years produce the same flag value.
    df_2023_part[new_col_name] = _to_flag((codes_2023 == target_value).any(axis=1))

    # Stack both years into one lookup table keyed by user_id.
    df_merge = pd.concat([df_2022_part, df_2023_part], ignore_index=True)

    # Remove a stale copy of the column first so the merge stays idempotent.
    df_base = df_user.drop(columns=[new_col_name], errors='ignore')

    # Left join keeps every row of the user table.
    return df_base.merge(df_merge, on='user_id', how='left')

### (9) child_age_under6

In [17]:
# Add the child_age_under6 indicator: answer code 1, 2022 column 'SQ9_1_1'.
df_user = add_child_age_column(
    df_user, df_2022, df_2023,
    new_col_name='child_age_under6',
    target_value=1,
    convert_value=1,
    col_2022='SQ9_1_1',
    cols_2023=[
        'SQ9_1',
        'SQ9_1_m2',
        'SQ9_1_m3',
        'SQ9_1_m4',
        'SQ9_1_m5',
        'SQ9_1_m6',
    ],
)

### (10) child_age_7_9

In [18]:
# Add the child_age_7_9 indicator: answer code 2, 2022 column 'SQ9_1_2'.
df_user = add_child_age_column(
    df_user, df_2022, df_2023,
    new_col_name='child_age_7_9',
    target_value=2,
    convert_value=1,
    col_2022='SQ9_1_2',
    cols_2023=[
        'SQ9_1',
        'SQ9_1_m2',
        'SQ9_1_m3',
        'SQ9_1_m4',
        'SQ9_1_m5',
        'SQ9_1_m6',
    ],
)

### (11) child_age_10_12

In [19]:
# Add the child_age_10_12 indicator: answer code 3, 2022 column 'SQ9_1_3'.
df_user = add_child_age_column(
    df_user, df_2022, df_2023,
    new_col_name='child_age_10_12',
    target_value=3,
    convert_value=1,
    col_2022='SQ9_1_3',
    cols_2023=[
        'SQ9_1',
        'SQ9_1_m2',
        'SQ9_1_m3',
        'SQ9_1_m4',
        'SQ9_1_m5',
        'SQ9_1_m6',
    ],
)

### (12) child_age_13_15

In [20]:
# Add the child_age_13_15 indicator: answer code 4, 2022 column 'SQ9_1_4'.
df_user = add_child_age_column(
    df_user, df_2022, df_2023,
    new_col_name='child_age_13_15',
    target_value=4,
    convert_value=1,
    col_2022='SQ9_1_4',
    cols_2023=[
        'SQ9_1',
        'SQ9_1_m2',
        'SQ9_1_m3',
        'SQ9_1_m4',
        'SQ9_1_m5',
        'SQ9_1_m6',
    ],
)

### (13) child_age_16_18

In [21]:
# Add the child_age_16_18 indicator: answer code 5, 2022 column 'SQ9_1_5'.
df_user = add_child_age_column(
    df_user, df_2022, df_2023,
    new_col_name='child_age_16_18',
    target_value=5,
    convert_value=1,
    col_2022='SQ9_1_5',
    cols_2023=[
        'SQ9_1',
        'SQ9_1_m2',
        'SQ9_1_m3',
        'SQ9_1_m4',
        'SQ9_1_m5',
        'SQ9_1_m6',
    ],
)

### (14) child_age_over19

In [22]:
# Add the child_age_over19 indicator: answer code 6, 2022 column 'SQ9_1_6'.
df_user = add_child_age_column(
    df_user, df_2022, df_2023,
    new_col_name='child_age_over19',
    target_value=6,
    convert_value=1,
    col_2022='SQ9_1_6',
    cols_2023=[
        'SQ9_1',
        'SQ9_1_m2',
        'SQ9_1_m3',
        'SQ9_1_m4',
        'SQ9_1_m5',
        'SQ9_1_m6',
    ],
)

In [26]:
# Final upload: replace the `user` table in the database with the assembled frame.
df_user.to_sql(
    name="user",
    con=engine,
    if_exists="replace",
    index=False,
)
print("user 테이블 최종 업데이트 완료!")

user 테이블 최종 업데이트 완료!
