In [41]:
import numpy as np
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [42]:
df = pd.read_csv('../data/genderage.csv', encoding = 'euckr')
df.head()

Unnamed: 0,축제명,연령대,남성비율,여성비율
0,강경젓갈축제,70세 이상,5.4,4.7
1,강경젓갈축제,60~69세,12.6,11.3
2,강경젓갈축제,50~59세,12.5,12.3
3,강경젓갈축제,40~49세,8.4,7.3
4,강경젓갈축제,30~39세,5.6,4.9


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 688 entries, 0 to 687
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   축제명     688 non-null    object 
 1   연령대     688 non-null    object 
 2   남성비율    688 non-null    float64
 3   여성비율    688 non-null    float64
dtypes: float64(2), object(2)
memory usage: 21.6+ KB


In [44]:
df['연령대 비율'] = df['남성비율'] + df['여성비율']
df_sex = df[['축제명', '남성비율', '여성비율']].groupby(['축제명']).sum()
df = df.drop(['남성비율','여성비율'],axis=1)

In [57]:
df_grouped = df.groupby(['축제명', '연령대'])['연령대 비율'].sum().unstack(fill_value=0)

def categorize_festival(row):
    age_groups = {
        '장년층': row['50~59세'] + row['60~69세'] + row['70세 이상'],
        '가족': row['0~9세'] + row['10~19세'] + row['30~39세'] + row['40~49세'],
        '청년': row['20~29세'] + row['30~39세']}
    
    target = max(age_groups, key=age_groups.get)
    return target

df_grouped['target'] = df_grouped.apply(categorize_festival, axis=1)



In [72]:
df_target = df_grouped[['target']].reset_index()
df_target
df_target.to_csv('../data/연령대별구분.csv', index=False)

In [58]:
df_grouped['target'].value_counts()

target
장년층    51
가족     28
청년      7
Name: count, dtype: int64

In [68]:
df_grouped[['축제명', 'target']]

KeyError: "['축제명'] not in index"

In [48]:
prop_sum = df_sex['남성비율'] + df_sex['여성비율']
df_sex['남성비율'] = df_sex['남성비율'] / prop_sum
df_sex['여성비율'] = df_sex['여성비율'] / prop_sum

df_sex

Unnamed: 0_level_0,남성비율,여성비율
축제명,Unnamed: 1_level_1,Unnamed: 2_level_1
강경젓갈축제,0.518519,0.481481
강릉커피축제,0.483968,0.516032
강진청자축제,0.532533,0.467467
고령대가야축제,0.542458,0.457542
곡성세계장미축제,0.468064,0.531936
...,...,...
한성백제문화제,0.460460,0.539540
함평나비축제,0.511000,0.489000
화성뱃놀이축제,0.598599,0.401401
화천산천어축제,0.572000,0.428000


In [66]:
def categorize_sex(row):
    if row['남성비율'] > 0.55:
        return '남성'
    elif row['여성비율'] > 0.55:
        return '여성'
    else:
        return '중립'

df_sex['target_sex'] = df_sex.apply(categorize_sex, axis=1)

In [67]:
df_sex['target_sex'].value_counts()

target_sex
중립    72
남성    14
Name: count, dtype: int64

In [50]:
df.groupby(['축제명', '연령대'])['연령대 비율'].sum().unstack(fill_value=0)

연령대,0~9세,10~19세,20~29세,30~39세,40~49세,50~59세,60~69세,70세 이상
축제명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
강경젓갈축제,0.8,5.3,8.8,10.5,15.7,24.8,23.9,10.1
강릉커피축제,1.3,7.6,17.8,16.3,17.4,19.6,13.7,6.1
강진청자축제,1.6,6.0,8.6,13.9,20.0,26.4,16.8,6.6
고령대가야축제,1.6,7.7,10.9,13.0,19.2,26.1,15.8,5.8
곡성세계장미축제,0.9,4.9,11.6,13.9,17.3,26.5,18.3,6.8
...,...,...,...,...,...,...,...,...
한성백제문화제,0.6,6.1,20.1,19.8,17.1,17.1,13.6,5.5
함평나비축제,1.7,6.5,9.7,16.0,18.8,23.6,16.6,7.1
화성뱃놀이축제,1.2,4.0,12.8,19.3,20.5,23.1,14.6,4.4
화천산천어축제,1.7,10.2,20.3,15.5,19.5,18.0,10.4,4.4
