In [20]:
import pandas as pd
import numpy as np
import glob
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity



# 데이터 전처리


In [21]:
directory_path = '/Volumes/MGTEC/(KOPIS) 공모전 데이터 추출/19년 하반기_22년(추출일자 230515)/(신규장르) 19년 하반기_22년(230515)/22년/'
excel_files = glob.glob(directory_path + "*.xlsx")
dataframes = []

In [22]:
for file in excel_files:
    df = pd.read_excel(file, usecols=['공연코드', '성별', '연령', '장르명', '공연지역명', '장당금액'])
    dataframes.append(df)

In [23]:
df = pd.concat(dataframes, ignore_index=True)

In [24]:
df = df[df['장당금액'] > 0]

In [25]:
df = df[df['연령'] != 0]

# 추천 모델 생성

In [26]:
categorical_features = ['성별', '장르명', '공연지역명']
numerical_features = ['연령', '장당금액']

In [27]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ]
)

In [28]:
class RecommendationModel:
    def __init__(self, preprocessor, data, target_col='공연코드'):
        self.preprocessor = preprocessor
        self.data = data
        self.target_col = target_col
        self.features = self.data.drop(columns=[self.target_col])

    def recommend(self, user_df, top_n=5):
        # 데이터 전처리
        transformed_data = self.preprocessor.fit_transform(self.features)
        transformed_user = self.preprocessor.transform(user_df)

        # 코사인 유사도 계산
        similarities = cosine_similarity(transformed_user, transformed_data)

        # 유사도가 높은 상위 n개의 인덱스 추출
        top_indices = similarities[0].argsort()[-top_n:][::-1]

        # 추천 공연 코드 추출
        recommendations = self.data.iloc[top_indices][self.target_col]
        return recommendations

# 테스트

In [29]:
recommendation_pipeline = RecommendationModel(preprocessor=preprocessor, data=df)

In [30]:
user_input = {
    '성별': [1],
    '연령': [1955],
    '장르명': ['뮤지컬'],
    '공연지역명': ['인천'],
    '장당금액': [12000]
}

# 사용자 입력을 DataFrame으로 변환
user_df = pd.DataFrame(user_input)

In [31]:
top_5_recommendations = recommendation_pipeline.recommend(user_df, top_n=5)
print("추천 공연 코드:", top_5_recommendations.values)

추천 공연 코드: ['PF307212' 'PF300165' 'PF300165' 'PF307212' 'PF307212']


# 모델 저장

In [33]:
from joblib import dump, load


transformed_data = preprocessor.fit_transform(df.drop(columns=['공연코드']))
performance_codes = df['공연코드'].tolist()


dump(preprocessor, '/Users/lne/Downloads/KOPIS/model/preprocessor.joblib')
dump(transformed_data, '/Users/lne/Downloads/KOPIS/model/transformed_data.joblib')
dump(performance_codes, '/Users/lne/Downloads/KOPIS/model/performance_codes.joblib')

['/Users/lne/Downloads/KOPIS/model/performance_codes.joblib']

# 테스트

In [34]:
from joblib import load
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
preprocessor = load('/Users/lne/Downloads/KOPIS/model/preprocessor.joblib')
transformed_data = load('/Users/lne/Downloads/KOPIS/model/transformed_data.joblib')
performance_codes = load('/Users/lne/Downloads/KOPIS/model/performance_codes.joblib')

In [36]:
# 추천 함수 정의
def recommend(user_df, transformed_data, performance_codes, preprocessor, top_n=5):
    # 새 사용자 데이터 전처리
    transformed_user = preprocessor.transform(user_df)
    
    # 코사인 유사도 계산
    similarities = cosine_similarity(transformed_user, transformed_data)

    # 유사도가 높은 상위 n개의 인덱스 추출
    top_indices = similarities[0].argsort()[-top_n:][::-1]

    # 추천 공연 코드 추출
    recommendations = [performance_codes[i] for i in top_indices]
    recommendations = set(list(recommendations))

    return recommendations

In [37]:
user_input = {
    '성별': [1],
    '연령': [1955],
    '장르명': ['뮤지컬'],
    '공연지역명': ['인천'],
    '장당금액': [12000]
}
user_df = pd.DataFrame(user_input)

# 추천 수행
top_5_recommendations = recommend(user_df, transformed_data, performance_codes, preprocessor, top_n=5)
print("추천 공연 코드:", top_5_recommendations)

추천 공연 코드: {'PF307212', 'PF300165'}
