In [89]:
import pandas as pd
import numpy as np
from transformers import pipeline

In [2]:
# Split the data into a user table and a sentence table
def split_user_sentence(df):
    """Split ``df`` into user-level columns and sentence-level columns.

    A column is treated as user-level when it holds at most one distinct
    non-null value within every ``user_id`` group; all remaining columns
    (including ``user_id`` itself) are treated as sentence-level.

    Args:
        df: DataFrame that contains a ``user_id`` column.

    Returns:
        A ``(user_df, sentence_df)`` tuple. ``user_df`` holds ``user_id``
        followed by the user-level columns; ``sentence_df`` holds ``user_id``
        and the sentence-level columns. Both keep one row per input row (no
        de-duplication is performed) and the original column order.
    """
    dfa = df.copy()

    # Distinct non-null values per user for every non-key column.
    per_user = dfa.groupby(by='user_id').nunique()

    # Checking each user separately (rather than comparing a grand total with
    # the number of users) keeps a column that varies for one user and is
    # entirely null for another from being mistaken for a user attribute.
    is_constant = (per_user <= 1).all()
    user_col = [col for col in is_constant.index[is_constant] if col != 'user_id']

    sentence_col = [col for col in dfa.columns if col not in user_col]
    user_df = dfa[['user_id', *user_col]]
    sentence_df = dfa[sentence_col]
    return user_df, sentence_df

In [4]:
# Add a deny column to the sentence table
def add_deny2(df):
    """Return a copy of ``df`` with a ``deny`` column based on sentiment.

    Rows where ``is_user == 1`` have their ``sentence`` classified by the
    ``nlptown/bert-base-multilingual-uncased-sentiment`` text-classification
    pipeline. The first character of the predicted label is parsed as an
    integer rating; ``deny`` is 1.0 when that rating is below 3 and 0.0
    otherwise. All other rows get NaN.

    Args:
        df: DataFrame with ``sentence`` and ``is_user`` columns.

    Returns:
        A copy of ``df`` with an added float ``deny`` column.
    """
    dfa = df.copy()
    pipe = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment")

    # Positional boolean mask of the rows to score; missing flags count as False.
    is_user = (dfa['is_user'] == 1).to_numpy(dtype=bool, na_value=False)

    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.x.
    deny = np.full(len(dfa), np.nan)

    sentences = dfa.loc[is_user, 'sentence'].tolist()
    if sentences:
        # One call over the whole list yields one result dict per sentence,
        # in input order.
        results = pipe(sentences)
        deny[is_user] = [float(int(res['label'][0]) < 3) for res in results]

    dfa['deny'] = deny
    return dfa

In [63]:
# Add precision and P@K columns to the user table
def add_precision(df_user, df_sentence, max_k=9):
    """Return a copy of ``df_user`` with precision and P@K columns added.

    Only rows of ``df_sentence`` with ``is_user == 1`` are scored. For each
    user, precision is ``1 - mean(deny)`` over those rows, and
    ``precision_K`` is the same quantity restricted to the user's first K
    such rows, in the row order of ``df_sentence``. When a user has fewer
    than K rows, all of them are used.

    Args:
        df_user: DataFrame with a ``user_id`` column.
        df_sentence: DataFrame with ``user_id``, ``is_user`` and ``deny``
            columns.
        max_k: Largest K for which a ``precision_K`` column is produced
            (columns ``precision_1`` .. ``precision_{max_k}``).

    Returns:
        A copy of ``df_user`` with a ``precision`` column and one
        ``precision_K`` column per K. Users without any scored row get NaN.
    """
    dfu = df_user.copy()

    # Restrict to user-authored rows once, so the overall precision and P@K
    # are computed over the same population.
    user_rows = df_sentence[df_sentence['is_user'] == 1]

    # One pass over the sentences: user_id -> that user's deny flags in order.
    deny_by_user = {
        uid: group.to_numpy(dtype=float)
        for uid, group in user_rows.groupby('user_id', sort=False)['deny']
    }

    def _precision(deny):
        # Groups are never empty, so the division is safe; a NaN flag
        # propagates to the result instead of being silently skipped.
        return 1 - deny.sum() / len(deny)

    def _scores(k=None):
        # k=None slices the whole array, i.e. the overall precision.
        per_user = {uid: _precision(deny[:k]) for uid, deny in deny_by_user.items()}
        # Map through the user_id column so values land on the right rows
        # regardless of what the frame's index is; unknown users become NaN.
        return dfu['user_id'].map(per_user).astype(float)

    # precision
    dfu['precision'] = _scores()
    # P@K
    for k in range(1, max_k + 1):
        dfu[f'precision_{k}'] = _scores(k)

    return dfu