In [None]:
# Libraries
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm
from collections import defaultdict

from sklearn.decomposition import TruncatedSVD, NMF, SparsePCA
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Colab-only setup: mount Google Drive and switch the working directory to the
# project folder so the relative paths used by this notebook
# ('./open/apply_train.csv', './baseline_submit.csv') resolve against it.
from google.colab import drive
drive.mount('/content/drive')

# Working directory before the switch, printed for the cell output.
print(os.getcwd())

# NOTE(review): absolute, account-specific Drive path; adjust when running elsewhere.
os.chdir('/content/drive/MyDrive/Colab Notebooks/추천')
print(os.getcwd())

Mounted at /content/drive
/content
/content/drive/MyDrive/Colab Notebooks/추천


In [None]:
def recall5(answer_df, submission_df):
    """
    Calculate mean recall@5 of a submission against the ground truth.

    The first column of ``answer_df`` is used as the key (primary) column and
    the second as the item (secondary) column; ``submission_df`` must contain
    columns with the same names.

    Parameters:
    - answer_df: DataFrame containing the ground truth
    - submission_df: DataFrame containing the predictions; every key must have
      exactly 5 distinct, non-null predicted items

    Returns:
    - recall: mean over keys of ``hits / min(number of true items, 5)``.
      Keys that appear in ``answer_df`` but not in ``submission_df`` are
      skipped (they are NOT scored as zero). Returns 0.0 when no key can be
      scored at all.

    Raises:
    - ValueError: if some key does not have exactly 5 predictions, if a
      predicted item is null, or if a key has duplicated predictions.
    """

    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    # Select the item column before aggregating: this avoids running
    # DataFrameGroupBy.apply over frames that include the grouping column
    # (deprecated in recent pandas) and is considerably faster.
    predictions_by_key = submission_df.groupby(primary_col)[secondary_col]

    # Check if each primary_col entry has exactly 5 secondary_col predictions
    prediction_counts = predictions_by_key.size()
    if not (prediction_counts == 5).all():
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")

    # Check for NULL values in the predicted secondary_col
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # Check for duplicates in the predicted secondary_col for each primary_col.
    # Nulls were rejected above, so fewer distinct values than rows means that
    # at least one prediction is repeated within that key.
    if (predictions_by_key.nunique() < prediction_counts).any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")

    # Keep only the predictions whose key is present in the answer dataframe
    scored_submission = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]

    # For each primary_col, the (at most) first 5 predicted secondary_col values
    top_5_preds = {
        key: items.head(5).tolist()
        for key, items in scored_submission.groupby(primary_col)[secondary_col]
    }

    # Ground-truth items per key for easy lookup
    true_dict = {
        key: items.tolist()
        for key, items in answer_df.groupby(primary_col)[secondary_col]
    }

    individual_recalls = []
    for key, true_items in true_dict.items():
        predicted_items = top_5_preds.get(key)
        if predicted_items is None:
            # No predictions were submitted for this key: it is left out of the mean.
            continue
        correct_matches = len(set(true_items) & set(predicted_items))
        # For a fair evaluation the denominator (k) is min(len(true_items), 5):
        # a key with fewer than 5 true items can still reach a recall of 1.
        individual_recalls.append(correct_matches / min(len(true_items), 5))

    if not individual_recalls:
        # Nothing to average; avoid np.mean([]) -> nan plus a RuntimeWarning.
        return 0.0

    recall = np.mean(individual_recalls)
    return recall

In [None]:
# Load the application log (resume_seq / recruitment_seq pairs) from a path
# relative to the current working directory.
apply_train_df = pd.read_csv('./open/apply_train.csv')

In [None]:
# Preview the loaded applications.
apply_train_df

Unnamed: 0,resume_seq,recruitment_seq
0,U05833,R03838
1,U06456,R02144
2,U07807,R01877
3,U04842,R02463
4,U08336,R00112
...,...,...
57941,U02270,R03430
57942,U02640,R04987
57943,U08238,R01342
57944,U01296,R06363


In [None]:
# Train / validation split (leave-last-out): for every resume, the last posting in
# its application list is held out for validation and the earlier ones go to training.
# NOTE(review): "last" follows the row order of apply_train_df within each resume;
# whether that order is chronological cannot be told from here.
apply_train_groupby = apply_train_df.groupby('resume_seq')['recruitment_seq'].apply(list)

train, val = [], []
for resume_id, job_ids in apply_train_groupby.items():
    *history, held_out = job_ids
    train.extend([resume_id, job_id] for job_id in history)
    val.append([resume_id, held_out])

In [None]:
# Turn the split lists into DataFrames that share the two column names of the source data.
train = pd.DataFrame(train, columns=['resume_seq', 'recruitment_seq'])
val = pd.DataFrame(val, columns=['resume_seq', 'recruitment_seq'])
# `pred` keeps every application (nothing held out); it feeds the final submission.
pred = apply_train_df.copy()
print(train.shape, val.shape, pred.shape)

(49464, 2) (8482, 2) (57946, 2)


In [None]:
# Build the user-item matrices: 1 if the job seeker applied to the posting, 0 otherwise.
# Pair counts are pivoted into a resume x posting table and capped at 1 so that
# repeated applications to the same posting still count once.
train_user_item_matrix = (
    train.groupby(['resume_seq', 'recruitment_seq'])
    .size()
    .unstack(fill_value=0)
    .clip(upper=1)
)
pred_user_item_matrix = (
    pred.groupby(['resume_seq', 'recruitment_seq'])
    .size()
    .unstack(fill_value=0)
    .clip(upper=1)
)

In [None]:
# Cosine similarity between users (rows of the user-item matrix) and between
# job postings (rows of the transposed matrix), computed once for the
# validation split (train_*) and once for the full data (pred_*).
train_user_similarity = cosine_similarity(train_user_item_matrix)
train_item_similarity = cosine_similarity(train_user_item_matrix.T)

pred_user_similarity = cosine_similarity(pred_user_item_matrix)
pred_item_similarity = cosine_similarity(pred_user_item_matrix.T)

In [None]:
# Sanity check: both similarity matrices should be square (items x items, users x users).
train_item_similarity.shape, train_user_similarity.shape

((6691, 6691), (8482, 8482))

In [None]:
# Compute recommendation scores.
# User-based scores: user similarity times the user-item matrix. Called as
# ndarray.dot(DataFrame); the later cells index this result positionally ([idx]).
# Item-based scores: the user-item matrix times item similarity. Called as
# DataFrame.dot(ndarray); the later cells index this result by label (.loc[user]).
# NOTE(review): keep these exact call forms; the downstream cells rely on them.
train_user_predicted_scores = train_user_similarity.dot(train_user_item_matrix)
train_item_predicted_scores = train_user_item_matrix.dot(train_item_similarity)

pred_user_predicted_scores = pred_user_similarity.dot(pred_user_item_matrix)
pred_item_predicted_scores = pred_user_item_matrix.dot(pred_item_similarity)

In [None]:
# Inspect the 0/1 user-item matrix built from the training split.
train_user_item_matrix

recruitment_seq,R00001,R00002,R00003,R00004,R00005,R00006,R00007,R00008,R00009,R00010,...,R06686,R06687,R06688,R06689,R06690,R06691,R06692,R06693,R06694,R06695
resume_seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U00001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U00002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U00003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U00004,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U00005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U08478,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U08479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U08480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U08481,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Recommend, for every resume, the 5 best-scoring postings it has not applied to yet.
alpha = 0.6  # weight of the item-based score; the user-based score has weight 1
train_recommendations = []

# Extract plain arrays once instead of doing label-based .loc lookups on the wide
# DataFrames for every user. Row order is that of train_user_item_matrix.index,
# which the item-score frame shares because it was produced by DataFrame.dot.
train_item_scores = train_item_predicted_scores.to_numpy()
train_interactions = train_user_item_matrix.to_numpy()
train_job_ids = train_user_item_matrix.columns

# tqdm wraps the index itself (not enumerate(...)) so it knows the total and can
# render a real progress bar instead of a bare iteration counter.
for idx, user in enumerate(tqdm(train_user_item_matrix.index)):
    # Blended score of this user for every posting, ranked from highest to lowest
    blended_scores = train_item_scores[idx] * alpha + train_user_predicted_scores[idx]
    sorted_job_indices = blended_scores.argsort()[::-1]

    # Drop postings the user already applied to, then keep the top 5 of the rest
    not_applied = train_interactions[idx, sorted_job_indices] != 1
    recommended_jobs = train_job_ids[sorted_job_indices[not_applied][:5]]

    for job in recommended_jobs:
        train_recommendations.append([user, job])


0it [00:00, ?it/s]

In [None]:
# Collect the (resume, posting) recommendation pairs into a DataFrame for scoring.
val_prediction = pd.DataFrame(train_recommendations, columns=['resume_seq', 'recruitment_seq'])
val_prediction

Unnamed: 0,resume_seq,recruitment_seq
0,U00001,R03811
1,U00001,R05862
2,U00001,R03777
3,U00001,R04769
4,U00001,R03037
...,...,...
42405,U08482,R04602
42406,U08482,R00473
42407,U08482,R04021
42408,U08482,R05461


In [None]:
# Recall@5 of the recommendations against the held-out application of each resume.
recall5(val,val_prediction)

0.12886111766092903

In [None]:
## Grid search over the blending weight alpha
# Candidate alpha values: 21 evenly spaced points from 0 to 1 inclusive (step 0.05)
alpha_values = np.linspace(0, 1, 21)

best_recall = -1
best_alpha = None

# Loop-invariant data, extracted once rather than once per (alpha, user) pair.
# Row order is that of train_user_item_matrix.index, which the item-score frame
# shares because it was produced by DataFrame.dot.
grid_item_scores = train_item_predicted_scores.to_numpy()
grid_interactions = train_user_item_matrix.to_numpy()
grid_job_ids = train_user_item_matrix.columns
grid_users = train_user_item_matrix.index

for alpha in tqdm(alpha_values):
    train_recommendations = []

    for idx, user in enumerate(grid_users):
        # Blended score of this user for every posting, ranked from highest to lowest
        blended_scores = grid_item_scores[idx] * alpha + train_user_predicted_scores[idx]
        sorted_job_indices = blended_scores.argsort()[::-1]

        # Drop postings the user already applied to, then keep the top 5 of the rest
        not_applied = grid_interactions[idx, sorted_job_indices] != 1
        recommended_jobs = grid_job_ids[sorted_job_indices[not_applied][:5]]

        for job in recommended_jobs:
            train_recommendations.append([user, job])

    # Convert the predictions to a DataFrame
    val_prediction = pd.DataFrame(train_recommendations, columns=['resume_seq', 'recruitment_seq'])

    # Score this alpha on the held-out applications
    recall = recall5(val, val_prediction)
    print(f"alpha: {alpha}, recall: {recall}")

    # Track the best alpha seen so far (strict '>' keeps the first of equal scores)
    if recall > best_recall:
        best_recall = recall
        best_alpha = alpha

print(f"Best alpha: {best_alpha}, Best recall: {best_recall}")


  0%|          | 0/21 [00:00<?, ?it/s]

alpha: 0.0, recall: 0.12473473237444
alpha: 0.05, recall: 0.12520631926432446
alpha: 0.1, recall: 0.1256779061542089
alpha: 0.15000000000000002, recall: 0.1269747701013912
alpha: 0.2, recall: 0.12721056354633342
alpha: 0.25, recall: 0.12744635699127566
alpha: 0.30000000000000004, recall: 0.1279179438811601
alpha: 0.35000000000000003, recall: 0.1280358406036312
alpha: 0.4, recall: 0.12768215043621786
alpha: 0.45, recall: 0.12780004715868898
alpha: 0.5, recall: 0.12780004715868898
alpha: 0.55, recall: 0.12838953077104456
alpha: 0.6000000000000001, recall: 0.12886111766092903
alpha: 0.65, recall: 0.12815373732610233
alpha: 0.7000000000000001, recall: 0.1286253242159868
alpha: 0.75, recall: 0.12838953077104456
alpha: 0.8, recall: 0.12827163404857345
alpha: 0.8500000000000001, recall: 0.1279179438811601
alpha: 0.9, recall: 0.12827163404857345
alpha: 0.9500000000000001, recall: 0.12827163404857345
alpha: 1.0, recall: 0.12874322093845791
Best alpha: 0.6000000000000001, Best recall: 0.12886111

In [None]:
# Final predictions: same blending as on the validation split, but built from the
# matrices that use every application (pred_*).
alpha = 0.6  # weight of the item-based score; the user-based score has weight 1
pred_recommendations = []

# Extract plain arrays once instead of doing label-based .loc lookups on the wide
# DataFrames for every user. Row order is that of pred_user_item_matrix.index,
# which the item-score frame shares because it was produced by DataFrame.dot.
pred_item_scores = pred_item_predicted_scores.to_numpy()
pred_interactions = pred_user_item_matrix.to_numpy()
pred_job_ids = pred_user_item_matrix.columns

# tqdm wraps the index itself (not enumerate(...)) so it knows the total and can
# render a real progress bar instead of a bare iteration counter.
for idx, user in enumerate(tqdm(pred_user_item_matrix.index)):
    # Blended score of this user for every posting, ranked from highest to lowest
    blended_scores = pred_item_scores[idx] * alpha + pred_user_predicted_scores[idx]
    sorted_job_indices = blended_scores.argsort()[::-1]

    # Drop postings the user already applied to, then keep the top 5 of the rest
    not_applied = pred_interactions[idx, sorted_job_indices] != 1
    recommended_jobs = pred_job_ids[sorted_job_indices[not_applied][:5]]

    for job in recommended_jobs:
        pred_recommendations.append([user, job])

0it [00:00, ?it/s]

In [None]:
# Collect the final (resume, posting) recommendation pairs into the submission frame.
top_recommendations = pd.DataFrame(pred_recommendations, columns=['resume_seq', 'recruitment_seq'])
top_recommendations

Unnamed: 0,resume_seq,recruitment_seq
0,U00001,R01528
1,U00001,R03811
2,U00001,R06276
3,U00001,R00165
4,U00001,R03037
...,...,...
42405,U08482,R04602
42406,U08482,R00473
42407,U08482,R00712
42408,U08482,R04021


In [None]:
# Write the submission file to the current working directory, without the row index.
top_recommendations.to_csv('./baseline_submit.csv', index=False)

In [None]:
# Experiment notes: alpha values tried (a score is recorded only for 0.6)
# alpha 0.6  -> 0.1288
# alpha 0.7
# alpha 0.63
# alpha 0.65