In [None]:
%pip install lightfm



In [None]:
import numpy as np
import pandas as pd
from scipy.io import mmwrite
import os

# all lightfm imports
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from hyperopt import fmin, hp, tpe, Trials
from sklearn.metrics.pairwise import cosine_similarity

# imports re for text cleaning
import re
from datetime import datetime, timedelta

# we will ignore pandas warning
import warnings
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
os.getcwd()

'/content'

In [None]:
os.chdir('/content/drive/MyDrive/Colab Notebooks/추천')
print(os.getcwd())

/content/drive/MyDrive/Colab Notebooks/추천


In [None]:
def recall5(answer_df, submission_df):    ## 안 바뀐 평가식
    """
    Calculate recall@5 for given dataframes.

    Parameters:
    - answer_df: DataFrame containing the ground truth
    - submission_df: DataFrame containing the predictions

    Returns:
    - recall: Recall@5 value
    """

    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    # Check if each primary_col entry has exactly 5 secondary_col predictions
    prediction_counts = submission_df.groupby(primary_col).size()
    if not all(prediction_counts == 5):
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")


    # Check for NULL values in the predicted secondary_col
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # Check for duplicates in the predicted secondary_col for each primary_col
    duplicated_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].duplicated().any())
    if duplicated_preds.any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")


    # Filter the submission dataframe based on the primary_col present in the answer dataframe
    submission_df = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]

    # For each primary_col, get the top 5 predicted secondary_col values
    top_5_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].head(5).tolist()).to_dict()

    # Convert the answer_df to a dictionary for easier lookup
    true_dict = answer_df.groupby(primary_col).apply(lambda x: x[secondary_col].tolist()).to_dict()


    individual_recalls = []
    for key, val in true_dict.items():
        if key in top_5_preds:
            correct_matches = len(set(true_dict[key]) & set(top_5_preds[key]))
            individual_recall = correct_matches / min(len(val), 5) # 공정한 평가를 가능하게 위하여 분모(k)를 'min(len(val), 5)' 로 설정함
            individual_recalls.append(individual_recall)


    recall = np.mean(individual_recalls)

    return recall

In [None]:
def recall5_(answer_df, submission_df):
    """
    Calculate recall@5 for given dataframes.

    Parameters:
    - answer_df: DataFrame containing the ground truth
    - submission_df: DataFrame containing the predictions

    Returns:
    - recall: Recall@5 value
    """

    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    # Check if each primary_col entry has exactly 5 secondary_col predictions
    prediction_counts = submission_df.groupby(primary_col).size()
    if not all(prediction_counts == 5):
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")


    # Check for NULL values in the predicted secondary_col
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # Check for duplicates in the predicted secondary_col for each primary_col
    duplicated_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].duplicated().any())
    if duplicated_preds.any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")


    # Filter the submission dataframe based on the primary_col present in the answer dataframe
    submission_df = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]

    # For each primary_col, get the top 5 predicted secondary_col values
    top_5_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].head(5).tolist()).to_dict()

    # Convert the answer_df to a dictionary for easier lookup
    true_dict = answer_df.groupby(primary_col).apply(lambda x: x[secondary_col].tolist()).to_dict()


    individual_recalls = []
    for key, val in true_dict.items():
        if key in top_5_preds:
            correct_matches = len(set(true_dict[key]) & set(top_5_preds[key]))
            individual_recall = correct_matches / min(len(val), 5) # 공정한 평가를 가능하게 위하여 분모(k)를 'min(len(val), 5)' 로 설정함
            individual_recalls.append(individual_recall)


    recall = np.mean(individual_recalls)
    return recall

In [None]:
def test_recall_at_5(model, test, item_features):
    """
    Calculate recall@5 for given model and test data.

    Parameters:
    - model: Fitted LightFM model
    - test: Test dataset in COO format

    Returns:
    - recall: Recall@5 value
    """

    # Convert test dataset to DataFrame
    test_df = pd.DataFrame(np.vstack((test.row, test.col)).T, columns=['user_id', 'item_id'])

    # Get top 5 predictions for each user
    top_5_preds = {}
    for user_id in set(test.row):

        # item_features 범위 내에서만 아이템 ID 생성
        # global item_ids, user_idsv
        item_ids = np.arange(item_features.shape[0]) # 변경된 부분
        user_ids = np.full_like(item_ids, user_id)

        scores = model.predict(user_ids, item_ids, item_features=item_features)
        top_items = np.argsort(-scores)[:5]
        top_5_preds[user_id] = top_items

    # Convert to DataFrame for compatibility with recall5 function
    submission_df = pd.DataFrame([(user_id, item_id) for user_id, items in top_5_preds.items() for item_id in items],
                                 columns=['user_id', 'item_id'])

    recall = recall5(test_df, submission_df)
    return recall


In [None]:
# 데이터 로드
df = pd.read_csv('./open/apply_train.csv')

# 사용자와 아이템 목록 추출
users = df['resume_seq'].unique()
items = df['recruitment_seq'].unique()

# 사용자-아이템 상호작용 데이터 생성
user_item_pairs = list(zip(df['resume_seq'], df['recruitment_seq']))

In [None]:
#학습, 검증 분리
train_, val = [], []
apply_train_groupby = df.groupby('resume_seq')['recruitment_seq'].apply(list)
for uid, iids in zip(apply_train_groupby.index.tolist(), apply_train_groupby.values.tolist()):
    for iid in iids[:-1]:
        train_.append([uid,iid])
    val.append([uid, iids[-1]])


val = pd.DataFrame(val, columns=['resume_seq', 'recruitment_seq'])
print(val.shape)

(8482, 2)


In [None]:
# pre-processing

item_meta = pd.read_csv('./open/firm_df.csv')

# Data Load
# ratings_source: build_interactions 재료, list of tuples
# --> [(user1, item1), (user2, item5), ... ]
# item_features_source: build_item_features 재료
# --> [(item1, [feature, feature, ...]), (item2, [feature, feature, ...])]

source = [(df['resume_seq'][i], df['recruitment_seq'][i]) for i in range(len(df))]

# item_features_source
item_meta = item_meta[['recruitment_seq', 'address_seq1', 'address_seq2', 'address_seq3',
       'career_end', 'career_start', 'check_box_keyword', 'education',
       'major_task', 'qualifications', 'company_type_seq',
       'supply_kind', 'employee']]

item_features_source = [(item_meta['recruitment_seq'][i],
                         [item_meta['address_seq1'][i],
                          item_meta['address_seq2'][i],
                          item_meta['address_seq3'][i],
                          item_meta['career_end'][i],
                          item_meta['career_start'][i],
                          item_meta['check_box_keyword'][i],
                          item_meta['education'][i],
                          item_meta['major_task'][i],
                          item_meta['qualifications'][i],
                          item_meta['company_type_seq'][i],
                          item_meta['supply_kind'][i],
                          item_meta['employee'][i]])
                          for i in range(len(item_meta))]

In [None]:
# dataset
dataset = Dataset()
dataset.fit(users=users,
            items=items,
            user_features=None,
            item_features=item_meta[item_meta.columns[1:]].values.flatten())

interactions, weights = dataset.build_interactions(source)
item_features = dataset.build_item_features(item_features_source)

In [None]:
# Save
mmwrite('./data/interactions.mtx', interactions)
mmwrite('data/item_features.mtx', item_features)
mmwrite('data/weights.mtx', weights)

In [None]:
# Split Train, test Data
train, test = cross_validation.random_train_test_split(interactions, test_percentage=0.2, random_state=1227)
train, test = train.tocsr().tocoo(), test.tocsr().tocoo()
train_weights = train.multiply(weights).tocoo()


In [None]:
train.shape

(8482, 6695)

In [None]:
# Define Search Space
trials = Trials()
space = [hp.choice('no_components', range(10, 100, 10)),
         hp.uniform('learning_rate', 0.01, 0.05),
         hp.loguniform('item_alpha', np.log(1e-5), np.log(1e-1)),  # item_alpha 추가
         hp.loguniform('user_alpha', np.log(1e-5), np.log(1e-1)),   # user_alpha 추가
         hp.choice('epochs', [10, 20, 30, 40, 50])  # epochs 추가
         ]

In [None]:
# Define Objective Function
def objective(params):
    no_components, learning_rate, item_alpha, user_alpha, epochs = params

    model = LightFM(no_components=no_components,
                    learning_schedule='adagrad',
                    loss='warp',
                    learning_rate=learning_rate,
                    item_alpha=item_alpha,
                    user_alpha=user_alpha,
                    random_state=1227,)

    model.fit(interactions=train,
              item_features=item_features,
              sample_weight=train_weights,
              epochs=1,
              verbose=False)



    test_recall = test_recall_at_5(model, test, item_features)
    print("no_comp: {}, lrn_rate: {:.4f}, item_alpha: {:.4f}, user_alpha: {:.4f}, epochs: {}, recall: {:.4f}".format(
        no_components, learning_rate, item_alpha, user_alpha, epochs, test_recall))
    output = output = -test_recall

    if np.abs(output+1) < 0.01 or output < -1.0:
        output = 0.0

    return output

In [None]:
best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10, trials=trials)  # 한 에포크의 한 2분?

no_comp: 10, lrn_rate: 0.0275, item_alpha: 0.0000, user_alpha: 0.0000, epochs: 50, recall: 0.0039
no_comp: 80, lrn_rate: 0.0456, item_alpha: 0.0030, user_alpha: 0.0047, epochs: 30, recall: 0.0035
no_comp: 60, lrn_rate: 0.0372, item_alpha: 0.0000, user_alpha: 0.0033, epochs: 10, recall: 0.0030
no_comp: 60, lrn_rate: 0.0203, item_alpha: 0.0001, user_alpha: 0.0000, epochs: 30, recall: 0.0030
no_comp: 50, lrn_rate: 0.0397, item_alpha: 0.0000, user_alpha: 0.0000, epochs: 20, recall: 0.0034
no_comp: 30, lrn_rate: 0.0235, item_alpha: 0.0060, user_alpha: 0.0002, epochs: 50, recall: 0.0020
no_comp: 90, lrn_rate: 0.0140, item_alpha: 0.0000, user_alpha: 0.0000, epochs: 30, recall: 0.0035
no_comp: 40, lrn_rate: 0.0392, item_alpha: 0.0010, user_alpha: 0.0000, epochs: 30, recall: 0.0031
no_comp: 40, lrn_rate: 0.0313, item_alpha: 0.0003, user_alpha: 0.0000, epochs: 10, recall: 0.0039
no_comp: 60, lrn_rate: 0.0293, item_alpha: 0.0002, user_alpha: 0.0004, epochs: 50, recall: 0.0033
100%|██████████| 10/

In [None]:
best_params

{'epochs': 4,
 'item_alpha': 9.442065807792226e-06,
 'learning_rate': 0.027475971524100723,
 'no_components': 0,
 'user_alpha': 2.898109407944226e-05}

In [None]:
no_components = best_params["no_components"]
learning_rate = best_params["learning_rate"]
item_alpha = best_params["item_alpha"]
user_alpha = best_params["user_alpha"]
epochs = best_params["epochs"]


In [None]:
model = LightFM(no_components=10,
                    learning_schedule='adagrad',
                    loss='warp',
                    learning_rate=learning_rate,
                    item_alpha=item_alpha,
                    user_alpha=user_alpha,
                    random_state=1227,)

model.fit(interactions=train,
              item_features=item_features,
              sample_weight=train_weights,
              epochs=epochs,
              verbose=False)

<lightfm.lightfm.LightFM at 0x7fa83e6129e0>

In [None]:
# Find Similar Items
# 행렬 생성
item_biases, item_embeddings = model.get_item_representations(features=item_features) # (6695, ), (6695, 2)

print(item_biases.shape)
print()
print(item_embeddings.shape)

(6695,)

(6695, 10)


In [None]:
item_embeddings

array([[ 4.44513336e-02, -2.18429789e-03,  4.49828198e-03, ...,
         6.86375843e-03, -9.35684610e-03,  5.36422711e-03],
       [ 5.56468666e-02, -1.35462005e-02,  1.00545064e-02, ...,
        -1.44478353e-03, -5.90855628e-03,  4.22152807e-05],
       [-3.24550450e-01,  3.13721687e-01, -1.77732483e-01, ...,
         2.43077755e-01,  3.63536149e-01,  2.37157315e-01],
       ...,
       [-4.01908197e-02,  4.44677211e-02, -7.75670707e-02, ...,
         1.34639936e-02, -2.44458765e-02,  7.80653358e-02],
       [ 6.19975105e-02, -3.54375765e-02,  1.11952871e-01, ...,
        -7.81229362e-02,  1.27100330e-02, -6.23330809e-02],
       [-9.54180509e-02,  1.01831555e-01, -1.56627104e-01, ...,
        -1.03801668e-01, -6.39676079e-02,  1.72569498e-01]], dtype=float32)

In [None]:
user_encoder = LabelEncoder()
df['user_ids'] = user_encoder.fit_transform(df['resume_seq']) + 1
df['item_ids'] = user_encoder.fit_transform(df['recruitment_seq']) + 1
df

Unnamed: 0,resume_seq,recruitment_seq,user_ids,item_ids
0,U05833,R03838,5833,3838
1,U06456,R02144,6456,2144
2,U07807,R01877,7807,1877
3,U04842,R02463,4842,2463
4,U08336,R00112,8336,112
...,...,...,...,...
57941,U02270,R03430,2270,3430
57942,U02640,R04987,2640,4987
57943,U08238,R01342,8238,1342
57944,U01296,R06363,1296,6363


In [None]:
# 행렬 생성
df_matrix = df.groupby(['resume_seq', 'recruitment_seq']).size().unstack()

In [None]:
# 코사인 데이터 프레임 만들기
x = cosine_similarity(item_embeddings)
consine_ = pd.DataFrame(data=x)
consine_.columns = [str(int(col) + 1) for col in consine_.columns]
consine_.index = consine_.index + 1
consine_

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,6686,6687,6688,6689,6690,6691,6692,6693,6694,6695
1,1.000000,0.964094,-0.192534,-0.159194,-0.123543,0.925403,0.163453,-0.232735,0.236295,0.226368,...,-0.079308,0.581633,0.751693,-0.096248,0.523595,0.932489,0.131190,-0.040300,-0.074443,-0.406138
2,0.964094,1.000000,-0.404049,-0.381954,-0.351300,0.952494,0.398932,-0.440695,0.462794,0.453768,...,-0.306539,0.761623,0.629275,0.151128,0.606102,0.936574,-0.101565,-0.261452,0.127250,-0.551002
3,-0.192534,-0.404049,1.000000,0.996961,0.992265,-0.506423,-0.966157,0.996558,-0.965584,-0.967672,...,0.989807,-0.767177,0.069301,-0.830593,-0.693956,-0.440234,0.805528,0.735557,-0.652308,0.570603
4,-0.159194,-0.381954,0.996961,1.000000,0.997916,-0.467113,-0.980220,0.991233,-0.975315,-0.977499,...,0.994870,-0.778333,0.121662,-0.865080,-0.664896,-0.396334,0.831007,0.765931,-0.675403,0.583803
5,-0.123543,-0.351300,0.992265,0.997916,1.000000,-0.433900,-0.982419,0.985280,-0.970975,-0.973836,...,0.998131,-0.765480,0.139887,-0.888952,-0.622388,-0.363969,0.860283,0.788028,-0.717516,0.584594
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6691,0.932489,0.936574,-0.440234,-0.396334,-0.363969,0.993059,0.366589,-0.477364,0.433087,0.426523,...,-0.328533,0.652267,0.716763,0.073178,0.648166,1.000000,-0.091659,-0.191195,0.112181,-0.497411
6692,0.131190,-0.101565,0.805528,0.831007,0.860283,-0.157626,-0.869638,0.800707,-0.797094,-0.801069,...,0.864541,-0.551694,0.314780,-0.874213,-0.151952,-0.091659,1.000000,0.945114,-0.900324,0.666313
6693,-0.040300,-0.261452,0.735557,0.765931,0.788028,-0.267314,-0.845230,0.739129,-0.774955,-0.774449,...,0.776947,-0.640365,0.299308,-0.787060,-0.128585,-0.191195,0.945114,1.000000,-0.809943,0.838090
6694,-0.074443,0.127250,-0.652308,-0.675403,-0.717516,0.147293,0.716326,-0.654278,0.645022,0.651067,...,-0.722522,0.477895,-0.071642,0.820639,0.009207,0.112181,-0.900324,-0.809943,1.000000,-0.513688


In [None]:
# 지원한 공고 제외 데이터 프레임
applied_jobs = df.groupby('user_ids')['item_ids'].apply(list).to_dict()

In [None]:
## 이미 지원한 채용공고 제외하고 추천해야함.  3번  -> 코사인 계산 과정에서 달라진다.  하지만 공고는 달라지지않고 추천만 다랄짐.
def make_best_items_report(item_embeddings, user_id, num_search_items=5):
    # 사용자가 이미 지원한 공고를 가져옵니다.
    applied_jobs = df[df['user_ids'] == user_id]['item_ids'].values.tolist()
    # print(applied_jobs)

    best_items = []
    for item_id in applied_jobs:
        # Cosine similarity
        scores = cosine_similarity([item_embeddings[item_id]], item_embeddings)

        # best: score가 제일 높은 item의 id를 num_search_items 개 만큼 가져온다.
        best = np.argpartition(scores[0], -num_search_items)[-num_search_items:] + 1

        # 사용자가 이미 지원한 공고를 제외합니다.
        best = [item for item in best if item not in applied_jobs]

        best_items += sorted(zip(best, scores[0][best]), key=lambda x: -x[1])  # 목록들이 정렬 됨.

        # 상위 5개의 공고를 반환합니다.
    return best_items[:5]


                                                                  #  이미 지원한 공고에 대해 코사인 유사도를 게산, 그리고 거기서 비슷한것에 상위 5개

print()
report01 = make_best_items_report(item_embeddings, 2, 5)
print(report01)
print()
report02 = make_best_items_report(item_embeddings, 1, 5)
print(report02)


[(5597, 0.46915913), (6323, 0.40776795), (4975, 0.08633523), (878, -0.31195137), (6020, -0.3259887)]

[(3186, 0.9930017), (2528, 0.68410605), (2708, -0.18225953), (6275, -0.9603256), (5289, -0.9633105)]


In [None]:
# submit할 제출 코드 필요.
## 이미 지원한 채용공고 제외하고 추천해야함.

# 사용자 목록을 가져옵니다.
users_number = df['user_ids'].unique()

# 사용자별로 추천된 아이템을 저장할 빈 리스트를 만듭니다.
recommendations = []

# 각 사용자에 대해 추천된 아이템을 구합니다.
for idx, user in enumerate(df_matrix.index):  # idx 와 resume_seq
    # 해당 사용자가 지원한 채용 공고
    user_id = df[df['resume_seq'] == user]["user_ids"].nunique()


    recommended_items = make_best_items_report(item_embeddings, user_id, 5)  # 5개가 나온다.
    for item, score in recommended_items:
        item_id = df[df['item_ids'] == item]["recruitment_seq"]
        recommendations.append((user, item_id.iloc[0]))

# 추천된 아이템을 DataFrame으로 만듭니다.
df_recommendations = pd.DataFrame(recommendations, columns=['resume_seq', 'recruitment_seq'])

# DataFrame을 CSV 파일로 저장합니다.
df_recommendations.to_csv('submission.csv', index=False)


In [None]:
df_recommendations

Unnamed: 0,resume_seq,recruitment_seq
0,U00001,R03186
1,U00001,R02528
2,U00001,R02708
3,U00001,R06275
4,U00001,R05289
...,...,...
42405,U08482,R03186
42406,U08482,R02528
42407,U08482,R02708
42408,U08482,R06275


In [None]:
recall5_(val, df_recommendations)

0.0005894836123555765

In [None]:
42410/8482

5.0