In [61]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from collections import Counter
import seaborn as sns
from scipy import sparse

In [3]:
course_chapter_items_df = pd.read_csv('./data/course_chapter_items.csv')
course_df = pd.read_csv('./data/courses.csv')
users_df = pd.read_csv('./data/users.csv')
subgroups_df = pd.read_csv('./data/subgroups.csv')

train_group_df = pd.read_csv('./data/train_group.csv')
test_seen_group_df = pd.read_csv('./data/test_seen_group.csv')
val_seen_group_df = pd.read_csv('./data/val_seen_group.csv')
test_unseen_group_df = pd.read_csv('./data/test_unseen_group.csv')
val_unseen_group_df = pd.read_csv('./data/val_unseen_group.csv')

train_df = pd.read_csv('./data/train.csv')
test_seen_df = pd.read_csv('./data/test_seen.csv')
val_seen_df = pd.read_csv('./data/val_seen.csv')
test_unseen_df = pd.read_csv('./data/test_unseen.csv')
val_unseen_df = pd.read_csv('./data/val_unseen.csv')

In [4]:
def check_row_number_with_unique_amount(df, df_name, column):
    nrow = df.shape[0]
    nunique = pd.unique(df.loc[:,column]).shape[0]
    msg = f'dataframe {df_name} row number != unique {column} amount'
    assert nrow == nunique, msg
    print(f'dataframe {df_name} row number == unique {column} amount ({nrow} == {nunique})')

def check_unique_items_are_same_set(df1, df1_name, df1_column, df2, df2_name, df2_column):
    df1_set = set(df1.loc[:,df1_column].to_list())
    df2_set = set(df2.loc[:,df2_column].to_list())
    diff = df1_set.symmetric_difference(df2_set)
    msg = f'unique {df1_name}.{df1_column} & unique {df2_name}.{df2_column} are different'
    assert len(diff) == 0, msg
    print(f'unique {df1_name}.{df1_column} & unique {df2_name}.{df2_column} are the same')


## Use LightFM model dataset

### build json file in format("user_id", "subgroup")

In [146]:
# users_df -> user_id, interest
# train_group -> user_id, subgroup

##### build {user_id : interest}

In [147]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130566 entries, 0 to 130565
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   user_id            130566 non-null  object
 1   gender             85371 non-null   object
 2   occupation_titles  29056 non-null   object
 3   interests          82756 non-null   object
 4   recreation_names   31935 non-null   object
dtypes: object(5)
memory usage: 5.0+ MB


In [5]:
subgroups2idx = {}
for (id, name) in zip(subgroups_df["subgroup_id"], subgroups_df["subgroup_name"]):
    subgroups2idx.update({name: id})

subgroups2idx

{'更多生活品味': 1,
 '護膚保養與化妝': 2,
 '平面設計': 3,
 '繪畫與插畫': 4,
 '電腦繪圖': 5,
 '應用設計': 6,
 '求職': 7,
 '英文': 8,
 '手作小物': 9,
 'DJ': 10,
 '更多音樂': 11,
 '更多藝術': 12,
 '烹飪料理與甜點': 13,
 '壓力舒緩': 14,
 '運動': 15,
 '親子教育': 16,
 '手工印刷': 17,
 '手工書': 18,
 '動態攝影': 19,
 '素描': 20,
 '樂器': 21,
 '色彩學': 22,
 '字體設計': 23,
 '手寫字': 24,
 '動態設計': 25,
 '音樂理論': 26,
 '刺繡': 27,
 '日文': 28,
 '心靈成長與教育': 29,
 '音樂創作': 30,
 '氣球': 31,
 '程式入門': 32,
 '程式語言': 33,
 '網頁前端': 34,
 '文書處理': 35,
 '角色設計': 36,
 '資料彙整': 37,
 '介面設計': 38,
 '網頁設計': 39,
 '商業攝影': 40,
 '網站架設': 41,
 '更多程式': 42,
 '程式思維': 43,
 '手機程式開發': 44,
 '韓文': 45,
 '更多手作': 46,
 '更多語言': 47,
 '歐洲語言': 48,
 '人聲': 49,
 '個人品牌經營': 50,
 '更多職場技能': 51,
 '網頁後端': 52,
 '後製剪輯': 53,
 '產品設計': 54,
 '靈性發展': 55,
 '影視創作': 56,
 '資料科學': 57,
 '軟體程式開發與維護': 58,
 '職場溝通': 59,
 '表演藝術': 60,
 '創業': 61,
 '文學': 62,
 '區塊鏈': 63,
 '金融商品': 64,
 '文案': 65,
 '數位行銷': 66,
 '設計理論': 67,
 'AI 人工智慧': 68,
 '投資觀念': 69,
 '理財': 70,
 '社會科學': 71,
 '社群行銷': 72,
 '影像創作': 73,
 '遊戲開發': 74,
 '程式理財': 75,
 '量化交易': 76,
 '更多設計': 77,
 '獨立接案': 78,
 '寵

In [6]:
df = users_df.fillna("None")
interest_df = df["interests"]
userid_df = df["user_id"]

users_list = []
interest_list = []

for id, interest in zip(userid_df, interest_df):
    users_list.append(id)
    interest_list.extend(interest.split(', '))

In [7]:
user_info = dict(zip(users_list, interest_list))

In [153]:
user_info["5f5397b640c5be2025738189"]

'None'

In [8]:
investment = [69, 70, 75, 76, 87]
crypto_coin = [63, 64, 66]
info = []
a = {}
for key in user_info.keys():
    
    if user_info[key] == "None" or user_info[key] == 0:
        a[key] = [0]
        info.append(a)
        continue
    
    interests = user_info[key].split(",")
    name_list = []
    for interest in interests:
        interest_name = interest.split("_")[1]
        if interest_name == "篆刻":
            continue
        if interest_name == "西班牙文":
            name_list.append(47)
            name_list.append(48)
            continue
        if interest_name == "更多投資理財":
            # name_list.append(id for id in investment)
            name_list.append(69)
            name_list.append(70)
            name_list.append(75)
            name_list.append(76)
            name_list.append(87)
            name_list.append(81)
            continue
        if interest_name == "比特幣":
            name_list.append(69)
            name_list.append(70)
            name_list.append(75)
            name_list.append(76)
            name_list.append(63)
            name_list.append(64)
            name_list.append(66)
            # name_list.append(id for id in investment)
            # name_list.append(id for id in crypto_coin)
            continue
        name_list.append(subgroups2idx[interest_name])
    # a.update({key:name for name in name_list})
    # break
    # a["user_id"] = key
    # a["interests"] = name_list
    a[key] = name_list
    info.append(a)
    # user_info[key] = name_list   

In [9]:
df = pd.DataFrame(list(info[0].items()), columns=["user_id", "interests"])

In [10]:
pd.melt(frame=df.head().set_index('user_id')['interests'].apply(pd.Series).reset_index(), id_vars=['user_id'], value_name='interests').dropna().drop(['variable'], axis=1)[:5]

Unnamed: 0,user_id,interests
0,54ccaa73a784960a00948687,61.0
1,54dca4456d7d350900e86bae,25.0
2,54e421bac5c9c00900cd8d47,3.0
3,54e961d4c5c9c00900cd8d84,70.0
4,54e9b744c5c9c00900cd8d8a,34.0


In [259]:
train_df = train_group_df.fillna("0")
train_id = train_df["user_id"].to_list()
train_subgroup = train_df["subgroup"].to_list()
train_info = {}
for id, subgroup in zip(train_id, train_subgroup):
    train_info[id] = subgroup.split(' ')

In [252]:
train_group_df["user_id"].isnull().sum()

0

In [254]:
train_df = pd.DataFrame(list(train_info.items()), columns=["user_id", "subgroup"])

In [238]:
train_df

Unnamed: 0,user_id,subgroup
0,5bdecbfffec014002166796a,[27]
1,5fedf958af850a915c86362c,"[1, 7, 19, 29, 36, 49, 50, 51, 59, 61, 63, 64,..."
2,5fd255c43136a460c6f3f930,"[8, 28]"
3,5a0bde2aa15b3f001e98429a,"[1, 59, 60, 71, 79]"
4,5fedf8132a0eb0bfab27882b,[89]
...,...,...
59732,6114c700aa04ce00067265a5,"[59, 60]"
59733,60e66f29be3e3b0006c4db75,"[1, 16]"
59734,60e2b05ee742c300072ff5b3,"[50, 51, 66, 72]"
59735,60da0995947dfc0fb61ef296,[69]


In [13]:

import random
id_interest = {}
for id in train_df["user_id"]:
    user_interest = info[0][id]
    # print(user_interest)
    if user_interest == 0 or user_interest == []:
        # print("noo")
        id_interest[id] = [0]
    else:
        # print(user_interest)
        id_interest[id] = random.sample(set(user_interest), 1)
        # print(aaa)

since Python 3.9 and will be removed in a subsequent version.
  id_interest[id] = random.sample(set(user_interest), 1)


In [255]:
train_data = pd.melt(frame=train_df.set_index('user_id')['subgroup'].apply(pd.Series).reset_index(), id_vars=['user_id'], value_name='subgroup').dropna().drop(['variable'], axis=1)

In [240]:
train_data

Unnamed: 0,user_id,subgroup
0,5bdecbfffec014002166796a,27
1,5fedf958af850a915c86362c,1
2,5fd255c43136a460c6f3f930,8
3,5a0bde2aa15b3f001e98429a,1
4,5fedf8132a0eb0bfab27882b,89
...,...,...
2941307,57de1dced5766a0700c6c4d0,85
2987675,5f5312238717f4a78f3a379d,88
2990084,5c587cf379f1990022dfe2ca,90
3001044,57de1dced5766a0700c6c4d0,90


In [256]:
import pickle
with open("train_user_subgroup_counts.pkl", "rb") as f:
    obj = pickle.load(f)

In [56]:
def create_interaction_matrix(df,user_col, item_col, rating_col, norm= False, threshold = None):
    '''
    Function to create an interaction matrix dataframe from transactional type interactions
    Required Input -
        - df = Pandas DataFrame containing user-item interactions
        - user_col = column name containing user's identifier
        - item_col = column name containing item's identifier
        - rating col = column name containing user feedback on interaction with a given item
        - norm (optional) = True if a normalization of ratings is needed
        - threshold (required if norm = True) = value above which the rating is favorable
    Expected output - 
        - Pandas dataframe with user-item interactions ready to be fed in a recommendation algorithm
    '''
    interactions = df.groupby([user_col, item_col])[rating_col] \
            .sum().unstack().reset_index(). \
            fillna(0).set_index(user_col)
    if norm:
        interactions = interactions.applymap(lambda x: 1 if x > threshold else 0)
    return interactions

In [257]:
train_data

Unnamed: 0,user_id,subgroup
0,5bdecbfffec014002166796a,27
1,5fedf958af850a915c86362c,1
2,5fd255c43136a460c6f3f930,8
3,5a0bde2aa15b3f001e98429a,1
4,5fedf8132a0eb0bfab27882b,89
...,...,...
2941307,57de1dced5766a0700c6c4d0,85
2987675,5f5312238717f4a78f3a379d,88
2990084,5c587cf379f1990022dfe2ca,90
3001044,57de1dced5766a0700c6c4d0,90


In [258]:
subgroup_count = []
for (id, group_id) in zip(train_data['user_id'], train_data['subgroup']):
    subgroup_count.append(obj[id][int(group_id)])
    
train_data["subgroup_count"] = subgroup_count

KeyError: 8

In [38]:
train_data

Unnamed: 0,user_id,subgroup,subgroup_count
0,5bdecbfffec014002166796a,27,1
1,5fedf958af850a915c86362c,1,1
2,5fd255c43136a460c6f3f930,8,1
3,5a0bde2aa15b3f001e98429a,1,1
4,5fedf8132a0eb0bfab27882b,89,1
...,...,...,...
2941307,57de1dced5766a0700c6c4d0,85,2
2987675,5f5312238717f4a78f3a379d,88,1
2990084,5c587cf379f1990022dfe2ca,90,1
3001044,57de1dced5766a0700c6c4d0,90,1


### Interation of users and soubgroups

In [63]:
interation = create_interaction_matrix(train_data, "user_id", "subgroup", "subgroup_count")

x = sparse.csr_matrix(interation.values)
x

<59737x92 sparse matrix of type '<class 'numpy.float64'>'
	with 235302 stored elements in Compressed Sparse Row format>

In [276]:
train_data = pd.read_csv("train_new.csv")

In [104]:
train_data

Unnamed: 0,user_id,subgroup,subgroup_count,gender
0,5bdecbfffec014002166796a,27,1,0
1,5fedf958af850a915c86362c,1,1,0
2,5fd255c43136a460c6f3f930,8,1,1
3,5a0bde2aa15b3f001e98429a,1,1,0
4,5fedf8132a0eb0bfab27882b,89,1,1
...,...,...,...,...
235297,57de1dced5766a0700c6c4d0,85,2,1
235298,5f5312238717f4a78f3a379d,88,1,2
235299,5c587cf379f1990022dfe2ca,90,1,2
235300,57de1dced5766a0700c6c4d0,90,1,1


In [277]:
import json
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit(
    users=(x for x in train_data['user_id']),
    items=(str(x) for x in train_data['subgroup']),
)

In [279]:
dataset.fit_partial(
    items=(x for x in train_data['subgroup']),
    item_features = (x for x in train_data['subgroup_count'])
)

In [122]:
dataset.fit_partial(
    users=(x for x in train_data['user_id']),
    user_features=(str(x) for x in train_data['gender'])
)

In [280]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 59737, num_items 184.


In [210]:
(training_data, weights) = dataset.build_interactions(((x, y) for x, y in zip(train_data['user_id'], train_data['subgroup'])))

print(repr(training_data))
print(repr(weights))

<59737x184 sparse matrix of type '<class 'numpy.int32'>'
	with 235302 stored elements in COOrdinate format>
<59737x184 sparse matrix of type '<class 'numpy.float32'>'
	with 235302 stored elements in COOrdinate format>


In [123]:
user_features = dataset.build_user_features(((x, str(y)) for x, y in zip(train_data['user_id'], train_data['gender']))) 
print(repr(user_features))

<59737x59743 sparse matrix of type '<class 'numpy.float32'>'
	with 119474 stored elements in Compressed Sparse Row format>


In [121]:
item_features = dataset.build_item_features(((x, str(y)) for x, y in zip(train_data['subgroup'], train_data['subgroup_count'])))
print(repr(item_features))

<184x184 sparse matrix of type '<class 'numpy.float32'>'
	with 595 stored elements in Compressed Sparse Row format>


In [211]:
from lightfm import LightFM

model = LightFM(loss='bpr', random_state=6666, learning_schedule='adagrad', learning_rate=1e-3)
# model = LightFM(loss='warp', random_state=6666, learning_schedule='adagrad', learning_rate=1e-3)
# model.fit(training_data, epochs=100, num_threads=1, item_features=item_features, user_features=user_features, sample_weight=weights)
model.fit(training_data, epochs=100, num_threads=1, item_features=item_features, user_features=None, sample_weight=weights)
# model.fit(x, epochs=100, num_threads=1, item_features=item_features, user_features=user_features)

<lightfm.lightfm.LightFM at 0x22117e92ee0>

### Validation

In [284]:
val_seen_group_df = pd.read_csv("./data/val_seen_group.csv")

In [285]:
val_seen_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7748 entries, 0 to 7747
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   7748 non-null   object
 1   subgroup  7715 non-null   object
dtypes: object(2)
memory usage: 121.2+ KB


In [301]:
val_id = val_seen_group_df["user_id"].to_list()
val_subgroup = val_seen_group_df["subgroup"].to_list()
val_seen_info = {}
for id, subgroup in zip(val_id, val_subgroup):
    val_seen_info[id] = str(subgroup).split(' ')


In [302]:
val_seen_daf = pd.DataFrame(list(val_seen_info.items()), columns=["user_id", "subgroup"])

In [304]:
valid_data = pd.melt(frame=val_seen_daf.set_index('user_id')['subgroup'].apply(pd.Series).reset_index(), id_vars=['user_id'], value_name='subgroup').dropna().drop(['variable'], axis=1)

In [305]:
valid_data

Unnamed: 0,user_id,subgroup
0,56dae2b74e3ef90900b7bd0e,37
1,60e66f29be3e3b0006c4db75,1
2,5c919efb728ddf00208b9b2b,1
3,5ac115507997a2001e7c3617,33
4,5f53b84440c5be3bb873a9d3,30
...,...,...
203082,60c30f6fbbd2121830ba4340,86
209590,5a11254d516eb50051f99473,91
210192,608a117c7171ccdc19e9dbf0,89
210830,60c30f6fbbd2121830ba4340,90


#### val_seen dataset bulid

In [307]:
valseen_dataset = Dataset()
valseen_dataset.fit((x for x in valid_data['user_id']),
            (x for x in valid_data['subgroup']))

In [308]:
(val_data, weights) = valseen_dataset.build_interactions(((x, y) for x, y in zip(valid_data['user_id'], valid_data['subgroup'])))

print(repr(val_data))

<7748x92 sparse matrix of type '<class 'numpy.int32'>'
	with 28533 stored elements in COOrdinate format>


In [309]:
num_users, num_items = valseen_dataset.interactions_shape()
num_users, num_items

(7748, 92)

In [321]:
def sample_recommendation(model, interactions, user_ids):
     
    recommend_list = []
    for user_id in user_ids:
        # scores = pd.Series(model.predict(user_id,np.arange(92)))
        # scores.index = interactions.columns
        # scores = list(pd.Series(scores.sort_values(ascending=False).index))
        
        # known_items = list(pd.Series(interactions.loc[user_id,:] \
        #                             [interactions.loc[user_id,:] > 0].index) \
        #                             .sort_values(ascending=False))
        
        # scores = [x for x in scores if x not in known_items]
        # scores = [x for x in scores]
        # return_score_list = scores[0:50]
        # known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
        # scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
        
        # scores = pd.Series(model.predict(user_id, np.arange(92)))   # 91個subgroup
        # print(scores)
        # scores = list(pd.Series(scores.sort_values(ascending=False).index))
        scores = model.predict(user_id, np.arange(92))
        top_items = np.argsort(-scores)
        # top_items = subgroups_df["subgroup_id"][np.argsort(-scores)]
        
        # print("User %s" % user_id)

        # print("     Recommended:")
            
        # recommend_list.append([str(x) for x in return_score_list])
        recommend_list.append([str(x+1) for x in top_items[:50]])
        
    return recommend_list

In [322]:
user_id = val_seen_group_df["user_id"].to_dict()

users_list =[]
for i in range(len(user_id)):
    users_list.append(i)

In [323]:
recommend_list = sample_recommendation(model, val_data, users_list)  # test_seen_daf['user_id'].tolist()

In [324]:
import csv
with open("validation.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(["user_id", "subgroup"])
    for i in range(len(recommend_list)):
        recommend = " ".join(recommend_list[i])
        writer.writerow([user_id[i], recommend])

### Prediction

In [157]:
test_seen_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7205 entries, 0 to 7204
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   7205 non-null   object
 1   subgroup  7205 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 112.7+ KB


In [158]:
test_id = test_seen_group_df["user_id"].to_list()
test_subgroup = test_seen_group_df["subgroup"].to_list()
test_seen_info = {}
for id, subgroup in zip(test_id, test_subgroup):
    test_seen_info[id] = subgroup


In [159]:
test_seen_daf = pd.DataFrame(list(test_seen_info.items()), columns=["user_id", "subgroup"])

In [139]:
test_seen_daf

Unnamed: 0,user_id,subgroup
0,5c6e7a8d6d180f002084a746,1
1,5e11926d54511e0d1440d024,1
2,59a1bbeb3ba5a507005d94bf,1
3,5a11254d516eb50051f99473,1
4,56702e0f13ef621200899d90,1
...,...,...
7200,5a181ca3152204001e1bf92b,1
7201,59c4b97840fc9f001e640b09,1
7202,5f998423f92f815dbbeb9130,1
7203,5a115497516eb50051f99539,1


#### test_seen dataset bulid

In [161]:
testseen_dataset = Dataset()
testseen_dataset.fit((x for x in test_seen_daf['user_id']),
            (x for x in test_seen_daf['subgroup']))
# testseen_dataset.fit((x for x in test_seen_daf['user_id']))

In [151]:
(test_data, weights) = testseen_dataset.build_interactions(((x, y) for x, y in zip(test_seen_daf['user_id'], test_seen_daf['subgroup'])))

print(repr(test_data))

<7205x1 sparse matrix of type '<class 'numpy.int32'>'
	with 7205 stored elements in COOrdinate format>


In [162]:
def sample_recommendation(model, data, user_ids, user_id):
     
    recommend_list = []
    for user_id in user_ids:
        # known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]
        
        scores = model.predict(user_id, np.arange(91))   # 91個subgroup
        # print(scores)
        top_items = subgroups_df["subgroup_id"][np.argsort(-scores)]
        
        # print("User %s" % user_id)

        # print("     Recommended:")
            
        recommend_list.append([str(x) for x in top_items[:40]])
        
    return recommend_list

In [163]:
user_id = test_seen_group_df["user_id"].to_dict()

users_list =[]
for i in range(len(user_id)):
    users_list.append(i)

In [154]:
recommend_list = sample_recommendation(model, test_data, users_list, user_id)  # test_seen_daf['user_id'].tolist()

  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_list.append([str(x) for x in top_items[:40]])
  recommend_li

In [155]:
import csv
with open("predict.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(["user_id", "subgroup"])
    for i in range(len(recommend_list)):
        recommend = " ".join(recommend_list[i])
        writer.writerow([user_id[i], recommend])