In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from collections import Counter


# Mapping subgroup names to subgroup ids

In [3]:
# Load the subgroup table; the columns used here are `subgroup_id` and
# `subgroup_name`.
subgroups_df = pd.read_csv('./data/subgroups.csv')
# Number of rows in the subgroup table.
subgroup_num = len(subgroups_df)

# Build the subgroup_name -> subgroup_id lookup in one pass.
# dict(zip(...)) keeps the same "last duplicate wins" semantics as the
# row-by-row update it replaces, but leaves no loop variables behind in the
# notebook namespace (the previous loop variable `id` shadowed the built-in
# id() for every later cell).
subgroups2idx = dict(zip(subgroups_df["subgroup_name"], subgroups_df["subgroup_id"]))


# Mapping課程和包含的subgroups們

In [11]:
# Load the course table; `sub_groups` is read as a comma-separated string of
# subgroup names, and may be missing for a course.
course_df = pd.read_csv('./data/courses.csv')

# course_id -> list of subgroup ids for that course.
course2subgroups = {}
for cid, raw_names in zip(course_df["course_id"], course_df["sub_groups"]):
    if pd.notnull(raw_names):
        # Translate every listed subgroup name into its id via subgroups2idx.
        subgroup_ids = [subgroups2idx[name] for name in raw_names.split(',')]
    else:
        # A course with no sub_groups value is mapped to the single index 0.
        subgroup_ids = [0]
    course2subgroups[cid] = subgroup_ids

course2subgroups

{'61888e868f154b000781b45a': [1, 2],
 '54d5a117065a7e0e00725ac0': [3, 4],
 '54d5d9952246e60a009ec571': [3, 5],
 '54d7148a2246e60a009ec588': [3, 6, 5],
 '5513e92b38239d10005778e1': [7, 8],
 '55307d94d530a90a00a3a896': [9],
 '551a6be023774e0a001eb20c': [4],
 '551171a938239d1000577864': [10, 11],
 '54f1268f4ec3c809002e4a29': [12],
 '55599687dfe21b0a00e776d9': [13],
 '559e49185850311000fca504': [14, 1, 15],
 '55a4d6397b4d99100011957d': [13],
 '55ae208a7b4d9910001198f2': [1, 16],
 '55b0ba667b4d991000119a92': [15],
 '557ad62bd736230f00adb2ab': [17],
 '556ad58777a8710900bedffe': [18],
 '55ae36c87b4d99100011990f': [15],
 '55ae34247b4d99100011990d': [15],
 '55ae66017b4d991000119959': [15],
 '5591501e6dec460f00111314': [4],
 '55c636d27b4d99100011a4aa': [19],
 '5593f992cfe8320b00ccd4c4': [6, 20, 4],
 '559a217dcfe8320b00ccd75b': [21],
 '55e3dccbfa223d100058f3cb': [4],
 '5606262ac61d930a00455fad': [4],
 '55e82d8efa223d100058f6f4': [3, 5],
 '55e66f33fa223d100058f5f7': [1],
 '55ea1ea8fa223d100058f7df

# 得到每個user購買過的subgroups總數

In [26]:
# Load the training data; each row pairs a `user_id` with a `course_id` string
# holding course ids separated by single spaces.
train_df = pd.read_csv('./data/train.csv')

# user_id -> list of length subgroup_num + 1, where position i counts how many
# of the user's courses list subgroup index i.
user2subgroups_count = {}
for uid, purchased in zip(train_df["user_id"], train_df["course_id"]):
    counts = [0] * (subgroup_num + 1)
    for cid in purchased.split(' '):
        # A course contributes one count to each of its subgroup indices.
        for subgroup_idx in course2subgroups[cid]:
            counts[subgroup_idx] += 1
    user2subgroups_count[uid] = counts



In [31]:
def get_none_zero_subgroups_count(subgroups_count):
    """Return the strictly positive entries of a count sequence, keyed by index.

    Args:
        subgroups_count: Indexable sequence of counts (supports ``len`` and
            integer indexing), one entry per subgroup index.

    Returns:
        dict mapping each index whose count is greater than zero to that
        count, inserted in ascending index order.
    """
    return {
        idx: subgroups_count[idx]
        for idx in range(len(subgroups_count))
        if subgroups_count[idx] > 0
    }

In [32]:
# Show only the non-zero subgroup counts for one example user.
get_none_zero_subgroups_count(user2subgroups_count["60c49342b1da9db0f1cb19e4"])

{32: 1, 33: 2, 34: 1, 38: 1, 39: 1, 58: 1}

# 用法

In [33]:
# Get the purchase count of one specific subgroup for one specific user
user2subgroups_count["60c49342b1da9db0f1cb19e4"][33]

2