In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Import training data and get a sense of it

In [2]:
train_df = pd.read_csv('/Users/amitagarwal/GreyAtom/Hackathon/mckinesy_recommendation_dataset/train.csv')

In [3]:
train_df.shape

(903916, 4)

In [4]:
train_df.head(10)

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933
5,4576_6,4576,6,CI25135
6,4576_7,4576,7,CI23975
7,4576_8,4576,8,CI25126
8,4576_9,4576,9,CI24915
9,4576_10,4576,10,CI24957


# Check for null values

In [5]:
train_df.isnull().sum()

user_sequence         0
user_id               0
challenge_sequence    0
challenge             0
dtype: int64

In [6]:
train_df.isna().sum()

user_sequence         0
user_id               0
challenge_sequence    0
challenge             0
dtype: int64

# Check if 13 sequences exist for every user

In [7]:
train_df['challenge_sequence'].value_counts()

13    69532
12    69532
11    69532
10    69532
9     69532
8     69532
7     69532
6     69532
5     69532
4     69532
3     69532
2     69532
1     69532
Name: challenge_sequence, dtype: int64

# Import Category dataset

In [8]:
category_df = pd.read_csv('/Users/amitagarwal/GreyAtom/Hackathon/mckinesy_recommendation_dataset/challenge_data.csv')

In [9]:
category_df.shape

(5606, 9)

# Explore Category dataset

In [10]:
category_df.head()

Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


# Check for null values

In [11]:
category_df.isnull().sum()

challenge_ID               0
programming_language       0
challenge_series_ID       12
total_submissions        352
publish_date               0
author_ID                 39
author_gender             97
author_org_ID            248
category_id             1841
dtype: int64

In [12]:
category_df['programming_language'].value_counts()

1    5217
2     319
3      70
Name: programming_language, dtype: int64

In [13]:
category_df.describe()

Unnamed: 0,programming_language,total_submissions,category_id
count,5606.0,5254.0,3765.0
mean,1.081877,348.362581,81.083665
std,0.316487,1044.810816,56.367797
min,1.0,2.0,22.0
25%,1.0,67.0,36.0
50%,1.0,134.0,66.0
75%,1.0,297.0,113.0
max,3.0,43409.0,304.0


In [14]:
len(category_df['author_ID'].value_counts().index)

3484

In [15]:
len(category_df['author_org_ID'].value_counts().index)

1717

In [16]:
user_ids = train_df['user_id'].unique()
user_ids = user_ids[:10000]
# user_ids

In [17]:
def get_profile_of_user(user_id):
    df_for_user = train_df[train_df['user_id'] == user_id]
    ids_of_challenges_solved_by_user = list(df_for_user['challenge'])
    
    challeges_solved_by_user = category_df[category_df['challenge_ID'].isin(ids_of_challenges_solved_by_user)]
    
    # GET ALL PROGRAMMING LANGUAGES USED
    programming_languages_used = challeges_solved_by_user['programming_language'].unique()
    
    programming_languages_used_dict = dict()
    for language_id in programming_languages_used:
        programming_languages_used_dict['programming_language' + str(language_id)] = 1
    
    # GET ALL SERIES ATTEMPTED
    series_attempted = challeges_solved_by_user['challenge_series_ID'].unique()
    
    series_attempted_dict = dict()
    for series in series_attempted:
        series_attempted_dict[series] = 1
        
    # GET AUTHOR ORGANIZATIONS ATTEMPTED
    #author_orgs = challeges_solved_by_user['author_org_ID'].unique()
    
    #author_orgs_dict = dict()
    #for author in author_orgs:
        #author_orgs_dict[author] = 1
    
    new_df = pd.DataFrame([[user_id, programming_languages_used_dict, series_attempted_dict]], columns=['user_id','programming_languages', 'challenge_series_attempted'])
    return new_df


In [18]:

user_profile_df = pd.DataFrame(columns=['user_id','programming_languages', 'challenge_series_attempted'])

for user_id in user_ids:
    user_profile_df = user_profile_df.append(get_profile_of_user(user_id))

user_profile_df
user_profile_df = user_profile_df.set_index('user_id', drop=True)

programming_languages_df = user_profile_df['programming_languages'].apply(pd.Series)
programming_languages_df.index = user_profile_df.index
programming_languages_df = programming_languages_df.replace(np.nan, 0)
programming_languages_df.head()

challenge_series_attempted_df = user_profile_df['challenge_series_attempted'].apply(pd.Series)
challenge_series_attempted_df.index = user_profile_df.index
challenge_series_attempted_df = challenge_series_attempted_df.replace(np.nan, 0)
challenge_series_attempted_df.head()

#author_orgs_attempted_df = user_profile_df['author_orgs_attempted'].apply(pd.Series)
#author_orgs_attempted_df.index = user_profile_df.index
#author_orgs_attempted_df = author_orgs_attempted_df.replace(np.nan, 0)
#author_orgs_attempted_df.head()

user_profile_df = pd.concat([programming_languages_df, challenge_series_attempted_df], axis=1)

user_profile_df.head()

Unnamed: 0_level_0,programming_language1,programming_language2,programming_language3,SI2472,SI2463,SI2469,SI2477,SI2468,SI2462,SI2545,...,SI2734,SI2561,SI2537,SI2633,SI2436,SI2831,SI2854,SI2636,SI2764,SI2803
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4576,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4580,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4581,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4582,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4585,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
user_profile_df_train = user_profile_df[:8000]
user_profile_df_test = user_profile_df[8000:]

In [20]:
final = pd.DataFrame(cosine_similarity(user_profile_df), columns=user_profile_df.index, index=user_profile_df.index)
final

user_id,4576,4580,4581,4582,4585,4587,4590,4591,4592,4593,...,20452,20453,20454,20456,20457,20459,20460,20461,20463,20464
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4576,1.000000,0.804030,0.235702,0.824958,0.888889,0.703526,0.804030,0.707107,0.948683,0.235702,...,0.666667,0.235702,0.632456,0.235702,0.272166,0.235702,0.235702,0.235702,0.444444,0.235702
4580,0.804030,1.000000,0.213201,0.852803,0.904534,0.727273,0.727273,0.746203,0.762770,0.213201,...,0.603023,0.213201,0.476731,0.213201,0.369274,0.213201,0.213201,0.213201,0.402015,0.213201
4581,0.235702,0.213201,1.000000,0.250000,0.235702,0.213201,0.213201,0.250000,0.223607,1.000000,...,0.235702,0.500000,0.223607,0.500000,0.288675,0.500000,0.500000,0.500000,0.235702,0.500000
4582,0.824958,0.852803,0.250000,1.000000,0.824958,0.639602,0.746203,0.750000,0.782624,0.250000,...,0.589256,0.250000,0.447214,0.250000,0.433013,0.250000,0.250000,0.250000,0.471405,0.250000
4585,0.888889,0.904534,0.235702,0.824958,1.000000,0.603023,0.703526,0.824958,0.843274,0.235702,...,0.666667,0.235702,0.527046,0.235702,0.272166,0.235702,0.235702,0.235702,0.333333,0.235702
4587,0.703526,0.727273,0.213201,0.639602,0.603023,1.000000,0.636364,0.533002,0.667424,0.213201,...,0.603023,0.213201,0.572078,0.213201,0.369274,0.213201,0.213201,0.213201,0.502519,0.213201
4590,0.804030,0.727273,0.213201,0.746203,0.703526,0.636364,1.000000,0.639602,0.858116,0.213201,...,0.502519,0.213201,0.476731,0.213201,0.246183,0.213201,0.213201,0.213201,0.402015,0.213201
4591,0.707107,0.746203,0.250000,0.750000,0.824958,0.533002,0.639602,1.000000,0.670820,0.250000,...,0.707107,0.250000,0.559017,0.250000,0.288675,0.250000,0.250000,0.250000,0.353553,0.250000
4592,0.948683,0.762770,0.223607,0.782624,0.843274,0.667424,0.858116,0.670820,1.000000,0.223607,...,0.632456,0.223607,0.600000,0.223607,0.258199,0.223607,0.223607,0.223607,0.421637,0.223607
4593,0.235702,0.213201,1.000000,0.250000,0.235702,0.213201,0.213201,0.250000,0.223607,1.000000,...,0.235702,0.500000,0.223607,0.500000,0.288675,0.500000,0.500000,0.500000,0.235702,0.500000


In [21]:
check = final[9900:]
check

user_id,4576,4580,4581,4582,4585,4587,4590,4591,4592,4593,...,20452,20453,20454,20456,20457,20459,20460,20461,20463,20464
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20286,0.192450,0.174078,0.408248,0.204124,0.192450,0.174078,0.174078,0.204124,0.182574,0.408248,...,0.192450,0.408248,0.182574,0.408248,0.235702,0.408248,0.408248,0.408248,0.192450,0.408248
20289,0.235702,0.213201,0.500000,0.250000,0.235702,0.213201,0.213201,0.250000,0.223607,0.500000,...,0.235702,0.500000,0.223607,0.500000,0.288675,0.500000,0.500000,0.500000,0.235702,0.500000
20292,0.235702,0.213201,0.500000,0.250000,0.235702,0.213201,0.213201,0.250000,0.223607,0.500000,...,0.235702,0.500000,0.223607,0.500000,0.288675,0.500000,0.500000,0.500000,0.235702,0.500000
20293,0.235702,0.213201,0.500000,0.250000,0.235702,0.213201,0.213201,0.250000,0.223607,0.500000,...,0.235702,0.500000,0.223607,0.500000,0.288675,0.500000,0.500000,0.500000,0.235702,0.500000
20294,0.666667,0.452267,0.353553,0.530330,0.500000,0.603023,0.603023,0.530330,0.632456,0.353553,...,0.500000,0.353553,0.632456,0.353553,0.408248,0.353553,0.353553,0.353553,0.500000,0.353553
20296,0.235702,0.213201,0.500000,0.250000,0.235702,0.213201,0.213201,0.250000,0.223607,0.500000,...,0.235702,0.500000,0.223607,0.500000,0.288675,0.500000,0.500000,0.500000,0.235702,0.500000
20298,0.235702,0.213201,0.500000,0.250000,0.235702,0.213201,0.213201,0.250000,0.223607,0.500000,...,0.235702,1.000000,0.223607,0.500000,0.288675,0.500000,0.500000,0.500000,0.235702,0.500000
20300,0.384900,0.348155,0.408248,0.408248,0.384900,0.348155,0.348155,0.408248,0.365148,0.408248,...,0.384900,0.408248,0.365148,0.408248,0.471405,0.408248,0.408248,0.408248,0.384900,0.408248
20302,0.603023,0.727273,0.213201,0.746203,0.703526,0.454545,0.545455,0.639602,0.572078,0.213201,...,0.402015,0.213201,0.286039,0.213201,0.246183,0.213201,0.213201,0.213201,0.301511,0.213201
20306,0.235702,0.213201,0.500000,0.250000,0.235702,0.213201,0.213201,0.250000,0.223607,0.500000,...,0.235702,0.500000,0.223607,0.500000,0.288675,0.500000,0.500000,0.500000,0.235702,0.500000


In [22]:
new_check = check.T

In [23]:
users = new_check.columns

In [24]:
most_similar_for_user = dict()
for user in users:
    most_similar_for_user[user] = new_check[user].sort_values(ascending=False)[1:2].index[0]

most_similar_for_user

{20286: 19950,
 20289: 15374,
 20292: 15374,
 20293: 15374,
 20294: 20227,
 20296: 15374,
 20298: 19875,
 20300: 7609,
 20302: 12150,
 20306: 15374,
 20307: 20378,
 20308: 16800,
 20309: 18040,
 20311: 19875,
 20313: 15374,
 20318: 5923,
 20320: 17266,
 20321: 19875,
 20322: 17266,
 20327: 13042,
 20328: 20328,
 20329: 15850,
 20331: 5609,
 20332: 10447,
 20334: 20235,
 20336: 7740,
 20341: 12109,
 20343: 9837,
 20345: 7365,
 20346: 15374,
 20348: 15374,
 20349: 7709,
 20351: 17242,
 20352: 8404,
 20354: 12376,
 20355: 15374,
 20357: 7010,
 20359: 19551,
 20361: 15374,
 20362: 15374,
 20364: 15374,
 20365: 7591,
 20366: 15374,
 20367: 15374,
 20368: 20328,
 20369: 7159,
 20371: 12165,
 20373: 16419,
 20375: 20375,
 20376: 19875,
 20377: 5226,
 20378: 14847,
 20379: 10154,
 20384: 8306,
 20386: 15374,
 20387: 8727,
 20388: 19889,
 20391: 18161,
 20394: 18760,
 20397: 17660,
 20398: 15420,
 20399: 19875,
 20400: 12992,
 20402: 7642,
 20404: 16757,
 20405: 10355,
 20406: 13279,
 20407: 10

In [25]:
next_3_challenges_for_user = dict()
for key in most_similar_for_user.keys():
    challeges_recommended_df =  train_df[(train_df['user_id'] == most_similar_for_user[key]) & (train_df['challenge_sequence']>10)]
    next_3_challenges_for_user[key] = challeges_recommended_df['challenge'].unique()
    
next_3_challenges_for_user

{20286: array(['CI26837', 'CI26844', 'CI27779'], dtype=object),
 20289: array(['CI27787', 'CI27788', 'CI27784'], dtype=object),
 20292: array(['CI27787', 'CI27788', 'CI27784'], dtype=object),
 20293: array(['CI27787', 'CI27788', 'CI27784'], dtype=object),
 20294: array(['CI24968', 'CI23691', 'CI23855'], dtype=object),
 20296: array(['CI27787', 'CI27788', 'CI27784'], dtype=object),
 20298: array(['CI27043', 'CI27047', 'CI27045'], dtype=object),
 20300: array(['CI24952', 'CI24951', 'CI24968'], dtype=object),
 20302: array(['CI24206', 'CI23812', 'CI25126'], dtype=object),
 20306: array(['CI27787', 'CI27788', 'CI27784'], dtype=object),
 20307: array(['CI23663', 'CI25295', 'CI24875'], dtype=object),
 20308: array(['CI25108', 'CI23708', 'CI23526'], dtype=object),
 20309: array(['CI24115', 'CI26054', 'CI23714'], dtype=object),
 20311: array(['CI27043', 'CI27047', 'CI27045'], dtype=object),
 20313: array(['CI27787', 'CI27788', 'CI27784'], dtype=object),
 20318: array(['CI24954', 'CI24952', 'CI

In [26]:
challeges_actually_solved_dict = dict()

for key in most_similar_for_user.keys():
    challeges_actually_solved_df = train_df[(train_df['user_id'] == key) & (train_df['challenge_sequence']>10)]
    challeges_actually_solved_dict[key] = challeges_actually_solved_df['challenge'].unique()
    
challeges_actually_solved_dict

{20286: array(['CI27784', 'CI27803', 'CI27782'], dtype=object),
 20289: array(['CI27805', 'CI27787', 'CI27785'], dtype=object),
 20292: array(['CI27797', 'CI27801', 'CI27799'], dtype=object),
 20293: array(['CI27799', 'CI27790', 'CI27809'], dtype=object),
 20294: array(['CI24968', 'CI23691', 'CI23855'], dtype=object),
 20296: array(['CI27797', 'CI27801', 'CI27799'], dtype=object),
 20298: array(['CI27039', 'CI27043', 'CI27034'], dtype=object),
 20300: array(['CI24992', 'CI24962', 'CI23812'], dtype=object),
 20302: array(['CI24527', 'CI25143', 'CI23812'], dtype=object),
 20306: array(['CI27809', 'CI27800', 'CI27806'], dtype=object),
 20307: array(['CI23702', 'CI25295', 'CI23648'], dtype=object),
 20308: array(['CI24926', 'CI25022', 'CI25179'], dtype=object),
 20309: array(['CI24150', 'CI24871', 'CI24917'], dtype=object),
 20311: array(['CI27034', 'CI27045', 'CI27044'], dtype=object),
 20313: array(['CI27809', 'CI27811', 'CI27807'], dtype=object),
 20318: array(['CI23880', 'CI23999', 'CI

In [27]:
"""Ben Hammer metrics page
"""
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [28]:
precision = list()

for user in challeges_actually_solved_dict.keys():
    actual = challeges_actually_solved_dict[user]
    predicted = next_3_challenges_for_user[user]
    print(actual, predicted)
    precision.append(apk(actual, predicted))

['CI27784' 'CI27803' 'CI27782'] ['CI26837' 'CI26844' 'CI27779']
['CI27805' 'CI27787' 'CI27785'] ['CI27787' 'CI27788' 'CI27784']
['CI27797' 'CI27801' 'CI27799'] ['CI27787' 'CI27788' 'CI27784']
['CI27799' 'CI27790' 'CI27809'] ['CI27787' 'CI27788' 'CI27784']
['CI24968' 'CI23691' 'CI23855'] ['CI24968' 'CI23691' 'CI23855']
['CI27797' 'CI27801' 'CI27799'] ['CI27787' 'CI27788' 'CI27784']
['CI27039' 'CI27043' 'CI27034'] ['CI27043' 'CI27047' 'CI27045']
['CI24992' 'CI24962' 'CI23812'] ['CI24952' 'CI24951' 'CI24968']
['CI24527' 'CI25143' 'CI23812'] ['CI24206' 'CI23812' 'CI25126']
['CI27809' 'CI27800' 'CI27806'] ['CI27787' 'CI27788' 'CI27784']
['CI23702' 'CI25295' 'CI23648'] ['CI23663' 'CI25295' 'CI24875']
['CI24926' 'CI25022' 'CI25179'] ['CI25108' 'CI23708' 'CI23526']
['CI24150' 'CI24871' 'CI24917'] ['CI24115' 'CI26054' 'CI23714']
['CI27034' 'CI27045' 'CI27044'] ['CI27043' 'CI27047' 'CI27045']
['CI27809' 'CI27811' 'CI27807'] ['CI27787' 'CI27788' 'CI27784']
['CI23880' 'CI23999' 'CI24918'] ['CI2495

In [29]:
np.mean(precision)

0.07666666666666667