In [299]:
pip install lightfm

Note: you may need to restart the kernel to use updated packages.


In [300]:
import pandas as pd   
import numpy as np     

In [301]:
# Read all our datasets

df_answer_scores = pd.read_csv('answer_scores.csv')
df_answers = pd.read_csv('answers.csv',parse_dates=['answers_date_added'])
df_comments = pd.read_csv('comments.csv')
df_emails = pd.read_csv('emails.csv')
df_group_memberships = pd.read_csv('group_memberships.csv')
df_groups = pd.read_csv('groups.csv')
df_matches = pd.read_csv('matches.csv')
df_professionals = pd.read_csv('professionals.csv',parse_dates=['professionals_date_joined'])
df_question_scores = pd.read_csv('question_scores.csv')
df_questions = pd.read_csv('questions.csv',parse_dates=['questions_date_added'])
df_school_memberships = pd.read_csv('school_memberships.csv')
df_students = pd.read_csv('students.csv',parse_dates=['students_date_joined'])
df_tag_questions = pd.read_csv('tag_questions.csv')
df_tag_users = pd.read_csv('tag_users.csv')
df_tags = pd.read_csv('tags.csv')

In [302]:
def generate_int_id(dataframe, id_col_name):
    """
    Generate unique integer id for users, questions and answers

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A. 
    id_col_name : String 
        New integer id's column name.
        
    Returns
    -------
    Dataframe
        Updated dataframe containing new id column 
    """
    new_dataframe=dataframe.assign(
        int_id_col_name=np.arange(len(dataframe))
        ).reset_index(drop=True)
    return new_dataframe.rename(columns={'int_id_col_name': id_col_name})


#create features 
def create_features(dataframe, features_name, id_col_name):
    """
    Generate features that will be ready for feeding into lightfm

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe which contains features
    features_name : List
        List of feature columns name avaiable in dataframe
    id_col_name: String
        Column name which contains id of the question or
        answer that the features will map to.
        There are two possible values for this variable.
        1. questions_id_num
        2. professionals_id_num

    Returns
    -------
    Pandas Series
        A pandas series containing process features
        that are ready for feed into lightfm.
        The format of each value
        will be (user_id, ['feature_1', 'feature_2', 'feature_3'])
        Ex. -> (1, ['military', 'army', '5'])
    """

    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features = list(zip(dataframe[id_col_name], features))
    return features


#Generate feature listing
def generate_feature_list(dataframe, features_name):
    """
    Generate features list for mapping 

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A. 
    features_name : List
        List of feature columns name avaiable in dataframe. 
        
    Returns
    -------
    List of all features for mapping 
    """
    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features = features.apply(pd.Series).stack().reset_index(drop=True)
    return features


#Calculate AUC score
def calculate_auc_score(lightfm_model, interactions_matrix, 
                        question_features, professional_features): 
    """
    Measure the ROC AUC metric for a model. 
    A perfect score is 1.0.

    Parameters
    ----------
    lightfm_model: LightFM model 
        A fitted lightfm model 
    interactions_matrix : 
        A lightfm interactions matrix 
    question_features, professional_features: 
        Lightfm features 
        
    Returns
    -------
    String containing AUC score 
    """
    score = auc_score( 
        lightfm_model, interactions_matrix, 
        item_features=question_features, 
        user_features=professional_features, 
        num_threads=4).mean()
    return score

Data processing and feature selection

In [303]:
#Generate numeric identifier
df_professionals = generate_int_id(df_professionals, 'professionals_id_num')
df_students = generate_int_id(df_students, 'students_id_num')
df_questions = generate_int_id(df_questions, 'questions_id_num')
df_answers = generate_int_id(df_answers, 'answers_id_num')
df_answers.head()

Unnamed: 0,answers_id,answers_author_id,answers_question_id,answers_date_added,answers_body,answers_id_num
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54,2016-04-29 19:40:14+00:00,<p>Hi!</p>\n<p>You are asking a very interesti...,0
1,ada720538c014e9b8a6dceed09385ee3,2aa47af241bf42a4b874c453f0381bd4,eb80205482e4424cad8f16bc25aa2d9c,2018-05-01 14:19:08+00:00,<p>Hi. I joined the Army after I attended coll...,1
2,eaa66ef919bc408ab5296237440e323f,cbd8f30613a849bf918aed5c010340be,eb80205482e4424cad8f16bc25aa2d9c,2018-05-02 02:41:02+00:00,"<p>Dear Priyanka,</p><p>Greetings! I have answ...",2
3,1a6b3749d391486c9e371fbd1e605014,7e72a630c303442ba92ff00e8ea451df,4ec31632938a40b98909416bdd0decff,2017-05-10 19:00:47+00:00,<p>I work for a global company who values high...,3
4,5229c514000446d582050f89ebd4e184,17802d94699140b0a0d2995f30c034c6,2f6a9a99d9b24e5baa50d40d0ba50a75,2017-10-13 22:07:33+00:00,I agree with Denise. Every single job I've had...,4


In [304]:
#Merge Dataset -
#All tags (q&a) are stored in a separate dataset. So firstly we merge those tags with questions and answers datasets.
#Then, we merge answers with quesitons because one question can have multiple answers.

#Clean Tag - 
df_tags = df_tags.dropna()
df_tags['tags_tag_name'] = df_tags['tags_tag_name'].str.replace('#', '')

# merge tag_questions with tags name
# then group all tags for each question into single rows
df_tags_question = df_tag_questions.merge(df_tags, how='inner',left_on='tag_questions_tag_id', right_on='tags_tag_id')
df_tags_question = df_tags_question.groupby(
    ['tag_questions_question_id'])['tags_tag_name'].apply(
        ','.join).reset_index()
df_tags_question = df_tags_question.rename(columns={'tags_tag_name': 'questions_tag_name'})

df_tags_question.head()

Unnamed: 0,tag_questions_question_id,questions_tag_name
0,0003e7bf48f24b5c985f8fce96e611f3,"internship,technology,high-school,information-..."
1,0006609dd4da40dcaa5a83e0499aba14,"psychology,law"
2,000af224bc2f4e94a19f8b62ba279cc4,"biology,marine"
3,000b30fb534b41f7b716fa9ebf9c3f35,"teaching,exercise-science,school,exercise"
4,0018752e44b44e26bb74a0a43232b4d6,"math,puremathematics"


In [306]:
# merge tag_users with tags name 
# then group all tags for each user into single rows 
# after that rename the tag column name 
df_tags_pro = df_tag_users.merge(
    df_tags, how='inner',
    left_on='tag_users_tag_id', right_on='tags_tag_id')
df_tags_pro = df_tags_pro.groupby(
    ['tag_users_user_id'])['tags_tag_name'].apply(
        ','.join).reset_index()
df_tags_pro = df_tags_pro.rename(columns={'tags_tag_name': 'professionals_tag_name'})


In [307]:
df_tags_pro.head()

Unnamed: 0,tag_users_user_id,professionals_tag_name
0,00009a0f9bda43eba47104e9ac62aff5,"digital-media,script-writing,content-creation"
1,000196ef8db54b9a86ae70ad31745d04,accounting
2,0008138be908438e8944b21f7f57f2c1,real-estate
3,000d4635e5da41e3bfd83677ee11dda4,"college,university,information-technology"
4,000e2b5714444d79a672bf927905135c,financial-services


In [287]:
df_professionals.head()

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num
0,9ced4ce7519049c0944147afb75a8ce3,,,,2011-10-05 20:35:19+00:00,0
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,,,,2011-10-05 20:49:21+00:00,1
2,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",,,2011-10-18 17:31:26+00:00,2
3,977428d851b24183b223be0eb8619a8c,"Boston, Massachusetts",,,2011-11-09 20:39:29+00:00,3
4,e2d57e5041a44f489288397c9904c2b2,,,,2011-12-10 22:14:44+00:00,4


In [308]:
# merge professionals and questions tags with main merge_dataset 
df_questions = df_questions.merge(
    df_tags_question, how='left',
    left_on='questions_id', right_on='tag_questions_question_id')
df_professionals = df_professionals.merge(
    df_tags_pro, how='left',
    left_on='professionals_id', right_on='tag_users_user_id')
df_professionals.head()

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name
0,9ced4ce7519049c0944147afb75a8ce3,,,,2011-10-05 20:35:19+00:00,0,,
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,,,,2011-10-05 20:49:21+00:00,1,,
2,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",,,2011-10-18 17:31:26+00:00,2,0c673e046d824ec0ad0ebe012a0673e4,"consulting,consulting,consulting,consulting,co..."
3,977428d851b24183b223be0eb8619a8c,"Boston, Massachusetts",,,2011-11-09 20:39:29+00:00,3,,
4,e2d57e5041a44f489288397c9904c2b2,,,,2011-12-10 22:14:44+00:00,4,,


In [321]:
# merge answers with questions 
# then merge professionals and questions score with that 
df_merge = df_answers.merge(
    df_questions, how='inner',
    left_on='answers_question_id', right_on='questions_id')
df_merge = df_merge.merge(
    df_professionals, how='inner',
    left_on='answers_author_id', right_on='professionals_id')
df_merge.head()


Unnamed: 0,answers_id,answers_author_id,answers_question_id,answers_date_added,answers_body,answers_id_num,questions_id,questions_author_id,questions_date_added,questions_title,questions_body,questions_id_num,tag_questions_question_id,questions_tag_name,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54,2016-04-29 19:40:14+00:00,<p>Hi!</p>\n<p>You are asking a very interesti...,0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26+00:00,Teacher career question,What is a maths teacher? what is a ma...,0,332a511f1569444485cf7a7a556a5e54,"lecture,college,professor",36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e..."
1,f3519ab99a1a4a13a8a9ecb814287d2a,36ff3b3666df400f956f8335cf53e09e,0f1d6a4f276c4a05878dd48e03e52289,2016-07-31 15:35:54+00:00,<p>Hi Rodrigo!</p>\n<p>The important thing to ...,11,0f1d6a4f276c4a05878dd48e03e52289,585ac233015447cc9e9a217044e515e1,2016-05-19 22:16:25+00:00,what kind of college could i go to for a soc...,I like soccer because i been playing sense i w...,7,0f1d6a4f276c4a05878dd48e03e52289,"college,building,soccer",36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e..."
2,825f6e316a5f48328d6f8af831df9940,36ff3b3666df400f956f8335cf53e09e,0149c6d63e214040b44d4a3789bb00ba,2018-04-15 23:08:46+00:00,<p>Congratulations on being interested in find...,71,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,"I am interested in Computational Neuroscience,...",33,0149c6d63e214040b44d4a3789bb00ba,"engineering,neuroscience,gradschool",36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e..."
3,f3fc23809cda472780fc565334f35000,36ff3b3666df400f956f8335cf53e09e,acc611cfb5c44daa8a3d7d65dfffa5ff,2018-08-14 10:37:01+00:00,<p>The most important thing that you can do is...,102,acc611cfb5c44daa8a3d7d65dfffa5ff,5b751a8ee4a047f7a08ce9eb5e43e5a2,2018-08-14 04:49:33+00:00,How should I prepare myself for my job search ...,"I am a Sociology, Political Science, and Inter...",47,acc611cfb5c44daa8a3d7d65dfffa5ff,"job-search,career-choice,job,college-jobs",36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e..."
4,7410001e9a2f46df982cba38ba60486a,36ff3b3666df400f956f8335cf53e09e,b44409c0c83a43578f4f2d0a0e4d8ead,2016-12-08 02:26:05+00:00,<p>Hi Alexandra!</p>\n<p>Here is the site for ...,123,b44409c0c83a43578f4f2d0a0e4d8ead,2fe767de78fa4dfd83f0021cf7712064,2016-12-07 20:43:41+00:00,What majors do you recommend in order to be an...,I am currently enrolled in a World Musics clas...,54,b44409c0c83a43578f4f2d0a0e4d8ead,"ethnomusicology,musicology,world-music,music,w...",36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e..."


In [323]:
#######################
# Generate some features for calculates weights
# that will use with interaction matrix 
#######################
#gettig count of how many question has a unit profession has answered 
#df_merge['num_of_ans_by_professional'] = df_merge.groupby(['answers_author_id'])['questions_id'].transform('count')
df_merge['num_ans_per_ques'] = df_merge.groupby(['questions_id'])['answers_id'].transform('count')
#df_merge['num_tags_professional'] = df_merge['professionals_tag_name'].str.split(",").str.len()
df_merge['num_tags_question'] = df_merge['questions_tag_name'].str.split(",").str.len()
df_merge['num_tags_question'] = df_merge['num_tags_question'].fillna(0)
df_merge['num_tags_question']
df_merge.head()

Unnamed: 0,answers_id,answers_author_id,answers_question_id,answers_date_added,answers_body,answers_id_num,questions_id,questions_author_id,questions_date_added,questions_title,questions_body,questions_id_num,tag_questions_question_id,questions_tag_name,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name,num_ans_per_ques,num_tags_question
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54,2016-04-29 19:40:14+00:00,<p>Hi!</p>\n<p>You are asking a very interesti...,0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26+00:00,Teacher career question,What is a maths teacher? what is a ma...,0,332a511f1569444485cf7a7a556a5e54,"lecture,college,professor",36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",1,3.0
1,f3519ab99a1a4a13a8a9ecb814287d2a,36ff3b3666df400f956f8335cf53e09e,0f1d6a4f276c4a05878dd48e03e52289,2016-07-31 15:35:54+00:00,<p>Hi Rodrigo!</p>\n<p>The important thing to ...,11,0f1d6a4f276c4a05878dd48e03e52289,585ac233015447cc9e9a217044e515e1,2016-05-19 22:16:25+00:00,what kind of college could i go to for a soc...,I like soccer because i been playing sense i w...,7,0f1d6a4f276c4a05878dd48e03e52289,"college,building,soccer",36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",1,3.0
2,825f6e316a5f48328d6f8af831df9940,36ff3b3666df400f956f8335cf53e09e,0149c6d63e214040b44d4a3789bb00ba,2018-04-15 23:08:46+00:00,<p>Congratulations on being interested in find...,71,0149c6d63e214040b44d4a3789bb00ba,34217a1861d640a58c85e033414cf9cb,2018-04-12 17:13:45+00:00,What is the best way to prepare for studying e...,"I am interested in Computational Neuroscience,...",33,0149c6d63e214040b44d4a3789bb00ba,"engineering,neuroscience,gradschool",36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",2,3.0
3,f3fc23809cda472780fc565334f35000,36ff3b3666df400f956f8335cf53e09e,acc611cfb5c44daa8a3d7d65dfffa5ff,2018-08-14 10:37:01+00:00,<p>The most important thing that you can do is...,102,acc611cfb5c44daa8a3d7d65dfffa5ff,5b751a8ee4a047f7a08ce9eb5e43e5a2,2018-08-14 04:49:33+00:00,How should I prepare myself for my job search ...,"I am a Sociology, Political Science, and Inter...",47,acc611cfb5c44daa8a3d7d65dfffa5ff,"job-search,career-choice,job,college-jobs",36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",1,4.0
4,7410001e9a2f46df982cba38ba60486a,36ff3b3666df400f956f8335cf53e09e,b44409c0c83a43578f4f2d0a0e4d8ead,2016-12-08 02:26:05+00:00,<p>Hi Alexandra!</p>\n<p>Here is the site for ...,123,b44409c0c83a43578f4f2d0a0e4d8ead,2fe767de78fa4dfd83f0021cf7712064,2016-12-07 20:43:41+00:00,What majors do you recommend in order to be an...,I am currently enrolled in a World Musics clas...,54,b44409c0c83a43578f4f2d0a0e4d8ead,"ethnomusicology,musicology,world-music,music,w...",36ff3b3666df400f956f8335cf53e09e,"Cleveland, Ohio",Mental Health Care,Assist with Recognizing and Developing Potential,2015-10-19 20:56:49+00:00,2410,36ff3b3666df400f956f8335cf53e09e,"engineering,computer-science,science,college,e...",2,6.0


In [325]:
print("Maximum # of answer per question : " + str(df_merge['num_ans_per_ques'].max()))
print("Maximum # of tag per questions : " + str(df_merge['num_tags_question'].max()))

Maximum # of answer per question : 58
Maximum # of tag per questions : 54.0


In [326]:
# select professionals answered questions tags 
# and stored as a dataframe
professionals_prev_ans_tags = df_merge[['professionals_id', 'questions_tag_name']]
professionals_prev_ans_tags.head()

Unnamed: 0,professionals_id,questions_tag_name
0,36ff3b3666df400f956f8335cf53e09e,"lecture,college,professor"
1,36ff3b3666df400f956f8335cf53e09e,"college,building,soccer"
2,36ff3b3666df400f956f8335cf53e09e,"engineering,neuroscience,gradschool"
3,36ff3b3666df400f956f8335cf53e09e,"job-search,career-choice,job,college-jobs"
4,36ff3b3666df400f956f8335cf53e09e,"ethnomusicology,musicology,world-music,music,w..."


In [327]:
# drop null values from that 
professionals_prev_ans_tags = professionals_prev_ans_tags.dropna()
# because professsionals answers multiple questions, 
# we group all of tags of each user into single row 
professionals_prev_ans_tags = professionals_prev_ans_tags.groupby(
    ['professionals_id'])['questions_tag_name'].apply(
        ','.join).reset_index()
professionals_prev_ans_tags.head()

Unnamed: 0,professionals_id,questions_tag_name
0,00009a0f9bda43eba47104e9ac62aff5,"digital-media,photography,engineer,military,na..."
1,000d4635e5da41e3bfd83677ee11dda4,"students,college,gap-year,college,career,unive..."
2,00271cc10e0245fba4a35e76e669c281,"animation,entrepreneurship,animation,art,anima..."
3,003cc21be89d4e42bc4424131a378e86,"lawyer,legal,government,criminal-justice,law,c..."
4,0046ab8089c04b3a8df3f8c28621a818,"student,teacher,college,business,international..."


In [328]:
# drop duplicates tags from each professionals rows
professionals_prev_ans_tags['questions_tag_name'] = (
    professionals_prev_ans_tags['questions_tag_name'].str.split(',').apply(set).str.join(','))

In [329]:
# finally merge the dataframe with professionals dataframe 
df_professionals = df_professionals.merge(professionals_prev_ans_tags, how='left', on='professionals_id')
df_professionals.head()

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name,questions_tag_name
0,9ced4ce7519049c0944147afb75a8ce3,,,,2011-10-05 20:35:19+00:00,0,,,"resume,consulting"
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,,,,2011-10-05 20:49:21+00:00,1,,,
2,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",,,2011-10-18 17:31:26+00:00,2,0c673e046d824ec0ad0ebe012a0673e4,"consulting,consulting,consulting,consulting,co...","steps,bain,baby-sitting,sports,baby,employers,..."
3,977428d851b24183b223be0eb8619a8c,"Boston, Massachusetts",,,2011-11-09 20:39:29+00:00,3,,,"math,motivation,school,business,college-majors..."
4,e2d57e5041a44f489288397c9904c2b2,,,,2011-12-10 22:14:44+00:00,4,,,


In [330]:
# join professionals tags and their answered tags 
# we replace nan values with ""
df_professionals['professional_all_tags'] = (df_professionals[['professionals_tag_name', 'questions_tag_name']].apply(lambda x: ','.join(x.dropna()),axis=1))

In [None]:
df_professionals.head()

In [333]:
df_professionals.head()

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined,professionals_id_num,tag_users_user_id,professionals_tag_name,questions_tag_name,professional_all_tags
0,9ced4ce7519049c0944147afb75a8ce3,,,,2011-10-05 20:35:19+00:00,0,,,"resume,consulting","resume,consulting"
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,,,,2011-10-05 20:49:21+00:00,1,,,,
2,0c673e046d824ec0ad0ebe012a0673e4,"New York, New York",,,2011-10-18 17:31:26+00:00,2,0c673e046d824ec0ad0ebe012a0673e4,"consulting,consulting,consulting,consulting,co...","steps,bain,baby-sitting,sports,baby,employers,...","consulting,consulting,consulting,consulting,co..."
3,977428d851b24183b223be0eb8619a8c,"Boston, Massachusetts",,,2011-11-09 20:39:29+00:00,3,,,"math,motivation,school,business,college-majors...","math,motivation,school,business,college-majors..."
4,e2d57e5041a44f489288397c9904c2b2,,,,2011-12-10 22:14:44+00:00,4,,,,


In [335]:
#Cleaning Values 
#df_questions['score'] = df_questions['score'].fillna(0)
#df_questions['score'] = df_questions['score'].astype(int)
df_questions['questions_tag_name'] = df_questions['questions_tag_name'].fillna('No Tag')
# remove duplicates tags from each questions 
df_questions['questions_tag_name'] = df_questions['questions_tag_name'].str.split(',').apply(set).str.join(',')

# fill nan with 'No Tag' if any 
df_professionals['professional_all_tags'] = df_professionals['professional_all_tags'].fillna('No Tag')
# replace "" with "No Tag", because previously we replace nan with ""
df_professionals['professional_all_tags'] = df_professionals['professional_all_tags'].replace('', 'No Tag')
df_professionals['professionals_location'] = df_professionals['professionals_location'].fillna('No Location')
df_professionals['professionals_industry'] = df_professionals['professionals_industry'].fillna('No Industry')

# remove duplicates tags from each professionals 
df_professionals['professional_all_tags'] = df_professionals['professional_all_tags'].str.split(',').apply(set).str.join(',')

# remove some null values from df_merge
df_merge['num_ans_per_ques']  = df_merge['num_ans_per_ques'].fillna(0)
#df_merge['num_tags_professional'] = df_merge['num_tags_professional'].fillna(0)
df_merge['num_tags_question'] = df_merge['num_tags_question'].fillna(0)

In [336]:
#Building the lightFM model

# generating features list for mapping 
question_feature_list = generate_feature_list(
    df_questions,
    ['questions_tag_name'])

In [337]:
professional_feature_list = generate_feature_list(
    df_professionals,
    ['professional_all_tags'])
professional_feature_list

0                                            resume
1                                        consulting
2                                            No Tag
3                                             steps
4                                              bain
5                                      baby-sitting
6                                              baby
7                                            sports
8                                         employers
9                                          mckinsey
10                                    career-choice
11                                       automotive
12                                 anesthesiologist
13                                            nurse
14                                          daycare
15                                         business
16                                     career-paths
17                                     compensation
18                                          science
19          

In [363]:
# calculate our weight value 
df_merge['total_weights'] = 1 / (
    df_merge['num_tags_question'])


# creating features for feeding into lightfm 
df_questions['question_features'] = create_features(
    df_questions, ['questions_tag_name'], 
    'questions_id_num')

df_professionals['professional_features'] = create_features(
    df_professionals,
    ['professional_all_tags'],
    'professionals_id_num')

In [364]:
df_merge['total_weights']

0        0.333333
1        0.333333
2        0.333333
3        0.250000
4        0.166667
5        0.125000
6        0.500000
7        0.250000
8        0.250000
9        0.250000
10       0.500000
11       0.333333
12       0.200000
13            inf
14       0.500000
15       1.000000
16       0.333333
17       1.000000
18       0.250000
19       0.333333
20       0.250000
21       0.333333
22       1.000000
23       0.333333
24       0.500000
25       0.250000
26       0.500000
27       1.000000
28       1.000000
29       1.000000
           ...   
50076    0.125000
50077    0.200000
50078    0.500000
50079    0.500000
50080    0.500000
50081    0.500000
50082         inf
50083         inf
50084         inf
50085         inf
50086    0.333333
50087    0.333333
50088    0.100000
50089    0.200000
50090    0.166667
50091    0.125000
50092    0.166667
50093    0.166667
50094    0.166667
50095    0.333333
50096    0.500000
50097    0.250000
50098         inf
50099    1.000000
50100    0

In [365]:
df_merge['total_weights']=df_merge['total_weights'].replace(np.inf, 0)

In [366]:
df_merge['total_weights']

0        0.333333
1        0.333333
2        0.333333
3        0.250000
4        0.166667
5        0.125000
6        0.500000
7        0.250000
8        0.250000
9        0.250000
10       0.500000
11       0.333333
12       0.200000
13       0.000000
14       0.500000
15       1.000000
16       0.333333
17       1.000000
18       0.250000
19       0.333333
20       0.250000
21       0.333333
22       1.000000
23       0.333333
24       0.500000
25       0.250000
26       0.500000
27       1.000000
28       1.000000
29       1.000000
           ...   
50076    0.125000
50077    0.200000
50078    0.500000
50079    0.500000
50080    0.500000
50081    0.500000
50082    0.000000
50083    0.000000
50084    0.000000
50085    0.000000
50086    0.333333
50087    0.333333
50088    0.100000
50089    0.200000
50090    0.166667
50091    0.125000
50092    0.166667
50093    0.166667
50094    0.166667
50095    0.333333
50096    0.500000
50097    0.250000
50098    0.000000
50099    1.000000
50100    0

In [367]:
########################
# Dataset building for lightfm
########################

# define our dataset variable
# then we feed unique professionals and questions ids
# and item and professional feature list
# this will create lightfm internel mapping
from lightfm.data import Dataset
dataset=Dataset()
dataset.fit(set(df_professionals['professionals_id_num']), set(df_questions['questions_id_num']),item_features=question_feature_list, user_features=professional_feature_list)


# now we are building interactions matrix between professionals and quesitons
# we are passing professional and questions id as a tuple
# e.g -> pd.Series((pro_id, question_id), (pro_id, questin_id))
# then we use lightfm build in method for building interactions matrix
df_merge['author_question_id_tuple'] = list(zip(
    df_merge.professionals_id_num, df_merge.questions_id_num, df_merge.total_weights))

interactions, weights = dataset.build_interactions(
    df_merge['author_question_id_tuple'])



# now we are building our questions and professionals features
# in a way that lightfm understand.
# we are using lightfm build in method for building
# questions and professionals features 
questions_features = dataset.build_item_features(
    df_questions['question_features'])

professional_features = dataset.build_user_features(
    df_professionals['professional_features'])

In [None]:
#sparse matrix to store features when it has value 
questions_features

In [None]:
df_questions['question_features']

In [369]:
from lightfm import LightFM

In [370]:
model = LightFM(
    no_components=150,
    learning_rate=0.05,
    loss='warp',
    random_state=2019)

model.fit(
    interactions,
    item_features=questions_features,
    user_features=professional_features, sample_weight=weights,
    epochs=5, num_threads=4, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4


<lightfm.lightfm.LightFM at 0x7fe4e29590f0>

In [375]:

from lightfm.evaluation import auc_score

In [376]:
calculate_auc_score(model, interactions, questions_features, professional_features)

0.89395785

In [None]:
df_merge[:3]

In [373]:
from IPython.display import display_html
def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

def recommend_questions(professional_ids):
     
    for professional in professional_ids:
        # print their previous answered question title
        previous_q_id_num = df_merge.loc[df_merge['professionals_id_num'] == professional][:3]['questions_id_num']
        df_previous_questions = df_questions.loc[df_questions['questions_id_num'].isin(previous_q_id_num)]
        print('Professional Id (' + str(professional) + "): Previous Answered Questions")
        display_side_by_side(
            df_previous_questions[['questions_title', 'question_features']],
            df_professionals.loc[df_professionals.professionals_id_num == professional][['professionals_id_num','professionals_tag_name']])
        
        # predict
        discard_qu_id = df_previous_questions['questions_id_num'].values.tolist()
        df_use_for_prediction = df_questions.loc[~df_questions['questions_id_num'].isin(discard_qu_id)]
        questions_id_for_predict = df_use_for_prediction['questions_id_num'].values.tolist()
        
        scores = model.predict(
            professional,
            questions_id_for_predict,
            item_features=questions_features,
            user_features=professional_features)
        
        df_use_for_prediction['scores'] = scores
        df_use_for_prediction = df_use_for_prediction.sort_values(by='scores', ascending=False)[:8]
        print('Professional Id (' + str(professional) + "): Recommended Questions: ")
        display(df_use_for_prediction)
    

    

In [374]:
recommend_questions([1200])

Professional Id (1200): Previous Answered Questions


Unnamed: 0,questions_title,question_features

Unnamed: 0,professionals_id_num,professionals_tag_name
1200,1200,"marketing,strategy,entrepreneurship,management..."


Professional Id (1200): Recommended Questions: 


Unnamed: 0,questions_id,questions_author_id,questions_date_added,questions_title,questions_body,questions_id_num,tag_questions_question_id,questions_tag_name,question_features,scores
4355,8dcd62484c41457d8c7cefd4c79dd535,10716d4309be422099010c77bd068793,2016-05-26 02:25:46+00:00,Sales representative,What are some techniques which you should lear...,4355,8dcd62484c41457d8c7cefd4c79dd535,sales,"(4355, [sales])",0.278469
14595,cba34ffb1b764ddcb0358bc719976a2f,5e30acf2b1df483790a3106ac9fe6962,2016-05-14 14:10:05+00:00,If you are not a marketing major but have a pa...,I am going to have a degree in Music Business ...,14595,cba34ffb1b764ddcb0358bc719976a2f,marketing,"(14595, [marketing])",-0.067829
10108,86e7e9ee9a0644179e0aafaf3c029085,256d071812804d5eaa799e704022ea3a,2014-02-10 15:47:31+00:00,What do I need to qualify for a customer servi...,I think there are some jobs in my area where I...,10108,86e7e9ee9a0644179e0aafaf3c029085,"support,customer-service","(10108, [support, customer-service])",-0.171781
9354,9fbba4ac02f2441caf71a6159a59b33b,867e779f9dad4020925e986151ca4be4,2017-05-18 13:21:49+00:00,How can you improve your Public speaking?,Had a public speaking class but still not conf...,9354,9fbba4ac02f2441caf71a6159a59b33b,public-speaking,"(9354, [public-speaking])",-0.258482
3484,51c9cd85050a45878e97b762c791426a,4e533ef0f23542b090a781b2391ffc57,2014-11-04 23:58:43+00:00,What is the best way to get into marketing?,I am interested in marketing #marketing,3484,51c9cd85050a45878e97b762c791426a,marketing,"(3484, [marketing])",-0.302169
3689,826ea84b99f947299740ae07d3a4667f,df2fa237ce144135bc0e664abe4f6dac,2012-03-26 12:57:50+00:00,sales info!,What is a successful way to become a great sal...,3689,826ea84b99f947299740ae07d3a4667f,"sales,business","(3689, [sales, business])",-0.313557
22478,eaa6ed701a7c4723adfc7ae61858d7d6,a4da6ee789704b31a4e8cd000aeb8c2d,2015-09-03 18:01:28+00:00,whats the best marketing school?,i wanna be in marketing #web-marketing,22478,eaa6ed701a7c4723adfc7ae61858d7d6,web-marketing,"(22478, [web-marketing])",-0.315439
6102,ed06ef6e57194d53b5730cbdab744502,5b030dc181f744a5be10343b111cfe66,2016-05-14 00:50:30+00:00,What do you recommend to get involved with in ...,Nothing excites me more than going into a fiel...,6102,ed06ef6e57194d53b5730cbdab744502,marketing,"(6102, [marketing])",-0.369536
