In [18]:
import json
import pandas as pd
import numpy as np
import string
import os
from sklearn.metrics.pairwise import cosine_similarity

## Load Data

#### Loading Question-Answer data

In [19]:
filename = "D:\\WebScience\\web_science_dataset.jsonl"
json_data = []
json_data_list = []


with open(filename) as f:
    json_data = f.readlines()
    
for item in json_data:
    json_data_list.append(json.loads(item))
    
df = pd.DataFrame(json_data_list)
df.head()

question_df = pd.DataFrame(df[['question','questionId','category','categoryId']])
question_df.head()

Unnamed: 0,question,questionId,category,categoryId
0,Can headbanging cause brain damage?,14138,medical-science,2
1,Does the Shangri-La diet work (according to it...,10103,nutrition,0
2,"Can phobias be genetic, but created in one gen...",18713,psychology,4
3,Do 40% of U.S. Americans think that global war...,36010,climate-change,1
4,Does boiling the same water twice make it dang...,11118,nutrition,0


In [20]:
train_file = "recommender_train.tsv"
test_file = "recommender_test.tsv"
train = pd.read_csv(train_file,delimiter="\t")
test =  pd.read_csv(test_file,delimiter="\t")

In [21]:
train.head()
train

Unnamed: 0,userID,questionID,rating
0,5205,17488,2
1,5205,8080,2
2,5205,36393,2
3,5205,44399,2
4,5205,44167,2
...,...,...,...
2339,5432,8054,1
2340,5432,17594,2
2341,5432,22562,2
2342,5432,22250,2


In [22]:
test.head()
test.dtypes

userID         int64
questionID     int64
recommend     object
dtype: object

#### Loading Users DF dictionary

In [23]:
question_df = question_df.astype({'questionId': 'int64'})
category_map = dict(zip(question_df['category'],question_df['categoryId']))
ques_cat_map = dict(zip(question_df['questionId'],question_df['categoryId']))
questext_id_map = dict(zip(question_df['questionId'],question_df['question']))

# loading data 
folder_path = "C:\\Users\\Reen\\Desktop\\web science\\crowdsourced_data\\submissions_fixed_anonymized"

user_df_dict = {}

for root,dirs,files in os.walk(folder_path):
    for file in files:
        filename = folder_path+"\\"+file
        df = pd.read_csv(filename)
        df = df.drop(['Question','Answer URL','Answer Label','Answer Quality','Factual'],axis=1)
        
        #add category id column
        df['categoryId'] = df['questionId'].map(ques_cat_map)
        #df = df.dropna(axis=0)
    
        #add category column
        category_map_rev = {v:k for k,v in category_map.items()}
        df['category'] = df['categoryId'].map(category_map_rev)
        
        #add clean question column
        df['question'] = df['questionId'].map(questext_id_map)
        
        key = file.replace('.csv','')
        key = int(key.replace('WS',''))
        
        user_df_dict[key] = df

### Preprocessing

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

In [25]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return (' '.join(lemmatizer.lemmatize(w) for w in word_tokenize(text)))

In [26]:
def preprocess_text(df):    
    
    # convert to lowercase
    df = df.str.lower()
    
    # remove punctuation
    translator = str.maketrans('', '', string.punctuation) 
    df = df.str.translate(translator)
    
    # remove digits
    translator = str.maketrans('', '', string.digits) 
    df = df.str.translate(translator)
    
    # remove leading/trailing whitespaces
    df = df.str.strip()
    
    #lemmatize
    df = df.apply(lemmatize_text)
    
    return df

In [27]:
# add preprocessed column to main dataframe
def get_preprocessed_df(df,qid_rowid_map):
    
    preprocessed_data = preprocess_text(df['question'])
    preprocessed_data = preprocessed_data.rename('preprocessed question')
    
    resulting_df = pd.concat([df,preprocessed_data],axis=1)    
    
    resulting_df['row QID'] = resulting_df['questionId'].map(qid_rowid_map)
    #resulting_df.head()
    
    return resulting_df

#### Load data for recommender systems

In [28]:
# returns subset of dataframe of a particular category
def get_category_question_df(category_id,df):
    
    ques_category_df = df[df['categoryId']==category_id]
    return ques_category_df

## Word2Vec

In [29]:
from gensim.models import KeyedVectors
from gensim.utils import tokenize
from gensim.parsing.preprocessing import remove_stopwords

In [30]:
EMBEDDING_FILE = "C:\\Users\\Reen\\Desktop\\web science\\WordEmbeddings\\GoogleNews-vectors-negative300.bin.gz"
word_to_vec_model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True,limit=50000)

In [31]:
def get_mean_vector(word2vec_model, words):
    
    # remove out-of-vocabulary words
    words_ = [word for word in words if word in word2vec_model.vocab]
    if len(words_) >= 1:
        return np.mean(word2vec_model[words_], axis=0)

In [44]:
def get_word2vec_matrix(k,df):
    vectors = []
    for item in question_df['question'].values:
        item = (remove_stopwords(item))
        vector = (get_mean_vector(word_to_vec_model, item.split(" ")))
        if(vector is not None):
            vectors.append(vector)
        else:
            vectors.append(np.zeros(300))
    
    vectors_ = np.array(vectors)
    return vectors_

## TFIDF

In [45]:
def get_topk_vocab_tfidf(k,df):
    
    vectorizer = TfidfVectorizer(stop_words="english") #max_features=k
    response = vectorizer.fit_transform(df['preprocessed question'])
    response = response.todense()
    #response.shape
    
    features = np.array(vectorizer.get_feature_names())
    sorted_ = np.argsort(response).flatten()[::-1]
    
    sorted_indices = np.asarray((sorted_[0,:k]))[0]
        
    top_k_tfidf_matrix = np.array([top_k_tfidf_row_vector(response,row_idx,sorted_indices) for row_idx in range(len(df))])
   
    return top_k_tfidf_matrix

In [46]:
def top_k_tfidf_row_vector(tfidf_matrix,row_idx,top_k_indices):
    row_vec = np.asarray(tfidf_matrix[row_idx])[0]
    #modified_row_vec = [row_vec[i] if i in top_k_indices else 0 for i in range(len(row_vec))]
    #return modified_row_vec
    return row_vec

# User Profile Vector Computation

In [47]:
def compute_user_profile_vectors(category_ids,train_liked,user_df_dict,top_k_tfidf_matrix,qid_rowid_map):
    
    """
    category_ids: list
    train_liked: df containing only liked items
    user_df_dict: dictionary of all user dataframes
    """
    
    user_profile_vectors = []
    users = []
    
    for user in user_df_dict.keys():
        
        liked_qids = set((train_liked.loc[(train_liked['userID']==user)])['questionID'])
        user_df = user_df_dict[user]
        
        cat_liked_df = user_df.loc[(user_df['categoryId'].isin(category_ids)) & (user_df['questionId'].isin(liked_qids))]
        
        if(len(cat_liked_df)!=0):
            
            #print(len(cat_liked_df))
            
            qids = list(set(cat_liked_df['questionId']))
            row_ids = [qid_rowid_map[qid] for qid in qids]
            
            user_profile_vector = np.average(top_k_tfidf_matrix[row_ids,:],axis=0)
            #print(user_profile_vector.shape)
            
            user_profile_vectors.append(user_profile_vector)
            users.append(user)
    
    
    userid_row_map = dict(zip(users,list(range(0,len(users)))))
    profile_vector_mat = np.array(user_profile_vectors)
    
    return userid_row_map,profile_vector_mat
        

# Cosine Similarity

In [48]:
from sklearn.metrics.pairwise import cosine_similarity

In [49]:
def compute_cosine_sim(profile_vectors):
    return cosine_similarity(profile_vectors)

### User specific liked/not liked items

In [50]:
def get_category_specific_user_rated_items(category_ids,df,user_df_dict):
    
    """
    df: either the df with recommend = yes or recommend = no
    """
    
    user_rated_dict = {}
    
    for user in user_df_dict.keys():
        
        #get user rated qids from liked df
        qids = set((df.loc[(df['userID']==user)])['questionID'])
        user_df = user_df_dict[user]
        
        #filter quids according to category
        cat_df = user_df.loc[(user_df['categoryId'].isin(category_ids)) & (user_df['questionId'].isin(qids))]
        #print(cat_df.head())
        
        if(len(cat_df)!=0):
            user_rated_dict[user] = list(set(cat_df['questionId']))
    
    return user_rated_dict
        
    

# Ranking

In [51]:
def top_k_similar_users(k,cosine_matrix,userid_row_map):
    
    """ top_k_similar_users = {userid:{similar_users}}, similar_users={friend_id:similarity_score}
    """
    
    top_k_friends = {}
    user_map = {v: k for k, v in userid_row_map.items()}
    
    for userid in userid_row_map.keys():
        
        row_id = userid_row_map[userid]
        
        user_vec = cosine_matrix[row_id]
            
        sorted_indices = (np.argsort(user_vec)[::-1][:k+1])[1:]
        top_k_friends_scores = [user_vec[i] for i in sorted_indices]

        
        top_k_friends_ids = [user_map[i] for i in sorted_indices]

        top_k_friends[userid] = top_k_friends_ids

    return top_k_friends

# Recommendations

In [52]:
def get_category_specific_user_recommendations(top_k_friends,train_user_liked_items):
    
    """return {userid: recommendations_from_top_k} where recommendations_from_top_k is {friendid:recommended question ids}
    """
        
    userids = top_k_friends.keys()
    
    top_k_friend_recommendations = {}
    
    for userid in userids:
        
        # top k friends
        top_k_friends_current = top_k_friends[userid]
        
        recommended = {}
        
        # find items liked by each friend
        for friend in top_k_friends_current:
        
            if(friend in train_user_liked_items.keys()):
                recommended[friend] = train_user_liked_items[friend]
                
        if(len(recommended)):  
            top_k_friend_recommendations[userid] = recommended 
    
    return top_k_friend_recommendations 
        

In [53]:
def compute_accuracy(top_k_friend_recommendations,true_liked_items,true_not_liked_items):
    
    test_userids = true_liked_items.keys()
    acc = 0
    
    total_acc_list = []
    precision_list = []
    recall_list = []
    f1_list = []
    
    final_prec = final_recall = final_f1 = 0
    
    for userid in test_userids:
        
        true_items_l = true_liked_items[userid]
        
        true_items_nl = []
        if(userid) in true_not_liked_items.keys():
            true_items_nl = true_not_liked_items[userid]
        
        true_positives = 0
        true_negatives = 0
        false_positives = 0
        false_negatives = 0
        
        if(userid in top_k_friend_recommendations.keys()):
            
            
            recommended_items = top_k_friend_recommendations[userid].values()
            recommendations = [item for sublist in recommended_items for item in sublist]
            
            true_positives = float(len(set(recommendations).intersection(set(true_items_l))))
            false_negatives = len(true_items_l) - true_positives
            
            if(true_items_nl):
                false_positives = float(len(set(recommendations).intersection(set(true_items_nl))))
                true_negatives = len(true_items_nl) - false_positives
                
                
    
        user_acc = float((true_positives)+true_negatives) / float(len(true_items_l)+len(true_items_nl))
        total_acc_list.append(user_acc)
        
        
        user_prec = user_recall = 0
        
        if((true_positives)!=0 or (false_positives)!=0):
            user_prec = float(true_positives / (true_positives + false_positives))
            precision_list.append(user_prec)
        
        if((true_positives)!=0 or (true_positives)!=0):
            user_recall = float(true_positives / (true_positives + false_negatives))
            recall_list.append(user_recall)
        
        if(user_prec!=0 or user_recall!=0):
            user_f1 = (2 * user_prec * user_recall) / (user_prec + user_recall)
            f1_list.append(user_f1)

    final_acc = float(sum(total_acc_list)/len(total_acc_list))
    
    if(precision_list):
        final_prec = float(sum(precision_list)/len(precision_list))
    if(recall_list):
        final_recall = float(sum(recall_list)/len(recall_list))
    if(f1_list):
        final_f1 = float(sum(f1_list)/len(f1_list))
    
    return (final_acc,final_prec,final_recall,final_f1)

# Main

In [69]:
#defining some globals

train_liked = train.loc[train['rating']!=1]
test_liked = test.loc[test['recommend']=="Yes"]
test_not_liked = test.loc[test['recommend']=="No"]

k_friends=10
k = 5

In [70]:

if __name__ == "__main__":
    
    
    accuracy_all = []
    precision_all = []
    recall_all = []
    f1_all = []
    
    for i in range(5):

        """Train"""
        train_category_id = i

        #get category specific df
        cat_ques_df = get_category_question_df(train_category_id,question_df)
        qid_rowid_map = {v:k for k,v in enumerate(cat_ques_df['questionId'])}

        #preprocess df
        result_df = get_preprocessed_df(cat_ques_df,qid_rowid_map)

        #tfidf matrix for all questions in a particular category
        word2vec_matrix = get_word2vec_matrix(k,df)
        print(word2vec_matrix.shape)
        
        #get user profile vectors
        userid_row_map,profile_vector_mat = compute_user_profile_vectors([train_category_id],train_liked,user_df_dict,word2vec_matrix,qid_rowid_map)

        #calculate similarity 
        similarity_matrix = compute_cosine_sim(profile_vector_mat)


        #get top k friends
        top_k_friends = top_k_similar_users(k_friends,similarity_matrix,userid_row_map)


        accuracy_vals = []
        precision_vals = []
        recall_vals = []
        f1_vals = []   

        #Test

        for j in range(5):

            test_category_id = j

            #get category specific items liked by users in the training set
            train_user_liked_items = get_category_specific_user_rated_items([test_category_id],train_liked,user_df_dict)

            #get top k friends recommendations based on test category
            recommendations = get_category_specific_user_recommendations(top_k_friends,train_user_liked_items)

            #liked items of those unique users {userid: liked_item}
            test_user_liked_items = get_category_specific_user_rated_items([test_category_id],test_liked,user_df_dict)

            #not liked items of those unique users {userid: not_liked}
            test_not_liked_items = get_category_specific_user_rated_items([test_category_id],test_not_liked,user_df_dict)


            #get recommendations of top k friends for each user in unique users in test set and compute accuracy
            (accuracy,precision,recall,f1) = compute_accuracy(recommendations,test_user_liked_items,test_not_liked_items)
            
            accuracy_vals.append(accuracy)
            precision_vals.append(precision)
            recall_vals.append(recall)
            f1_vals.append(f1)
            
        accuracy_all.append(accuracy_vals)
        precision_all.append(precision_vals)
        recall_all.append(recall_vals)
        f1_all.append(f1_vals)          



(1066, 300)
(1066, 300)
(1066, 300)
(1066, 300)
(1066, 300)


In [71]:
np.array(accuracy_all)

array([[0.51815222, 0.51410256, 0.47645503, 0.56160714, 0.50590476],
       [0.56625175, 0.47820513, 0.48306878, 0.41121032, 0.433     ],
       [0.56335866, 0.44871795, 0.61640212, 0.42371032, 0.39990476],
       [0.50914281, 0.55897436, 0.48624339, 0.45019841, 0.46909524],
       [0.60843734, 0.58076923, 0.46772487, 0.43700397, 0.38890476]])

In [65]:
# No.of users liking questions in each category

train_liked = train.loc[train['rating']==3]
train_nopref = train.loc[train['rating']==2]
train_notliked = train.loc[train['rating']==1]

category_users_dict_liked = {}
category_users_dict_nopref = {}
category_users_dict_notliked = {}

for i in range(5):
    train_user_liked_items = get_category_specific_user_rated_items([i],train_liked,user_df_dict)
    category_users_dict_liked[i] = len(train_user_liked_items)
    
for i in range(5):
    train_user_nopref_items = get_category_specific_user_rated_items([i],train_nopref,user_df_dict)
    category_users_dict_nopref[i] = len(train_user_nopref_items)
    
for i in range(5):
    train_user_notliked_items = get_category_specific_user_rated_items([i],train_notliked,user_df_dict)
    category_users_dict_notliked[i] = len(train_user_notliked_items)



In [66]:
category_users_dict_notliked

{0: 26, 1: 22, 2: 22, 3: 25, 4: 25}

In [67]:
category_users_dict_nopref

{0: 29, 1: 28, 2: 28, 3: 29, 4: 29}

In [68]:
category_users_dict_liked

{0: 24, 1: 18, 2: 24, 3: 19, 4: 22}