In [None]:
'''
    ****************** HYBRID RECOMMENDATION SYSTEM  **********************
    
This notebook provides implementation of Hybrid Recommendation System for the problem stated in below link. Please refer below 
link to understand the problem statement in better manner.

https://datahack.analyticsvidhya.com/contest/practice-problem-recommendation-engine/

In this notebook -
A. Data is transformed using Feature engineering concepts.
B. Several algorithms in combination has been put up to build the model to predict the 'Attempted_range' for given user and problem id
    a. cosine_similarity
    b. SVD
    c. Baseline
    d. NMF
    e. KNNBaseline item-item similarity
    f. KNNBaseline user-user similarity
    
C. I was able to get the second best accuracy (F1-score) i.e. '0.5103025641' with using KNNBaseline user-user similarity and Cosine
    similarity algorithm in combination. refer below link for leaderboard.

https://datahack.analyticsvidhya.com/contest/practice-problem-recommendation-engine/lb

'''

In [14]:
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from statistics import mean
import math

from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from numpy import array

from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split

In [3]:
# Part-1  Load the Problem and Users data into dataframe

problems = pd.read_csv('analytics_vidhya_recommendataion_engine_data/problem_data.csv', sep=',', error_bad_lines=False, encoding="latin-1")
users = pd.read_csv('analytics_vidhya_recommendataion_engine_data/user_data.csv', sep=',', error_bad_lines=False, encoding="latin-1")

In [4]:
problems.head()

Unnamed: 0,problem_id,level_type,points,tags
0,prob_3649,H,,
1,prob_6191,A,,
2,prob_2020,F,,
3,prob_313,A,500.0,"greedy,implementation"
4,prob_101,A,500.0,"constructive algorithms,greedy,math"


In [5]:
users.head()

Unnamed: 0,user_id,submission_count,problem_solved,contribution,country,follower_count,last_online_time_seconds,max_rating,rating,rank,registration_time_seconds
0,user_3311,47,40,0,,4,1504111645,348.337,330.849,intermediate,1466686436
1,user_3028,63,52,0,India,17,1498998165,405.677,339.45,intermediate,1441893325
2,user_2268,226,203,-8,Egypt,24,1505566052,307.339,284.404,beginner,1454267603
3,user_480,611,490,1,Ukraine,94,1505257499,525.803,471.33,advanced,1350720417
4,user_650,504,479,12,Russia,4,1496613433,548.739,486.525,advanced,1395560498


In [6]:
'''
 ***********    Part-2A  - Feature Engineering ********************  

Problems dataframe contains 'tags' column which contains list of values ... to convert into continious value for model training.

A. Get the unique tags from 'Tags' column 
B. Create a new column for each unqiue tag value
C. Fill these column with either 0 or 1 based on that tag presence on that row

'''

problems['tags'] = problems['tags'].fillna('')

unique_tags = []
df_tags = problems['tags']

for tag in problems['tags']:
    tag = str(tag)
    if tag is not '':
        tag_splits = tag.split(',')
        for s in tag_splits:
            if s not in unique_tags:
                unique_tags.append(s)

def chk_row_index_list(row, tag):
    if tag in row['tags']:
        return 1
    else:
        return 0
        
    
for tag in unique_tags:
    problems[tag] = problems.apply(
                lambda x: chk_row_index_list(x, tag), axis=1)
    

# problems = problems.dropna(subset=['level_type'])
problems['level_type'] = problems['level_type'].fillna('A')
problems.points = problems.groupby('level_type')['points'].apply(lambda x: x.fillna(x.mean()))
problems.points = problems.points.fillna(0.0)
problems.drop(['tags'], axis=1, inplace=True)

users.drop(['last_online_time_seconds', 'registration_time_seconds', 'country'], axis=1, inplace=True)

In [7]:
'''
 ***********    Part-2B  - Feature Engineering ********************
 A. Use lable encoder for converting 'Rank' and 'Level_Type' categorical columnn values into continious
 
'''

rank_values = array(users['rank'])
le_rank = preprocessing.LabelEncoder()
users['rank'] = le_rank.fit_transform(users['rank'])
rank_classes = le_rank.classes_

le_level_type = preprocessing.LabelEncoder()
problems['level_type'] = le_level_type.fit_transform(problems['level_type'])
level_classes = le_level_type.classes_

In [8]:
train_submissions = pd.read_csv('analytics_vidhya_recommendataion_engine_data/train_submissions.csv', sep=',', error_bad_lines=False, encoding="latin-1")
tempdf = pd.merge(problems, train_submissions, on='problem_id', how='inner')
final_df = pd.merge(users, tempdf, on='user_id', how='inner')
final_df.head()

Unnamed: 0,user_id,submission_count,problem_solved,contribution,follower_count,max_rating,rating,rank,problem_id,level_type,...,*special,geometry,graph matchings,string suffix structures,fft,matrices,schedules,2-sat,chinese remainder theorem,attempts_range
0,user_3311,47,40,0,4,348.337,330.849,3,prob_75,0,...,0,0,0,0,0,0,0,0,0,1
1,user_3311,47,40,0,4,348.337,330.849,3,prob_3508,0,...,0,0,0,0,0,0,0,0,0,1
2,user_3311,47,40,0,4,348.337,330.849,3,prob_6362,3,...,0,0,0,0,0,0,0,0,0,2
3,user_3311,47,40,0,4,348.337,330.849,3,prob_1308,2,...,0,0,0,0,0,0,0,0,0,1
4,user_3311,47,40,0,4,348.337,330.849,3,prob_1481,1,...,0,0,0,0,0,0,0,0,0,1


In [9]:
'''
 ***********    Part-2C  - Feature Engineering ********************
 A. Use lable encoder for converting 'user_id' and 'problem_id' categorical columnn values into continious values
 
'''

le_user_id = preprocessing.LabelEncoder()
users['user_id'] = le_user_id.fit_transform(users['user_id'])
final_df['user_id'] = le_user_id.transform(final_df['user_id'])
train_submissions['user_id'] = le_user_id.transform(train_submissions['user_id'])
user_classes = le_user_id.classes_

le_problem_id = preprocessing.LabelEncoder()
problems['problem_id'] = le_problem_id.fit_transform(problems['problem_id'])
final_df['problem_id'] = le_problem_id.transform(final_df['problem_id'])
train_submissions['problem_id'] = le_problem_id.transform(train_submissions['problem_id'])
problem_classes = le_problem_id.classes_

print('after data transformation-')
final_df.head()

after data transformation-


Unnamed: 0,user_id,submission_count,problem_solved,contribution,follower_count,max_rating,rating,rank,problem_id,level_type,...,*special,geometry,graph matchings,string suffix structures,fft,matrices,schedules,2-sat,chinese remainder theorem,attempts_range
0,2570,47,40,0,4,348.337,330.849,3,6267,0,...,0,0,0,0,0,0,0,0,0,1
1,2570,47,40,0,4,348.337,330.849,3,2788,0,...,0,0,0,0,0,0,0,0,0,1
2,2570,47,40,0,4,348.337,330.849,3,5959,3,...,0,0,0,0,0,0,0,0,0,2
3,2570,47,40,0,4,348.337,330.849,3,344,2,...,0,0,0,0,0,0,0,0,0,1
4,2570,47,40,0,4,348.337,330.849,3,536,1,...,0,0,0,0,0,0,0,0,0,1


In [10]:
# Part 3- Get the cosine similarity matrix for Problem and User dataframe
problem_cosine_similarity = cosine_similarity(problems)
user_cosine_similarity = cosine_similarity(users)

problem_cosine_similarity

array([[1.        , 0.77813551, 0.91730815, ..., 0.973738  , 0.79231861,
        0.78707369],
       [0.77813551, 1.        , 0.46369623, ..., 0.90069909, 0.99973759,
        0.22500447],
       [0.91730815, 0.46369623, 1.        , ..., 0.80256495, 0.48386933,
        0.96760873],
       ...,
       [0.973738  , 0.90069909, 0.80256495, ..., 1.        , 0.9104142 ,
        0.62596437],
       [0.79231861, 0.99973759, 0.48386933, ..., 0.9104142 , 1.        ,
        0.2472643 ],
       [0.78707369, 0.22500447, 0.96760873, ..., 0.62596437, 0.2472643 ,
        1.        ]])

In [12]:
# Part-4 - Get the list of user_id and problem_id and corresponding indices
users = users.reset_index()
user_ids = users['user_id']
user_indices = pd.Series(users.index, index=users['user_id'])

problems = problems.reset_index()
problem_ids = problems['problem_id']
problem_indices = pd.Series(problems.index, index=problems['problem_id'])

In [None]:
reader = Reader(rating_scale=(0, 9))
data = Dataset.load_from_df(train_submissions[['user_id', 'problem_id', 'attempts_range']], reader)

trainset, testset = train_test_split(data, test_size=0.25)

In [51]:
'''
    Part 5A - train the model using SVD algoirthm 
'''

svdalgo = SVD(n_factors=160, n_epochs=100, lr_all=0.005, reg_all=0.1, random_state=0)
svdalgo.fit(trainset)
svdpredictions = svdalgo.test(testset[:200])

print(accuracy.rmse(svdpredictions))

RMSE: 0.8900
0.8899748016154547


In [257]:
'''
    Part 5B - train the model using CoClustering algoirthm 
'''

CoClusteringalgo = CoClustering(n_epochs=100, random_state=42)
CoClusteringalgo.fit(trainset)
CoClusteringalgopredictions = CoClusteringalgo.test(testset[:200])

print(accuracy.rmse(CoClusteringalgopredictions))

RMSE: 1.0311
1.0311492681002459


In [None]:
'''
    Part 5C - train the model using NMF algoirthm 
'''

nmfalgo = NMF(n_factors=160, n_epochs=100, random_state=0)
nmfalgo.fit(trainset)
nmfpredictions = nmfalgo.test(testset[:200])

print(accuracy.rmse(nmfpredictions))

In [208]:
'''
    Part 5D - train the model using KNNBaseline Item-item similarity algoirthm 
'''

sim_options = {'name': 'pearson_baseline', 'user_based': False}
itemAlgo = KNNBaseline(sim_options=sim_options)
itemAlgo.fit(trainset)
itempredictions = itemAlgo.test(testset[:200])
print(accuracy.rmse(itempredictions))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0146
1.0146300424898553


In [52]:
'''
    Part 5E - train the model using KNNBaseline User-User similarity algoirthm 
'''

sim_options = {'name': 'pearson_baseline', 'user_based': True}
userAlgo = KNNBaseline(sim_options=sim_options)
userAlgo.fit(trainset)
itempredictions = userAlgo.test(testset[:200])
print(accuracy.rmse(itempredictions))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8877
0.8877278792589097


In [210]:
'''
    Part 5F - train the model using BaselineOnly algoirthm 
'''

bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
baseline_algo = BaselineOnly(bsl_options=bsl_options)
baseline_algo.fit(trainset)
itempredictions = baseline_algo.test(testset[:200])
print(accuracy.rmse(itempredictions))
# cross_validate(baseline_algo, data, measures=['RMSE'], cv=3, verbose=False)

Estimating biases using als...
RMSE: 0.9501
0.9500616452613694


In [None]:
'''
    Part 6A - Model Prediction using cosine similarity

Following are steps to predict 'attempted_ranges' for given user id and problem id -

A. Get the similar User IDs and Problems IDs for given user id and problem id respectively.

B. If final_df contains given user id and problem id then return its correponding 'attempted_ranges' column value

C. If not, then get the 'attempted_ranges' column value for each combination of user id and similar problem ids from final_df
    and compute mean of all 'attempted_ranges'  values
    
D. If final_df doesn't contain user id and simialar problem ids combination, then get the 'attempted_ranges' column value for each 
    combination of problem id and similar user ids from final_df and compute mean of all 'attempted_ranges'  values

E. if final_df doesn't contain problem id and simialar user id combination then get the 'attempted_ranges' column value for each 
    combination of similar problem ids and similar user ids from final_df and compute mean of all 'attempted_ranges'  values

C. If not, then return default 'attempted_range' value 
    
'''

def get_recommendations(user_id, problem_id):
    user_idx = user_indices[user_id]
    problem_idx = problem_indices[problem_id]
    
    user_sim_scores = list(enumerate(user_cosine_similarity[user_idx]))
    user_sim_scores = sorted(user_sim_scores, key=lambda x: x[1], reverse=True)
    user_sim_scores = user_sim_scores[1:30]
    
    problem_sim_scores = list(enumerate(problem_cosine_similarity[problem_idx]))
    problem_sim_scores = sorted(problem_sim_scores, key=lambda x: x[1], reverse=True)
    problem_sim_scores = problem_sim_scores[1:30]
    
    user_top_indices = [i[0] for i in user_sim_scores]
    
    similar_user_ids = user_ids.iloc[user_top_indices]
    
    total_attempted_range = []
    for userid in similar_user_ids:
        attempted_ranges = final_df[(final_df['user_id'] == userid) & (final_df['problem_id'] == problem_id)]['attempts_range'].values
        if len(attempted_ranges) > 0:
            total_attempted_range.append(attempted_ranges[0])
    
    compute_avg_attempted_range = 0.0
    if len(total_attempted_range) > 0:
        compute_avg_attempted_range = mean(total_attempted_range)
        # print('for', user_id, ' and ', problem_id, ' attemted_range calcualted via user similarity with value: ', compute_avg_attempted_range)
    else:
        problem_top_indices = [i[0] for i in problem_sim_scores]
        similar_problem_ids = problem_ids.iloc[problem_top_indices]
        
        for problemid in similar_problem_ids:
            attempted_ranges = final_df[(final_df['user_id'] == user_id) & (final_df['problem_id'] == problemid)]['attempts_range'].values
            if len(attempted_ranges) > 0:
                total_attempted_range.append(attempted_ranges[0])
        
        if len(total_attempted_range) > 0:
            compute_avg_attempted_range = mean(total_attempted_range)
            # print('for', user_id, ' and ', problem_id, ' attemted_range calcualted via problem similarity with value: ', compute_avg_attempted_range)
        else:
            for userid in similar_user_ids:
                for problemid in similar_problem_ids:
                    attempted_ranges = final_df[(final_df['user_id'] == userid) & (final_df['problem_id'] == problemid)]['attempts_range'].values
                    if len(attempted_ranges) > 0:
                        total_attempted_range.append(attempted_ranges[0])
            
            if len(total_attempted_range) > 0:
                compute_avg_attempted_range = mean(total_attempted_range)
                # print('for', user_id, ' and ', problem_id, ' attemted_range calcualted via user-problem similarity with value: ', compute_avg_attempted_range)
            else:
                print('there is no matching user and problem')
    
    # compute_avg_attempted_range = mean(compute_avg_attempted_range)
    if (float(compute_avg_attempted_range) % 1) >= 0.5:
        compute_avg_attempted_range = math.ceil(compute_avg_attempted_range)
    else:
        compute_avg_attempted_range = math.floor(compute_avg_attempted_range)
    return compute_avg_attempted_range


# Test for single record where user_id is 2570 and problem_id is 6267
print('Predicted Attempted_range value: ', get_recommendations(2570, 6267))

from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(train_submissions,
                                   test_size=0.20,
                                   random_state=42)
predicted_rating = []

#lets predict for first 10 test records
test1_df = test_df[:10]
for index, row in test1_df.iterrows():
    atempt_range = get_recommendations(row['user_id'], row['problem_id'])
    print('for', row['user_id'], ' and ', row['problem_id'], 'and ', row['attempts_range'], ' attemted_range calcualted with value: ', row['attempts_range'] , ' : ' , atempt_range)
    predicted_rating.append(atempt_range)

test1_df['predicted_rating'] = predicted_rating
test1_df.head()

Predicted Attempted_range value:  1
for 2148  and  6257 and  1  attemted_range calcualted with value:  1  :  1
for 1937  and  2451 and  2  attemted_range calcualted with value:  2  :  1
for 2451  and  751 and  2  attemted_range calcualted with value:  2  :  3
for 3356  and  1292 and  1  attemted_range calcualted with value:  1  :  2
for 3441  and  4675 and  1  attemted_range calcualted with value:  1  :  1
for 1909  and  6299 and  4  attemted_range calcualted with value:  4  :  5
for 611  and  1954 and  1  attemted_range calcualted with value:  1  :  1
for 2253  and  4015 and  2  attemted_range calcualted with value:  2  :  2
for 105  and  2755 and  4  attemted_range calcualted with value:  4  :  1
for 2937  and  1581 and  2  attemted_range calcualted with value:  2  :  1
for 1467  and  2610 and  2  attemted_range calcualted with value:  2  :  1
for 1829  and  1614 and  4  attemted_range calcualted with value:  4  :  1


In [247]:
'''
    Part-6B - use cosine similarity and baseline Algorithm to predict the 'attempted_range' value for given user id and problem id
'''

def get_recommendations_baseline(user_id, problem_id):
    user_idx = user_indices[user_id]
    problem_idx = problem_indices[problem_id]
    
    user_sim_scores = list(enumerate(user_cosine_similarity[user_idx]))
    user_sim_scores = sorted(user_sim_scores, key=lambda x: x[1], reverse=True)
    user_sim_scores = user_sim_scores[1:30]
    
    problem_sim_scores = list(enumerate(problem_cosine_similarity[problem_idx]))
    problem_sim_scores = sorted(problem_sim_scores, key=lambda x: x[1], reverse=True)
    problem_sim_scores = problem_sim_scores[1:30]
    
    user_top_indices = [i[0] for i in user_sim_scores]
    similar_user_ids = user_ids.iloc[user_top_indices]
    
    problem_top_indices = [i[0] for i in problem_sim_scores]
    similar_problem_ids = problem_ids.iloc[problem_top_indices]
    
    total_attempted_range = []
    '''
    for problemid in similar_problem_ids:
        total_attempted_range.append(baseline_algo.predict(user_id, problemid).est)
    '''
    
    for userid in similar_user_ids:
        total_attempted_range.append(baseline_algo.predict(userid, problem_id).est)
    
    compute_avg_attempted_range = mean(total_attempted_range)
    if (float(compute_avg_attempted_range) % 1) >= 0.5:
        compute_avg_attempted_range = math.ceil(compute_avg_attempted_range)
    else:
        compute_avg_attempted_range = math.floor(compute_avg_attempted_range)
    
    return compute_avg_attempted_range

In [258]:
'''
    Part-6C - use cosine similarity and coclustering Algorithm to predict the 'attempted_range' value for given user id and problem id
'''

def get_recommendations_coclustering(user_id, problem_id):
    user_idx = user_indices[user_id]
    problem_idx = problem_indices[problem_id]
    
    user_sim_scores = list(enumerate(user_cosine_similarity[user_idx]))
    user_sim_scores = sorted(user_sim_scores, key=lambda x: x[1], reverse=True)
    user_sim_scores = user_sim_scores[1:30]
    
    problem_sim_scores = list(enumerate(problem_cosine_similarity[problem_idx]))
    problem_sim_scores = sorted(problem_sim_scores, key=lambda x: x[1], reverse=True)
    problem_sim_scores = problem_sim_scores[1:30]
    
    user_top_indices = [i[0] for i in user_sim_scores]
    similar_user_ids = user_ids.iloc[user_top_indices]
    
    problem_top_indices = [i[0] for i in problem_sim_scores]
    similar_problem_ids = problem_ids.iloc[problem_top_indices]
    
    total_attempted_range = []
    '''
    for problemid in similar_problem_ids:
        total_attempted_range.append(baseline_algo.predict(user_id, problemid).est)
    '''
    
    for userid in similar_user_ids:
        total_attempted_range.append(CoClusteringalgo.predict(userid, problem_id).est)
    
    compute_avg_attempted_range = mean(total_attempted_range)
    if (float(compute_avg_attempted_range) % 1) >= 0.5:
        compute_avg_attempted_range = math.ceil(compute_avg_attempted_range)
    else:
        compute_avg_attempted_range = math.floor(compute_avg_attempted_range)
    
    return compute_avg_attempted_range

In [239]:
'''
    Part-6D - use cosine similarity and knnbaseline_item Algorithm to predict the 'attempted_range' value for given user id and problem id
'''

def get_recommendations_knnbaseline_item(user_id, problem_id):
    user_idx = user_indices[user_id]
    problem_idx = problem_indices[problem_id]
    
    user_sim_scores = list(enumerate(user_cosine_similarity[user_idx]))
    user_sim_scores = sorted(user_sim_scores, key=lambda x: x[1], reverse=True)
    user_sim_scores = user_sim_scores[1:30]
    
    problem_sim_scores = list(enumerate(problem_cosine_similarity[problem_idx]))
    problem_sim_scores = sorted(problem_sim_scores, key=lambda x: x[1], reverse=True)
    problem_sim_scores = problem_sim_scores[1:30]
    
    user_top_indices = [i[0] for i in user_sim_scores]
    similar_user_ids = user_ids.iloc[user_top_indices]
    
    problem_top_indices = [i[0] for i in problem_sim_scores]
    similar_problem_ids = problem_ids.iloc[problem_top_indices]
    
    total_attempted_range = []
    
    '''
    for problemid in similar_problem_ids:
        total_attempted_range.append(itemAlgo.predict(user_id, problemid).est)
    '''
    
    for userid in similar_user_ids:
        total_attempted_range.append(itemAlgo.predict(userid, problem_id).est)
        
    compute_avg_attempted_range = mean(total_attempted_range)
    if (float(compute_avg_attempted_range) % 1) >= 0.5:
        compute_avg_attempted_range = math.ceil(compute_avg_attempted_range)
    else:
        compute_avg_attempted_range = math.floor(compute_avg_attempted_range)
    
    return compute_avg_attempted_range

In [53]:
'''
    Part-6E - use cosine similarity and knnbaseline_user Algorithm to predict the 'attempted_range' value for given user id and problem id
'''

def get_recommendations_knnbaseline_user(user_id, problem_id):
    user_idx = user_indices[user_id]
    problem_idx = problem_indices[problem_id]
    
    user_sim_scores = list(enumerate(user_cosine_similarity[user_idx]))
    user_sim_scores = sorted(user_sim_scores, key=lambda x: x[1], reverse=True)
    user_sim_scores = user_sim_scores[1:30]
    
    problem_sim_scores = list(enumerate(problem_cosine_similarity[problem_idx]))
    problem_sim_scores = sorted(problem_sim_scores, key=lambda x: x[1], reverse=True)
    problem_sim_scores = problem_sim_scores[1:30]
    
    user_top_indices = [i[0] for i in user_sim_scores]
    similar_user_ids = user_ids.iloc[user_top_indices]
    
    problem_top_indices = [i[0] for i in problem_sim_scores]
    similar_problem_ids = problem_ids.iloc[problem_top_indices]
    
    total_attempted_range = []
    '''
    for problemid in similar_problem_ids:
        total_attempted_range.append(userAlgo.predict(user_id, problemid).est)
    '''
    
    for userid in similar_user_ids:
        total_attempted_range.append(userAlgo.predict(userid, problem_id).est)
        
    compute_avg_attempted_range = mean(total_attempted_range)
    if (float(compute_avg_attempted_range) % 1) >= 0.75:
        compute_avg_attempted_range = math.ceil(compute_avg_attempted_range)
    else:
        compute_avg_attempted_range = math.floor(compute_avg_attempted_range)
    
    return compute_avg_attempted_range

In [235]:
'''
    Part-6F - use cosine similarity and SVD Algorithm to predict the 'attempted_range' value for given user id and problem id
'''

def get_recommendations_svd(user_id, problem_id):
    user_idx = user_indices[user_id]
    problem_idx = problem_indices[problem_id]
    
    user_sim_scores = list(enumerate(user_cosine_similarity[user_idx]))
    user_sim_scores = sorted(user_sim_scores, key=lambda x: x[1], reverse=True)
    user_sim_scores = user_sim_scores[1:30]
    
    problem_sim_scores = list(enumerate(problem_cosine_similarity[problem_idx]))
    problem_sim_scores = sorted(problem_sim_scores, key=lambda x: x[1], reverse=True)
    problem_sim_scores = problem_sim_scores[1:30]
    
    user_top_indices = [i[0] for i in user_sim_scores]
    similar_user_ids = user_ids.iloc[user_top_indices]
    
    problem_top_indices = [i[0] for i in problem_sim_scores]
    similar_problem_ids = problem_ids.iloc[problem_top_indices]
    
    total_attempted_range = []
    '''
    for problemid in similar_problem_ids:
        total_attempted_range.append(svdalgo.predict(user_id, problemid).est)
    '''
    
    for userid in similar_user_ids:
        total_attempted_range.append(svdalgo.predict(userid, problem_id).est)
        
    compute_avg_attempted_range = mean(total_attempted_range)
    if (float(compute_avg_attempted_range) % 1) >= 0.5:
        compute_avg_attempted_range = math.ceil(compute_avg_attempted_range)
    else:
        compute_avg_attempted_range = math.floor(compute_avg_attempted_range)
    
    return compute_avg_attempted_range

In [280]:
'''
    Part-6G - use cosine similarity and NMF Algorithm to predict the 'attempted_range' value for given user id and problem id
'''

def get_recommendations_nmf(user_id, problem_id):
    user_idx = user_indices[user_id]
    problem_idx = problem_indices[problem_id]
    
    user_sim_scores = list(enumerate(user_cosine_similarity[user_idx]))
    user_sim_scores = sorted(user_sim_scores, key=lambda x: x[1], reverse=True)
    user_sim_scores = user_sim_scores[1:30]
    
    problem_sim_scores = list(enumerate(problem_cosine_similarity[problem_idx]))
    problem_sim_scores = sorted(problem_sim_scores, key=lambda x: x[1], reverse=True)
    problem_sim_scores = problem_sim_scores[1:30]
    
    user_top_indices = [i[0] for i in user_sim_scores]
    similar_user_ids = user_ids.iloc[user_top_indices]
    
    problem_top_indices = [i[0] for i in problem_sim_scores]
    similar_problem_ids = problem_ids.iloc[problem_top_indices]
    
    total_attempted_range = []
    '''
    for problemid in similar_problem_ids:
        total_attempted_range.append(nmfalgo.predict(user_id, problemid).est)
    '''
    
    for userid in similar_user_ids:
        total_attempted_range.append(nmfalgo.predict(userid, problem_id).est)
        
    compute_avg_attempted_range = mean(total_attempted_range)
    if (float(compute_avg_attempted_range) % 1) >= 0.66:
        compute_avg_attempted_range = math.ceil(compute_avg_attempted_range)
    else:
        compute_avg_attempted_range = math.floor(compute_avg_attempted_range)
    
    return compute_avg_attempted_range

In [None]:
# Part 7A-  Predict 'attemped_range' for test data via calling SVD algorithm function

train_df, test_df = train_test_split(train_submissions,
                                   test_size=0.20,
                                   random_state=42)

predicted_rating = []
test1_df = test_df
for index, row in test1_df.iterrows():
    atempt_range = get_recommendations_svd(row['user_id'], row['problem_id'])
    predicted_rating.append(atempt_range)

test1_df['predicted_rating'] = predicted_rating

In [None]:
# Part 7B-  Predict 'attemped_range' for test data via calling NMF algorithm function


train_df, test_df = train_test_split(train_submissions,
                                   test_size=0.20,
                                   random_state=42)

predicted_rating = []
test1_df = test_df
for index, row in test1_df.iterrows():
    atempt_range = get_recommendations_nmf(row['user_id'], row['problem_id'])
    print('for', row['user_id'], ' and ', row['problem_id'], 'and ', row['attempts_range'], ' attemted_range calcualted with value: ', row['attempts_range'] , ' : ' , atempt_range)
    predicted_rating.append(atempt_range)

test1_df['predicted_rating'] = predicted_rating

In [134]:
# Part 7C-  Predict 'attemped_range' for test data via calling knnbaseline_item algorithm function


train_df, test_df = train_test_split(train_submissions,
                                   test_size=0.20,
                                   random_state=42)

predicted_rating = []
test1_df = test_df
for index, row in test1_df.iterrows():
    atempt_range = get_recommendations_knnbaseline_item(row['user_id'], row['problem_id'])
    print('for', row['user_id'], ' and ', row['problem_id'], 'and ', row['attempts_range'], ' attemted_range calcualted with value: ', row['attempts_range'] , ' : ' , atempt_range)
    predicted_rating.append(atempt_range)

test1_df['predicted_rating'] = predicted_rating

compute_avg_attempted_range in ciel:  2
for 573  and  6203 and  1  attemted_range calcualted with value:  1  :  2
compute_avg_attempted_range in ciel:  2
for 2044  and  5173 and  2  attemted_range calcualted with value:  2  :  2
compute_avg_attempted_range in floor:  1
for 1327  and  4878 and  1  attemted_range calcualted with value:  1  :  1
compute_avg_attempted_range in ciel:  2
for 3159  and  3849 and  1  attemted_range calcualted with value:  1  :  2
compute_avg_attempted_range in ciel:  2
for 1329  and  5317 and  2  attemted_range calcualted with value:  2  :  2
compute_avg_attempted_range in floor:  2
for 216  and  4335 and  1  attemted_range calcualted with value:  1  :  2
for 2772  and  4514 and  3  attemted_range calcualted with value:  3  :  2
for 2757  and  5095 and  1  attemted_range calcualted with value:  1  :  2
for 555  and  965 and  1  attemted_range calcualted with value:  1  :  2
for 1272  and  4989 and  1  attemted_range calcualted with value:  1  :  1
for 1739  an

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [189]:
#Part 8- Use F1 Score to calculate the model accuracy

from sklearn import metrics

accuracy_score = metrics.accuracy_score(test1_df['predicted_rating'], test1_df['attempts_range'])
precision_score = metrics.precision_score(test1_df['predicted_rating'], test1_df['attempts_range'], average='weighted')
recall_score = metrics.recall_score(test1_df['predicted_rating'], test1_df['attempts_range'], average='weighted')
f1_score = metrics.f1_score(test1_df['predicted_rating'], test1_df['attempts_range'], average='weighted')

print('accuracy_score: ', accuracy_score)
print('precision_score: ', precision_score)
print('recall_score: ', recall_score)
print('f1_score:: ', f1_score)

accuracy_score:  0.34804726488296467
precision_score:  0.8289046374634128
recall_score:  0.34804726488296467
f1_score::  0.4361536838382517


In [54]:
#Part-9 -  Load the test data against which model is going to be vvalidated

test = pd.read_csv('analytics_vidhya_recommendataion_engine_data/test_submissions_NeDLEvX.csv', sep=',', error_bad_lines=False, encoding="latin-1")
sample_submission = pd.read_csv('analytics_vidhya_recommendataion_engine_data/sample_submissions_wbscxqU.csv', sep=',', error_bad_lines=False, encoding="latin-1")

test['problem_id'] = le_problem_id.transform(test['problem_id'])
test['user_id'] = le_user_id.transform(test['user_id'])

sample_submission['ID'] = test['ID']

In [55]:
#Part-10 -  Use knnbaseline_user algorithm for prediction and export the result into CSV file
predicted_attempt_range = []
for index, row in test.iterrows():
    attempt_range = get_recommendations_knnbaseline_user(row['user_id'], row['problem_id'])
    predicted_attempt_range.append(attempt_range)

sample_submission['attempts_range'] = predicted_attempt_range
export_csv = sample_submission.to_csv (r'test_predictions_knnbaseline_user_75_1.csv', sep=',', index = None, header=True)