# AIDM7380 Group Project - Recommender System for eLearning

# Data Exploration and Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go

## Load dataset

In [None]:
elearning = pd.read_csv("/elearning_dataset1.csv")
elearning

Unnamed: 0,Date,UserID,CourseID,Event
0,2022-01-10 10:12:35,440,247,view_course
1,2022-01-10 10:22:28,1186,338,view_course
2,2022-01-10 10:23:13,1186,338,detailed_description
3,2022-01-10 10:42:18,703,238,view_course
4,2022-01-10 10:51:02,225,350,view_course
...,...,...,...,...
17360,2022-03-09 15:09:38,1081,245,view_course
17361,2022-03-09 15:11:02,374,146,detailed_description
17362,2022-03-09 15:12:03,1081,245,teacher_profile
17363,2022-03-09 15:20:15,1142,231,view_course


## Check the dataset

In [None]:
users = elearning.UserID.unique()
content = elearning.CourseID.unique()
events = elearning.Event.unique()
print(f"There are {len(users)} unique users, {len(content)} unique content and {len(events)} events in the dataset.")

There are 1591 unique users, 396 unique content and 5 events in the dataset.


In [None]:
elearning.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17365 entries, 0 to 17364
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      17365 non-null  object
 1   UserID    17365 non-null  int64 
 2   CourseID  17365 non-null  int64 
 3   Event     17365 non-null  object
dtypes: int64(2), object(2)
memory usage: 542.8+ KB


In [None]:
elearning.Event.unique()

array(['view_course', 'detailed_description', 'teacher_profile',
       'rundown', 'institution'], dtype=object)

In [None]:
elearning.Event.describe()

count           17365
unique              5
top       view_course
freq             8874
Name: Event, dtype: object

There are five different user behaviors in this dataset, "view_course", "detailed_description", "teacher_profile", "teacher_profile", "rundown", and "institution", with the most "view_course" 8874 times.

In [None]:
# Calculate the count of each behavior using value_counts
behavior_counts = elearning['Event'].value_counts()

# Create pie chart data
data = go.Pie(labels=behavior_counts.index, values=behavior_counts.values)

# Create layout
layout = go.Layout(title='Event Counts by Behavior')

# Create the chart
fig = go.Figure(data=data, layout=layout)

# Show the chart
fig.show()

"view_course" accounted for 51.1% of all user behaviors, followed by "rundown" at 15.4%.

In [None]:
# Calculate the count of events for each CourseID using value_counts
course_event_counts = elearning['CourseID'].value_counts()

# Find the CourseID with the most events and its corresponding event count
most_event_course_id = course_event_counts.idxmax()
most_event_count = course_event_counts.max()

# Display the results
print(f"CourseID with the most events: {most_event_course_id}")
print(f"Event count: {most_event_count}")

CourseID with the most events: 202
Event count: 101


In [None]:
# Calculate the count of each CourseID using value_counts
course_counts = elearning['CourseID'].value_counts()

# Create table data
data = go.Table(
    header=dict(values=['CourseID', 'Event Counts']),
    cells=dict(values=[course_counts.index, course_counts.values])
)

# Create layout
layout = go.Layout(title='CourseID Counts')

# Create the chart
fig = go.Figure(data=data, layout=layout)

# Show the chart
fig.show()

The top 10 courses have user behaviors that all exceed 80 times, with the course id with the highest number of behaviors being 202.

In [None]:
# Filter the dataset for events with 'view_course' and group by CourseID to count the views
course_views = elearning[elearning['Event'] == 'view_course'].groupby('CourseID').size()

# Sort the views in descending order and select the top 10 courses
top_10_courses = course_views.sort_values(ascending=False).head(10)

# Create a DataFrame with the Course ID and the number of views
df = pd.DataFrame({'Course ID': top_10_courses.index, 'Number of Views': top_10_courses.values})

# Create an interactive table
fig = go.Figure(data=[go.Table(header=dict(values=list(df.columns)),
                               cells=dict(values=[df['Course ID'], df['Number of Views']]))])

# Set the table title
fig.update_layout(title="Top 10 Courses with Highest Views")

# Show the table
fig.show()

In [None]:
# Group the dataset by UserID and count the number of unique courses viewed by each user
user_course_count = elearning.groupby('UserID')['CourseID'].nunique()

# Sort the users by the number of unique courses viewed in descending order and select the top 10 users
top_10_users = user_course_count.sort_values(ascending=False).head(10)

# Create a DataFrame with the User ID and the number of unique courses viewed
df = pd.DataFrame({'User ID': top_10_users.index, 'Number of Unique Courses': top_10_users.values})

# Create an interactive table
fig = go.Figure(data=[go.Table(header=dict(values=list(df.columns)),
                               cells=dict(values=[df['User ID'], df['Number of Unique Courses']]))])

# Set the table title
fig.update_layout(title="Top 10 Users with Most Unique Course Views")

# Show the table
fig.show()

Users with user IDs 429, 1040, and 347 all viewed 14 different courses.

In [None]:
# Calculate the count of each UserID using value_counts
userid_counts = elearning['UserID'].value_counts()

# Create table data
data = go.Table(
    header=dict(values=['UserID', 'Count']),
    cells=dict(values=[userid_counts.index, userid_counts.values])
)

# Create layout
layout = go.Layout(title='UserID Counts')

# Create the chart
fig = go.Figure(data=data, layout=layout)

# Show the chart
fig.show()

User 1040 has 35 times user behaviors in this dataset, the most of any user.

In [None]:
# Calculate the count of each UserID using value_counts
userid_counts = elearning['UserID'].value_counts()

# Create histogram data
data = go.Histogram(x=userid_counts.values)

# Create layout
layout = go.Layout(title='User Behavior Counts Distribution', xaxis=dict(title='Counts'), yaxis=dict(title='Frequency'))

# Create the chart
fig = go.Figure(data=data, layout=layout)

# Show the chart
fig.show()

As can be seen from the distribution of counts in the above graph, most users have a number of behaviors between 5 and 20.

# User-based Collaborative Filtering

## Implicit Ratings

In [None]:
# Create a user-item binary matrix
uiMatrix = pd.DataFrame(columns=content, index=users)
uiMatrix.head()

Unnamed: 0,247,338,238,350,264,139,9,20,8,288,...,341,329,73,115,100,89,50,306,199,318
440,,,,,,,,,,,...,,,,,,,,,,
1186,,,,,,,,,,,...,,,,,,,,,,
703,,,,,,,,,,,...,,,,,,,,,,
225,,,,,,,,,,,...,,,,,,,,,,
1334,,,,,,,,,,,...,,,,,,,,,,


## Type of events and weights

In [None]:
eventTypes = elearning.Event.unique()
print(eventTypes)

['view_course' 'detailed_description' 'teacher_profile' 'rundown'
 'institution']


In [None]:
eventWeights = {
    'view_course': 5,
    'detailed_description': 15,
    'teacher_profile': 50,
    'institution': 50,
    'rundown': 80}

## Add Decay
Using the formula introduced during lecture

$${IRDecay}_{(i,u)} = \sum_{i=1}^n w_i*{\#event}_i*d\left({\#event}_i\right) = \left(w_1*{\#event}_1*d\left({\#event}_1\right)\right)+\left(w_2*{\#event}_2*d\left({\#event}_2\right)\right)+\dots+\left(w_n*{\#event}_n*d\left({\#event}_n\right)\right)$$

In [None]:
#Computing decay
import datetime
from datetime import date, timedelta, datetime

def compute_decay(eventDate, decayDays):
    age = (date.today() - datetime.strptime(eventDate, '%Y-%m-%d %H:%M:%S').date()) // timedelta(days=decayDays)
    #Converts the given event date string into a datetime object.Extract the date part of a datetime object.
    #print("Age of event:", age)
    decay = 1/age #simple decay
    #print("Decay factor:", decay)

    return decay

createdEvent = elearning.at[0,'Date']
thresholdDays = 2 # Number of days
decayFactor = compute_decay(createdEvent, thresholdDays)

print(decayFactor)

0.0024271844660194173


In [None]:
#Compute the Implicit Rating for each user-item combination. Populate the user-item matrix
#Add the decay factor
for index, row in elearning.iterrows():
    currentUser = row['UserID']
    currentContent = row['CourseID']

    w = eventWeights[row['Event']] * compute_decay(row['Date'],thresholdDays)

    currentValue = uiMatrix.at[currentUser, currentContent]
    if np.isnan(currentValue):
        currentValue = 0

    updatedValue = currentValue + w #+ (1 * w)
    uiMatrix.at[currentUser, currentContent] = updatedValue

uiMatrix.head()

Unnamed: 0,247,338,238,350,264,139,9,20,8,288,...,341,329,73,115,100,89,50,306,199,318
440,0.012136,,,,,,,,,,...,,,,,,,,,,
1186,,0.048544,,,,,,,,,...,,,,,,,,,,
703,,,0.012136,,,,,,,,...,,,,,0.137157,,,,,
225,,,,0.012136,,,,,,,...,,,,,,,,,,
1334,,,,,0.012136,,,,,,...,,,,0.140665,,,,,,


## Normalization

In [None]:
#Update the user-item matrix by normalizing the values between 0 and 10.
uiMatrixNorm = uiMatrix.apply(
    lambda x: ((x - np.nanmin(uiMatrix.values))/(np.nanmax(uiMatrix.values) - np.nanmin(uiMatrix.values)))*10
    )

uiMatrixNorm.head()

Unnamed: 0,247,338,238,350,264,139,9,20,8,288,...,341,329,73,115,100,89,50,306,199,318
440,0.0,,,,,,,,,,...,,,,,,,,,,
1186,,0.556239,,,,,,,,,...,,,,,,,,,,
703,,,0.0,,,,,,,,...,,,,,1.910077,,,,,
225,,,,0.0,,,,,,,...,,,,,,,,,,
1334,,,,,0.0,,,,,,...,,,,1.96367,,,,,,


## User-based Function
Step1: Compute Similarity between the active user and the rest of the users

Step2: Select the items whose values are to be predicted for current user

Step3: Compute the predicted ratings for current user.
Using mean value of the other high-similarity users

In [None]:
from scipy.spatial.distance import jaccard, cosine
from scipy.stats import spearmanr

def userCF_prediction(df, currentUser, numUsers, numItems, similarity='pearson',displayed = True):#display means printing outcomes or not
    df = df.astype(float)
    df_copy = df.copy()
    cuDf = df_copy.loc[currentUser]

    if similarity == 'pearson':
        similarityDf = df_copy.corrwith(cuDf, axis=1, method='pearson')
    elif similarity == 'jaccard':
        similarityDf = df_copy.apply(lambda row: 1 - jaccard(cuDf, row), axis=1)
    elif similarity == 'cosine':
        similarityDf = df_copy.apply(lambda row: 1 - cosine(cuDf, row), axis=1)
    elif similarity == 'spearman':
        similarityDf = df_copy.apply(lambda row: spearmanr(cuDf, row).correlation, axis=1)
    else:
        raise ValueError("Invalid similarity measure. Available options are 'pearson', 'jaccard', 'cosine', and 'spearman'.")

    similarityDf.sort_values(ascending=False, inplace=True)
    similarityDf.drop(labels=[currentUser], inplace=True)
    similarityDf = similarityDf.head(numUsers)

    toPredict = cuDf[cuDf.isna()]

    ratings = df_copy.loc[similarityDf.index]
    ratingsToPredict = ratings[toPredict.index]

    predictedRatings = ratingsToPredict.mean()
    predictedRatings.sort_values(ascending=False, inplace=True)
    if displayed:
        display(predictedRatings.head(numItems))
    return predictedRatings.head(numItems)

In [None]:
#Find one without all the ratings already filled to recommend something for him
uiMatrixNorm.isna().sum(axis=1)

440     391
1186    390
703     392
225     390
1334    391
       ... 
659     395
66      395
612     395
483     395
956     395
Length: 1591, dtype: int64

### Pearson Showcase

In [None]:
recommendList1 = userCF_prediction(uiMatrixNorm,440,200,5,'pearson')


Degrees of freedom <= 0 for slice


divide by zero encountered in divide



326    6.742123
259    5.631095
236    5.023005
196    5.023005
186    4.958062
dtype: float64

In [None]:
recommendList2 = userCF_prediction(uiMatrixNorm,823,100,5,'pearson')


Degrees of freedom <= 0 for slice


divide by zero encountered in divide



359    5.616369
79     5.445309
42     5.062764
236    5.023005
33     4.996832
dtype: float64

### Spearman Showcase

In [None]:
recommendList3 = userCF_prediction(uiMatrixNorm,440,200,5,'spearman')

326    6.742123
259    5.631095
236    5.023005
196    5.023005
186    4.958062
dtype: float64

In [None]:
recommendList4 = userCF_prediction(uiMatrixNorm,823,100,5,'spearman')

359    5.616369
79     5.445309
42     5.062764
236    5.023005
33     4.996832
dtype: float64

In [None]:
#Delete NaN values and return new list
def process_recommendations(recommendations,displayed=True):
    filtered_series = recommendations.dropna()
    if(displayed):
        print('CurrentLength:'+str(len(filtered_series)))
        display(filtered_series)
    return filtered_series

## Example

In [None]:
recommendListExample = userCF_prediction(uiMatrixNorm,2,2,50,'spearman')
filtered_series = process_recommendations(recommendListExample)

248    4.970921
323    3.540935
381    2.997509
353    0.590121
181    0.582327
114    0.578488
386    0.558043
338    0.556239
106    0.556239
384    0.000451
247    0.000000
238         NaN
350         NaN
264         NaN
139         NaN
9           NaN
20          NaN
8           NaN
288         NaN
373         NaN
298         NaN
180         NaN
224         NaN
150         NaN
30          NaN
3           NaN
284         NaN
235         NaN
261         NaN
117         NaN
177         NaN
48          NaN
90          NaN
291         NaN
119         NaN
362         NaN
252         NaN
136         NaN
255         NaN
321         NaN
258         NaN
138         NaN
374         NaN
360         NaN
98          NaN
86          NaN
2           NaN
349         NaN
201         NaN
233         NaN
dtype: float64

CurrentLength:11


248    4.970921
323    3.540935
381    2.997509
353    0.590121
181    0.582327
114    0.578488
386    0.558043
338    0.556239
106    0.556239
384    0.000451
247    0.000000
dtype: float64

# Item-based Collaborative Filtering

Use the cosine distance in scipy library to compute the similarity.

$$cosine\_similarity = 1 - cosine\_distance$$

In [None]:
from scipy.spatial.distance import cosine

def cosine_sim(df1, df2):
    # check for na in dataframes
    df1na = df1.isna()
    df1clean = df1[~df1na]
    df2clean = df2[~df1na]

    df2na = df2clean.isna()
    df1clean = df1clean[~df2na]
    df2clean = df2clean[~df2na]


    # Compute cosine similarity
    distance = cosine(df1clean, df2clean)
    sim = 1 - distance

    return sim

## Precomputing similarities

A utility function to convert from numeric to boolean, indicating if the user rated an item.

In [None]:
def to_bool(value):
    if np.isnan(value):
        return 0
    else:
        return 1

Compute the number of overlapping rating between each item

In [None]:
uiMatrixBool = uiMatrix.applymap(lambda x: to_bool(x))
overlappingUsersRatings = uiMatrixBool.T.dot(uiMatrixBool)

In [None]:
display(overlappingUsersRatings)

Unnamed: 0,247,338,238,350,264,139,9,20,8,288,...,341,329,73,115,100,89,50,306,199,318
247,45,1,0,0,2,0,2,0,0,0,...,0,0,0,1,2,2,0,1,0,1
338,1,33,0,1,2,2,3,0,1,1,...,0,0,0,0,0,0,0,0,0,0
238,0,0,21,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
350,0,1,0,28,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
264,2,2,0,0,29,0,0,0,0,1,...,0,0,0,1,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,2,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,7,0,0,0,0
50,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
306,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
199,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,2,0


Check min and max

In [None]:
overlappingUsersRatings.max(axis=0).max()

48

In [None]:
overlappingUsersRatings.min(axis=0).min()

0

In [None]:
toDrop = overlappingUsersRatings.max(axis=0) > 10
display(toDrop)

247     True
338     True
238     True
350     True
264     True
       ...  
89     False
50     False
306    False
199    False
318    False
Length: 396, dtype: bool

In [None]:
selectedItems = overlappingUsersRatings.loc[toDrop, toDrop]
display(selectedItems)

Unnamed: 0,247,338,238,350,264,139,9,20,288,373,...,300,27,167,290,76,28,23,121,128,100
247,45,1,0,0,2,0,2,0,0,0,...,1,0,1,0,0,1,0,1,1,2
338,1,33,0,1,2,2,3,0,1,0,...,2,0,0,1,0,0,0,0,0,0
238,0,0,21,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
350,0,1,0,28,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
264,2,2,0,0,29,0,0,0,1,0,...,0,0,1,0,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,14,1,0,0,0
23,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,1,19,0,0,0
121,1,0,0,1,1,1,0,1,1,0,...,0,0,0,0,0,0,0,22,0,0
128,1,0,0,0,0,1,1,1,0,1,...,0,0,1,0,0,0,0,0,12,1


Compute item-item similarity

In [None]:
selectedIndex = selectedItems.index
uiMatrixSelection = uiMatrix[selectedIndex]
uiMatrixSelection = uiMatrixSelection.sub(uiMatrixSelection.mean(axis=1), axis=0)
iiSimMatrix = pd.DataFrame().reindex_like(selectedItems)
for item1 in selectedIndex:
    item1Ratings = uiMatrixSelection[item1]
    for item2 in selectedIndex:
        item2Ratings = uiMatrixSelection[item2]
        if overlappingUsersRatings.at[item1, item2] > 1:
          iiSimMatrix.at[item1, item2] = cosine_sim(item1Ratings, item2Ratings)


In [None]:
display(iiSimMatrix)

Unnamed: 0,247,338,238,350,264,139,9,20,288,373,...,300,27,167,290,76,28,23,121,128,100
247,1.000000,,,,0.902205,,0.555390,,,,...,,,,,,,,,,-0.874061
338,,1.000000,,,0.346231,-0.859918,-0.588277,,,,...,-0.105249,,,,,,,,,
238,,,1.0,,,,,,,,...,,,,,,,,,,
350,,,,1.0,,,,,,,...,,,,,,,,,,
264,0.902205,0.346231,,,1.000000,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,,,,,,,,,,,...,,,,,,1.0,,,,
23,,,,,,,,,,,...,,,,,,,1.0,,,
121,,,,,,,,,,,...,,,,,,,,1.0,,
128,,,,,,,,,,,...,,,,,,,,,1.0,


## Item score predict function
based on precomputed similarity matrix

In [None]:
def item_predict(df, currentUser, currentItem, simMatrix, k):
    # df: the UI matrix dataframe ; currentUser currentItem: the current user and item for whom the prediction is to be made
    # simMatrix: the precomputed simMatrix ; k: for top-k

    uiMatrixNorm = df.astype(float)
    uiMatrixSelection = uiMatrixNorm.dropna(subset = [currentItem])
    cuAvgRating = uiMatrixNorm.loc[[currentUser]].dropna(axis=1).mean(axis=1)

    # Subtract the average rating of each user from that user's rating to eliminate the effect of different rating scales for different users
    uiMatrixSelection = uiMatrixSelection.sub(uiMatrixSelection.mean(axis=1), axis=0)
    uiMatrixSelection.drop(columns=[currentItem], inplace=True)
    iiSimilarity = simMatrix.loc[currentItem].dropna().sort_values(ascending=False).drop(index=currentItem)

    # Select the top k items with the highest similarity
    itemsToCompare = uiMatrixSelection[iiSimilarity.head(k).index]
    predictedRatings = itemsToCompare.mean(axis=0)

    predictedRatings.sort_values(ascending=False, inplace=True)

    # Add the average user rating back to the predicted rating
    predictedRatings += cuAvgRating.iloc[0]


    # Record the lowest similarity amongst the items used
    if k < iiSimilarity.shape[0]:
        minSim = iiSimilarity.iloc[k-1]
    else:
        minSim  = -1

    # The average of all predicted ratings is used as the final predicted rating of the current user for the current item
    return predictedRatings.mean(), minSim


## Item-based Recommand Function


In [None]:
def itemCF_recommend(df, currentUser, simMatrix, numItems, k, minSimthreshold):
    # df: the UI matrix dataframe ; currentUser: the current user for whom the recommand is to be made
    # simMatrix: the precomputed simMatrix ; numItems: the number of items to be recommanded ; k: for top-k

    # Get items not rated by the current user
    cuRatedItems = df.loc[currentUser].dropna().sort_values(ascending=False)
    unRated = df.loc[currentUser]
    unRated.drop(labels=cuRatedItems.index, inplace=True)

    toRec = []
    # Iterate through the unrated items and call the `item_predict` function for each item to predict the rating
    for item in unRated.index:
        if item in simMatrix.index:
            preRate, minSim = item_predict(df, currentUser, item, simMatrix,k)
            if not np.isnan(preRate) and minSim >= minSimthreshold:
                toRec.append((preRate, item, minSim))
    sortedRec = sorted(toRec, key=lambda x: x[0], reverse=True)
    return sortedRec[0:numItems]


## Tests and parameter change tests

Recommand for user 429, with numItems is 10 and k is 5 , minSimthreshold is 0.6:

In [None]:
recSer = itemCF_recommend(uiMatrixNorm,429,iiSimMatrix,10,5,0.6)
print('Item\tPredictrating\tMinSimilarity')
for item in recSer:
    print(str(item[1])+'\t'+str(item[0])+'\t'+str(item[2]))

Item	Predictrating	MinSimilarity
162	2.654818613767521	0.9609996962498509
317	2.5993519877096416	0.6528359061495199
217	2.5933097657623305	0.8188089177299935
124	2.564047002728453	0.931154850088241
379	2.5486536522059207	0.8943577312901986
337	2.541343820146858	0.6532280304633893
139	2.516880991109265	0.9020334033862373
11	2.4234337132705646	0.7816435344775645
59	2.3884875422200325	0.6409202286054475
86	2.3751285151444588	0.8085097552821704


Recommand for user 429, with numItems is 10 and k is 10 , minSimthreshold is 0.6:

In [None]:
recSer = itemCF_recommend(uiMatrixNorm,429,iiSimMatrix,10,10,0.6)
print('Item\tPredictrating\tMinSimilarity')
for item in recSer:
    print(str(item[1])+'\t'+str(item[0])+'\t'+str(item[2]))

Item	Predictrating	MinSimilarity
45	2.392593543406252	0.6035905441647087
198	2.301638232708035	0.7381851111439544
153	2.1062507619850965	0.8915042261867064
231	1.9967842609566198	0.6143774978325294
253	1.9769320062522895	0.8009120556372227
46	1.9695188318374792	0.6022221458652042
282	1.9113538273976967	0.70547788319567
193	1.8756543719913055	0.6539562088879508
10	1.87030522338706	0.6113039797752168
139	1.8459398166794796	0.6433277098351865


Recommand for user 429, with numItems is 10 and k is 5 , minSimthreshold is 0.8:

In [None]:
recSer = itemCF_recommend(uiMatrixNorm,429,iiSimMatrix,10,5,0.8)
print('Item\tPredictrating\tMinSimilarity')
for item in recSer:
    print(str(item[1])+'\t'+str(item[0])+'\t'+str(item[2]))

Item	Predictrating	MinSimilarity
162	2.654818613767521	0.9609996962498509
217	2.5933097657623305	0.8188089177299935
124	2.564047002728453	0.931154850088241
379	2.5486536522059207	0.8943577312901986
139	2.516880991109265	0.9020334033862373
86	2.3751285151444588	0.8085097552821704
120	2.331642160376364	0.8002233274839735
26	2.317104264027816	0.8908139653497597
45	2.231904874304911	0.8602512101152433
85	2.2045360707825545	0.8835634740227933


Recommand for user 347, with numItems is 10 and k is 5 , minSimthreshold is 0.6:

In [None]:
recSer = itemCF_recommend(uiMatrixNorm,347,iiSimMatrix,10,5,0.6)
print('Item\tPredictrating\tMinSimilarity')
for item in recSer:
    print(str(item[1])+'\t'+str(item[0])+'\t'+str(item[2]))

Item	Predictrating	MinSimilarity
32	2.393298975992794	0.6237609631097152
162	2.1981512941208257	0.9609996962498509
317	2.1426846680629463	0.6528359061495199
217	2.136642446115635	0.8188089177299935
124	2.1073796830817573	0.931154850088241
379	2.091986332559225	0.8943577312901986
337	2.0846765005001617	0.6532280304633893
139	2.060213671462569	0.9020334033862373
346	1.9838007385365237	0.7243771166850372
11	1.9667663936238686	0.7816435344775645


Recommand for user 347, with numItems is 10 and k is 10 , minSimthreshold is 0.6:

In [None]:
recSer = itemCF_recommend(uiMatrixNorm,347,iiSimMatrix,10,10,0.6)
print('Item\tPredictrating\tMinSimilarity')
for item in recSer:
    print(str(item[1])+'\t'+str(item[0])+'\t'+str(item[2]))

Item	Predictrating	MinSimilarity
198	1.844970913061339	0.7381851111439544
153	1.649583442338401	0.8915042261867064
231	1.5401169413099243	0.6143774978325294
253	1.5202646866055933	0.8009120556372227
46	1.5128515121907837	0.6022221458652042
282	1.4546865077510012	0.70547788319567
193	1.4189870523446095	0.6539562088879508
10	1.4136379037403641	0.6113039797752168
139	1.389272497032784	0.6433277098351865
368	1.36838683445198	0.6974242431416564


Recommand for user 347, with numItems is 10 and k is 5 , minSimthreshold is 0.8:

In [None]:
recSer = itemCF_recommend(uiMatrixNorm,347,iiSimMatrix,10,5,0.8)
print('Item\tPredictrating\tMinSimilarity')
for item in recSer:
    print(str(item[1])+'\t'+str(item[0])+'\t'+str(item[2]))

Item	Predictrating	MinSimilarity
162	2.1981512941208257	0.9609996962498509
217	2.136642446115635	0.8188089177299935
124	2.1073796830817573	0.931154850088241
379	2.091986332559225	0.8943577312901986
139	2.060213671462569	0.9020334033862373
86	1.9184611954977633	0.8085097552821704
26	1.8604369443811208	0.8908139653497597
85	1.747868751135859	0.8835634740227933
287	1.650552906988509	0.9593800718962435
198	1.629403071864592	0.9113300272448243


# User Coverage


In [None]:
def get_recommended_users(df,numUsers,numItems,similarity='pearson', displayed=False):

    users = df.index.unique()

    recommended_users = []

    for user_id in users:
        recommendations = userCF_prediction(df,user_id,numUsers,numItems,similarity,displayed)
        filtered_series = process_recommendations(recommendations,displayed)
        if len(filtered_series) >= numItems:
            recommended_users.append(user_id)

    user_coverage = len(recommended_users) / len(users)
    print("recommended_users:")
    display(recommended_users)
    print("user_coverage:")
    display(user_coverage)
    return recommended_users, user_coverage

In [None]:
recommended_users, user_coverage=get_recommended_users(uiMatrixNorm,3,12,'spearman',False)#df,numUser,numItem,similarity function, display outcome or not

recommended_users:


[440,
 1186,
 703,
 225,
 1334,
 777,
 651,
 1467,
 1077,
 1215,
 1225,
 1401,
 758,
 988,
 1493,
 1149,
 195,
 1598,
 833,
 1555,
 1528,
 1155,
 200,
 352,
 245,
 1239,
 767,
 253,
 691,
 77,
 434,
 339,
 75,
 1449,
 897,
 566,
 222,
 974,
 311,
 1301,
 494,
 161,
 724,
 914,
 992,
 854,
 1358,
 529,
 99,
 532,
 1473,
 583,
 233,
 501,
 595,
 1368,
 706,
 565,
 1020,
 675,
 180,
 879,
 1092,
 538,
 904,
 1346,
 468,
 488,
 1549,
 587,
 1198,
 102,
 272,
 1081,
 1185,
 288,
 302,
 118,
 798,
 862,
 260,
 1369,
 867,
 620,
 460,
 528,
 328,
 888,
 1264,
 357,
 401,
 73,
 422,
 314,
 1138,
 950,
 940,
 1269,
 1546,
 577,
 382,
 836,
 145,
 209,
 1104,
 217,
 907,
 838,
 353,
 775,
 387,
 95,
 1139,
 1122,
 156,
 1267,
 820,
 1419,
 1595,
 208,
 951,
 1004,
 628,
 154,
 1307,
 234,
 406,
 263,
 649,
 105,
 685,
 948,
 844,
 544,
 348,
 86,
 0,
 482,
 523,
 1070,
 384,
 1075,
 456,
 822,
 1295,
 33,
 45,
 1509,
 1360,
 269,
 416,
 786,
 1270,
 9,
 700,
 1472,
 210,
 1262,
 811,
 878,
 1010

user_coverage:


0.9993714644877436

In [None]:
recommended_users, user_coverage=get_recommended_users(uiMatrixNorm，20,100，'spearman',False)

In [None]:
recommended_users, user_coverage=get_recommended_users(uiMatrixNorm,100,20，'spearman',False)

In [None]:
recommended_users, user_coverage=get_recommended_users(uiMatrixNorm,4,20,'spearman',False)

recommended_users:


[703]

user_coverage:


0.0006285355122564425

In [None]:
recommended_users, user_coverage=get_recommended_users(uiMatrixNorm,3,200,'spearman',False)

recommended_users:


[]

user_coverage:


0.0

In [None]:
recommended_users, user_coverage=get_recommended_users(uiMatrixNorm,400,2,'spearman',False)

recommended_users:


[440,
 1186,
 703,
 225,
 1334,
 777,
 651,
 1467,
 1077,
 1215,
 1225,
 1401,
 758,
 988,
 1493,
 1149,
 195,
 1598,
 833,
 1555,
 1528,
 1155,
 200,
 352,
 245,
 1239,
 767,
 253,
 691,
 77,
 434,
 339,
 75,
 1449,
 897,
 566,
 222,
 974,
 311,
 1301,
 494,
 161,
 724,
 914,
 992,
 854,
 1358,
 529,
 99,
 532,
 1473,
 583,
 233,
 501,
 595,
 1368,
 706,
 565,
 1020,
 675,
 180,
 879,
 1092,
 538,
 904,
 1346,
 468,
 488,
 1549,
 587,
 1198,
 102,
 272,
 1081,
 1185,
 288,
 302,
 118,
 798,
 862,
 260,
 1369,
 867,
 620,
 460,
 528,
 328,
 888,
 1264,
 357,
 401,
 73,
 422,
 314,
 1138,
 950,
 940,
 1269,
 1546,
 577,
 382,
 836,
 145,
 209,
 1104,
 217,
 907,
 838,
 353,
 775,
 387,
 95,
 1139,
 1122,
 156,
 1267,
 820,
 1419,
 1595,
 208,
 951,
 1004,
 628,
 154,
 1307,
 234,
 406,
 263,
 649,
 105,
 685,
 948,
 844,
 544,
 348,
 86,
 0,
 482,
 523,
 1070,
 384,
 1075,
 456,
 822,
 1295,
 33,
 45,
 1509,
 1360,
 269,
 416,
 786,
 1270,
 9,
 700,
 1472,
 210,
 1262,
 811,
 878,
 1010

user_coverage:


1.0


By calling the get_recommended_users function, we mainly picked two of the parameters to be changed, which are numUsers, numItems, for similarity, the difference between the referenced functions is small, so we don't test the tuning separately. According to the beginning of the data analysis, we can know that there are 1591 unique users and 396 unique content in the dataset, so we modify the number of numUsers and numItems to calculate the coverage data.


First of all, when numUsers is 3 and numItems is 12, the user coverage is 0.9993714644877436. When the data is modified so that there is a big difference between the number of numUsers and numItems, for example, when numUsers is 20 and numItems is 100, we can see that user coverage data is 0.9993714644877436, while when numUsers is 4,numItems is 20, user coverage is 0.0006285355122564425, and when numUsers is 3,numItems is 200, user coverage is 0. This may mean that only a very small number of users are referred to, but the number of courses that need to be recommended is too large, so the coverage rate is basically 0. Alternatively, after adjusting numUsers to 100 and numItems to 20, we can see that the user coverage data is 1.0, and user coverage is 1.0 when numUsers is 400,numItems is 2. We can see that the coverage is very high, that is, basically all users can be covered recommended. According to the above data, we can see that the recommendation algorithm can be adjusted according to different parameters to present different user coverage, very flexible. And when the number of reference users is large and the recommended courses are small, the user coverage rate will be very high.






## Challenges

It is also worth noting that before getting the current better results, the user coverage we calculated was always 0 or 1, and to some extent, the recommendation was invalid. Later, through exploration, we found that it is because when defining the recommendation function, we did not remove the value of NaN from the final recommendation result, so that when we ask for the output result, there will be a corresponding number of recommendations, and at this time the user coverage will always be 1, which is actually inaccurate recommendation results.

# Catalogue coverage

In [None]:
# Create an empty list to store recommended courses
recommended_items = []

# Store recommended courses
recommended_items_set = set()

# Iterate through each user ID
for user_id in users:
    # Call function
    recommendations = itemCF_recommend(uiMatrixNorm, user_id, iiSimMatrix, 5,5)

    # Add recommended courses to lists and collections
    recommended_items.extend(recommendations)
    recommended_items_set.update(recommendations)

# Remove duplicates and convert to a list
recommended_items = list(recommended_items_set)

# Calculate directory coverage
catalogue_coverage = len(recommended_items) / len(content)

In determining the threshold, we have selected more appropriate overlapping ratings, which means that overlappingUsersRatings have been determined to be in a very effective range, meaning that a lot of courses that rarely appear at the same time as other courses are filtered out, which can be seen later in the tuning, but it also means that the catalog coverage of the recommended courses is low . The average time spent on tuning by traversing user IDs is huge, so the performance of catalog coverage can be evaluated later in the choice of k-values.



# Tune
Tune the neighbourhood size (i.e. the k parameter) and compare the neighbourhood selection strategies (i.e. threshold and top-k), motivating the parameter choices to display how you can achieve better results.

#Tuning for user

According to the previous USER COVERAGE data we can see that when the first k data of the reference user and the first k data of the recommended courses are adjusted, there is the following pattern. When the reference user's data is less and the recommended courses are more, the recommendation effect is worse, when the number difference between the two parameters is larger and the reference user's data is more and the recommended courses are less, the recommendation effect will be better, the coverage rate is between 0.9 and 1. That is to say, numUsers takes a larger value, for example, 100 to 300 or so, numItems takes a smaller value, in 10 to 30 or so, is a more appropriate value.









## Challenges


It is also worth noting that in the recommendation there will be some data displayed as 0 for users, in fact, the decimal point is very close to the back is not displayed, but in order to ensure the integrity of the recommendation, we still retain this part of the data.

# Tuning for item

In [None]:
recSer = itemCF_recommend(uiMatrixNorm,988,iiSimMatrix,10,5,0.6)
print('Item\tPredictrating\tMinSimilarity')
for item in recSer:
    print(str(item[1])+'\t'+str(item[0])+'\t'+str(item[2]))

Item	Predictrating	MinSimilarity
32	2.8458748437388386	0.6237609631097152
162	2.6507271618668695	0.9609996962498509
317	2.59526053580899	0.6528359061495199
217	2.5892183138616796	0.8188089177299935
124	2.5599555508278007	0.931154850088241
379	2.5445622003052693	0.8943577312901986
337	2.5372523682462065	0.6532280304633893
139	2.5127895392086135	0.9020334033862373
346	2.4363766062825687	0.7243771166850372
11	2.4193422613699127	0.7816435344775645


In [None]:
recSer = itemCF_recommend(uiMatrixNorm,988,iiSimMatrix,60,10,0.6)
print('Item\tPredictrating\tMinSimilarity')
for item in recSer:
    print(str(item[1])+'\t'+str(item[0])+'\t'+str(item[2]))

Item	Predictrating	MinSimilarity
45	2.3885020915056	0.6035905441647087
198	2.2975467808073837	0.7381851111439544
153	2.1021593100844456	0.8915042261867064
231	1.9926928090559688	0.6143774978325294
253	1.9728405543516376	0.8009120556372227
46	1.965427379936828	0.6022221458652042
282	1.9072623754970455	0.70547788319567
193	1.871562920090654	0.6539562088879508
10	1.8662137714864084	0.6113039797752168
139	1.8418483647788286	0.6433277098351865
368	1.8209627021980244	0.6974242431416564
393	1.8195854509436356	0.8689409559937289
26	1.7902506454792868	0.7158981892514547
287	1.7460736346631767	0.7582342727810052
335	1.7338357592823446	0.6492299691001352
269	1.6241874619878902	0.7373830758452711
204	1.610809685241108	0.6275827815474267
164	1.5781673467429758	0.752962733981712
161	1.550161869416406	0.8035677575812904
160	1.5236808605366763	0.674948107915687
203	1.52342180388848	0.7933349643125656
357	1.4868211915867804	0.742655086135042
289	1.485054004305223	0.6753420984082716
385	1.44913748330485

In [None]:
recSer = itemCF_recommend(uiMatrixNorm,988,iiSimMatrix,30,8,0.6)
print('Item\tPredictrating\tMinSimilarity')
for item in recSer:
    print(str(item[1])+'\t'+str(item[0])+'\t'+str(item[2]))

Item	Predictrating	MinSimilarity
198	2.404434843170697	0.8297266411186994
379	2.2709431660847805	0.7373830758452711
45	2.232547955152865	0.6675403962514402
37	2.20678211412074	0.738575797699388
124	2.143103872456387	0.7658075968534651
85	2.072268402313326	0.7167960275795403
302	1.992623824611322	0.6098373756966002
139	1.9865478444494418	0.7692159864922617
287	1.9834903274098579	0.8425435872777952
26	1.9529493648745697	0.8404195734240605
30	1.9470922720218293	0.6791444632790699
46	1.9469970219047044	0.6675403962514402
84	1.941379588359587	0.7278617235917856
231	1.9286557117353351	0.6885826965481375
282	1.8879699813867354	0.919659526235609
153	1.8137993605461777	0.9535072996666663
313	1.8114869578406991	0.6059365287179177
269	1.7904186620496847	0.8519130057612078
253	1.78240527085045	0.8667184925775605
39	1.7795207596734621	0.7755335351651436
77	1.7742379904395227	0.6178633499096698
204	1.725096351589685	0.7167960275795403
342	1.7063377926062189	0.7605150784799117
10	1.7055487136781053	0

In [None]:
recSer = itemCF_recommend(uiMatrixNorm,988,iiSimMatrix,20,15,0.6)
print('Item\tPredictrating\tMinSimilarity')
for item in recSer:
    print(str(item[1])+'\t'+str(item[0])+'\t'+str(item[2]))

Item	Predictrating	MinSimilarity
198	2.227167166564799	0.6363934897620549
153	2.1452240968862166	0.6588661084612685


For item, according to the data of the definition function, we can see that, because in the overlap scoring link, the maximum overlap is 48, the minimum overlap is 0, we choose the overlap scoring greater than 10, which ensures that there will not be too many irrelevant courses, but also ensures that the final recommended range will not be too narrow. Compared with the value of 5 and the value of 20, 10 is a relatively appropriate choice.



The value of minimum similarity ranges from -1 to 1. We choose 0.6 as the minimum value of minimum similarity, while 0.8 can be taken as a more similar value. As for the value of k, when numItems is 10, k is 5, and the minimum similarity is 0.6, it can be seen that there will be 10 effective course recommendations, and the minimum similarity in the ratings is between 0.62-0.93; when numItems is 60, k is 10, there will be 34 course recommendations, and the minimum similarity focuses the data mainly in the 0.61-0.8 range. When numItems is 30, k is 8, there will be 30 effective course recommendations, and the minimum similarity between 0.7-0.9; when numItems is 20, k is 15, there will be 2 effective course recommendations, but at this time, the minimum similarity is low, and the output is very little, that is to say, the value of k will have a more significant impact on the recommended results.

In summary, the value of numItems does not have a great impact on the results, but should not be too high, the data in the 10-30 or so is more appropriate, the minimum similarity value of 0.6 is the lowest bottom line, the minimum similarity value of 0.8 can be more similar to the results. k value is too small will lead to the data credibility is not high, it is easy to be affected by the individual anomalies in the data. k value is too large will be mixed with a lot of data with low similarity. When we set the minimum similarity threshold will lead to a small number of recommended courses, so k value between 5-10 is an appropriate value.



## Challenges

It is worth noting that in the beginning, we did not add the parameter of minimum similarity, the recommendation performance seems to be quite good, but since the prediction score can't actually be used as the basis for the evaluation of similarity, the recommendation results are not well evaluated. So we decided to add the minimum similarity parameter, so that similarities below a certain threshold will be excluded from the recommendation results. The similarity can also be measured by setting the threshold size, at which point the recommendation is more hierarchical and personalized.