# AIDM7380 Group Project - Recommender System for eLearning

# Data Exploration and Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go

## Load dataset

In [2]:
elearning = pd.read_csv("elearning_dataset1.csv")
elearning

Unnamed: 0,Date,UserID,CourseID,Event
0,2022-01-10 10:12:35,440,247,view_course
1,2022-01-10 10:22:28,1186,338,view_course
2,2022-01-10 10:23:13,1186,338,detailed_description
3,2022-01-10 10:42:18,703,238,view_course
4,2022-01-10 10:51:02,225,350,view_course
...,...,...,...,...
17360,2022-03-09 15:09:38,1081,245,view_course
17361,2022-03-09 15:11:02,374,146,detailed_description
17362,2022-03-09 15:12:03,1081,245,teacher_profile
17363,2022-03-09 15:20:15,1142,231,view_course


## Check the dataset

In [3]:
users = elearning.UserID.unique()
content = elearning.CourseID.unique()
events = elearning.Event.unique()
print(f"There are {len(users)} unique users, {len(content)} unique content and {len(events)} events in the dataset.")

There are 1591 unique users, 396 unique content and 5 events in the dataset.


In [4]:
elearning.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17365 entries, 0 to 17364
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      17365 non-null  object
 1   UserID    17365 non-null  int64 
 2   CourseID  17365 non-null  int64 
 3   Event     17365 non-null  object
dtypes: int64(2), object(2)
memory usage: 542.8+ KB


In [5]:
elearning.Event.unique()

array(['view_course', 'detailed_description', 'teacher_profile',
       'rundown', 'institution'], dtype=object)

In [6]:
elearning.Event.describe()

count           17365
unique              5
top       view_course
freq             8874
Name: Event, dtype: object

There are five different user behaviors in this dataset, "view_course", "detailed_description", "teacher_profile", "teacher_profile", "rundown", and "institution", with the most "view_course" 8874 times.

In [7]:
# Calculate the count of each behavior using value_counts
behavior_counts = elearning['Event'].value_counts()

# Create pie chart data
data = go.Pie(labels=behavior_counts.index, values=behavior_counts.values)

# Create layout
layout = go.Layout(title='Event Counts by Behavior')

# Create the chart
fig = go.Figure(data=data, layout=layout)

# Show the chart
fig.show()

"view_course" accounted for 51.1% of all user behaviors, followed by "rundown" at 15.4%.

In [8]:
# Calculate the count of events for each CourseID using value_counts
course_event_counts = elearning['CourseID'].value_counts()

# Find the CourseID with the most events and its corresponding event count
most_event_course_id = course_event_counts.idxmax()
most_event_count = course_event_counts.max()

# Display the results
print(f"CourseID with the most events: {most_event_course_id}")
print(f"Event count: {most_event_count}")

CourseID with the most events: 202
Event count: 101


In [9]:
# Calculate the count of each CourseID using value_counts
course_counts = elearning['CourseID'].value_counts()

# Create table data
data = go.Table(
    header=dict(values=['CourseID', 'Event Counts']),
    cells=dict(values=[course_counts.index, course_counts.values])
)

# Create layout
layout = go.Layout(title='CourseID Counts')

# Create the chart
fig = go.Figure(data=data, layout=layout)

# Show the chart
fig.show()

The top 10 courses have user behaviors that all exceed 80 times, with the course id with the highest number of behaviors being 202.

In [10]:
# Filter the dataset for events with 'view_course' and group by CourseID to count the views
course_views = elearning[elearning['Event'] == 'view_course'].groupby('CourseID').size()

# Sort the views in descending order and select the top 10 courses
top_10_courses = course_views.sort_values(ascending=False).head(10)

# Create a DataFrame with the Course ID and the number of views
df = pd.DataFrame({'Course ID': top_10_courses.index, 'Number of Views': top_10_courses.values})

# Create an interactive table
fig = go.Figure(data=[go.Table(header=dict(values=list(df.columns)),
                               cells=dict(values=[df['Course ID'], df['Number of Views']]))])

# Set the table title
fig.update_layout(title="Top 10 Courses with Highest Views")

# Show the table
fig.show()

In [11]:
# Group the dataset by UserID and count the number of unique courses viewed by each user
user_course_count = elearning.groupby('UserID')['CourseID'].nunique()

# Sort the users by the number of unique courses viewed in descending order and select the top 10 users
top_10_users = user_course_count.sort_values(ascending=False).head(10)

# Create a DataFrame with the User ID and the number of unique courses viewed
df = pd.DataFrame({'User ID': top_10_users.index, 'Number of Unique Courses': top_10_users.values})

# Create an interactive table
fig = go.Figure(data=[go.Table(header=dict(values=list(df.columns)),
                               cells=dict(values=[df['User ID'], df['Number of Unique Courses']]))])

# Set the table title
fig.update_layout(title="Top 10 Users with Most Unique Course Views")

# Show the table
fig.show()

Users with user IDs 429, 1040, and 347 all viewed 14 different courses.

In [12]:
# Calculate the count of each UserID using value_counts
userid_counts = elearning['UserID'].value_counts()

# Create table data
data = go.Table(
    header=dict(values=['UserID', 'Count']),
    cells=dict(values=[userid_counts.index, userid_counts.values])
)

# Create layout
layout = go.Layout(title='UserID Counts')

# Create the chart
fig = go.Figure(data=data, layout=layout)

# Show the chart
fig.show()

User 1040 has 35 times user behaviors in this dataset, the most of any user.

In [13]:
# Calculate the count of each UserID using value_counts
userid_counts = elearning['UserID'].value_counts()

# Create histogram data
data = go.Histogram(x=userid_counts.values)

# Create layout
layout = go.Layout(title='User Behavior Counts Distribution', xaxis=dict(title='Counts'), yaxis=dict(title='Frequency'))

# Create the chart
fig = go.Figure(data=data, layout=layout)

# Show the chart
fig.show()

As can be seen from the distribution of counts in the above graph, most users have a number of behaviors between 5 and 20.

# User-based Collaborative Filtering

## Implicit Ratings

In [14]:
# Create a user-item binary matrix
uiMatrix = pd.DataFrame(columns=content, index=users)
uiMatrix.head()

Unnamed: 0,247,338,238,350,264,139,9,20,8,288,...,341,329,73,115,100,89,50,306,199,318
440,,,,,,,,,,,...,,,,,,,,,,
1186,,,,,,,,,,,...,,,,,,,,,,
703,,,,,,,,,,,...,,,,,,,,,,
225,,,,,,,,,,,...,,,,,,,,,,
1334,,,,,,,,,,,...,,,,,,,,,,


## Type of events and weights

In [15]:
eventTypes = elearning.Event.unique()
print(eventTypes)

['view_course' 'detailed_description' 'teacher_profile' 'rundown'
 'institution']


In [16]:
eventWeights = {
    'view_course': 5,
    'detailed_description': 15,
    'teacher_profile': 50,
    'institution': 50,
    'rundown': 80}

## Add Decay
Using the formula introduced during lecture

$${IRDecay}_{(i,u)} = \sum_{i=1}^n w_i*{\#event}_i*d\left({\#event}_i\right) = \left(w_1*{\#event}_1*d\left({\#event}_1\right)\right)+\left(w_2*{\#event}_2*d\left({\#event}_2\right)\right)+\dots+\left(w_n*{\#event}_n*d\left({\#event}_n\right)\right)$$

In [17]:
#Computing decay
import datetime
from datetime import date, timedelta, datetime

def compute_decay(eventDate, decayDays):
    age = (date.today() - datetime.strptime(eventDate, '%Y-%m-%d %H:%M:%S').date()) // timedelta(days=decayDays)
    #Converts the given event date string into a datetime object.Extract the date part of a datetime object.
    #print("Age of event:", age)
    decay = 1/age #simple decay
    #print("Decay factor:", decay)

    return decay

createdEvent = elearning.at[0,'Date']
thresholdDays = 2 # Number of days
decayFactor = compute_decay(createdEvent, thresholdDays)

print(decayFactor)

0.0024330900243309003


In [18]:
#Compute the Implicit Rating for each user-item combination. Populate the user-item matrix  
#Add the decay factor
for index, row in elearning.iterrows():
    currentUser = row['UserID']
    currentContent = row['CourseID']

    w = eventWeights[row['Event']] * compute_decay(row['Date'],thresholdDays)

    currentValue = uiMatrix.at[currentUser, currentContent]
    if np.isnan(currentValue):
        currentValue = 0

    updatedValue = currentValue + w #+ (1 * w)
    uiMatrix.at[currentUser, currentContent] = updatedValue

uiMatrix.head()

Unnamed: 0,247,338,238,350,264,139,9,20,8,288,...,341,329,73,115,100,89,50,306,199,318
440,0.012165,,,,,,,,,,...,,,,,,,,,,
1186,,0.048662,,,,,,,,,...,,,,,,,,,,
703,,,0.012165,,,,,,,,...,,,,,0.1375,,,,,
225,,,,0.012165,,,,,,,...,,,,,,,,,,
1334,,,,,0.012165,,,,,,...,,,,0.141026,,,,,,


## Normalization

In [19]:
#Update the user-item matrix by normalizing the values between 0 and 10.
uiMatrixNorm = uiMatrix.apply(
    lambda x: ((x - np.nanmin(uiMatrix.values))/(np.nanmax(uiMatrix.values) - np.nanmin(uiMatrix.values)))*10
    )

uiMatrixNorm.head()

Unnamed: 0,247,338,238,350,264,139,9,20,8,288,...,341,329,73,115,100,89,50,306,199,318
440,0.0,,,,,,,,,,...,,,,,,,,,,
1186,,0.556215,,,,,,,,,...,,,,,,,,,,
703,,,0.0,,,,,,,,...,,,,,1.910135,,,,,
225,,,,0.0,,,,,,,...,,,,,,,,,,
1334,,,,,0.0,,,,,,...,,,,1.963867,,,,,,


## User-based Function
Step1: Compute Similarity between the active user and the rest of the users

Step2: Select the items whose values are to be predicted for current user

Step3: Compute the predicted ratings for current user.
Using mean value of the other high-similarity users

In [20]:
from scipy.spatial.distance import jaccard, cosine
from scipy.stats import spearmanr

def userCF_prediction(df, currentUser, numUsers, numItems, similarity='pearson'):
    df = df.astype(float)
    df_copy = df.copy()
    cuDf = df_copy.loc[currentUser]

    if similarity == 'pearson':
        similarityDf = df_copy.corrwith(cuDf, axis=1, method='pearson')
    elif similarity == 'jaccard':
        similarityDf = df_copy.apply(lambda row: 1 - jaccard(cuDf, row), axis=1)
    elif similarity == 'cosine':
        similarityDf = df_copy.apply(lambda row: 1 - cosine(cuDf, row), axis=1)
    elif similarity == 'spearman':
        similarityDf = df_copy.apply(lambda row: spearmanr(cuDf, row).correlation, axis=1)
    else:
        raise ValueError("Invalid similarity measure. Available options are 'pearson', 'jaccard', 'cosine', and 'spearman'.")

    similarityDf.sort_values(ascending=False, inplace=True)
    similarityDf.drop(labels=[currentUser], inplace=True)
    similarityDf = similarityDf.head(numUsers)

    toPredict = cuDf[cuDf.isna()]

    ratings = df_copy.loc[similarityDf.index]
    ratingsToPredict = ratings[toPredict.index]

    predictedRatings = ratingsToPredict.mean()
    predictedRatings.sort_values(ascending=False, inplace=True)

    return predictedRatings.head(numItems)

In [21]:
#Find one without all the ratings already filled to recommend something for him
uiMatrixNorm.isna().sum(axis=1)

440     391
1186    390
703     392
225     390
1334    391
       ... 
659     395
66      395
612     395
483     395
956     395
Length: 1591, dtype: int64

### Pearson Showcase

In [22]:
userCF_prediction(uiMatrixNorm,440,200,5,'pearson')


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply



326    6.742000
259    5.631500
236    5.023302
196    5.023302
186    4.958193
dtype: float64

In [23]:
userCF_prediction(uiMatrixNorm,823,100,5,'pearson')


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply



359    5.616736
79     5.445244
42     5.063164
236    5.023302
33     4.997062
dtype: float64

### Spearman Showcase

In [24]:
userCF_prediction(uiMatrixNorm,440,200,5,'spearman')

326    6.742000
259    5.631500
236    5.023302
196    5.023302
186    4.958193
dtype: float64

In [25]:
userCF_prediction(uiMatrixNorm,823,100,5,'spearman')

359    5.616736
79     5.445244
42     5.063164
236    5.023302
214    4.997062
dtype: float64

# Item-based Collaborative Filtering

Use the cosine distance in scipy library to compute the similarity.

$$cosine\_similarity = 1 - cosine\_distance$$

In [26]:
from scipy.spatial.distance import cosine

def cosine_sim(df1, df2):
    # check for na in dataframes
    df1na = df1.isna()
    df1clean = df1[~df1na]
    df2clean = df2[~df1na]

    df2na = df2clean.isna()
    df1clean = df1clean[~df2na]
    df2clean = df2clean[~df2na]


    # Compute cosine similarity
    distance = cosine(df1clean, df2clean)
    sim = 1 - distance

    return sim

## Precomputing similarities

A utility function to convert from numeric to boolean, indicating if the user rated an item.

In [27]:
def to_bool(value):
    if np.isnan(value):
        return 0
    else:
        return 1

Compute the number of overlapping rating between each item

In [28]:
uiMatrixBool = uiMatrix.applymap(lambda x: to_bool(x))
overlappingUsersRatings = uiMatrixBool.T.dot(uiMatrixBool)


DataFrame.applymap has been deprecated. Use DataFrame.map instead.



In [29]:
display(overlappingUsersRatings)

Unnamed: 0,247,338,238,350,264,139,9,20,8,288,...,341,329,73,115,100,89,50,306,199,318
247,45,1,0,0,2,0,2,0,0,0,...,0,0,0,1,2,2,0,1,0,1
338,1,33,0,1,2,2,3,0,1,1,...,0,0,0,0,0,0,0,0,0,0
238,0,0,21,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
350,0,1,0,28,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
264,2,2,0,0,29,0,0,0,0,1,...,0,0,0,1,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,2,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,7,0,0,0,0
50,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
306,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
199,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,2,0


Check min and max

In [30]:
overlappingUsersRatings.max(axis=0).max()

48

In [31]:
overlappingUsersRatings.min(axis=0).min()

0

In [32]:
toDrop = overlappingUsersRatings.max(axis=0) > 10
display(toDrop)

247     True
338     True
238     True
350     True
264     True
       ...  
89     False
50     False
306    False
199    False
318    False
Length: 396, dtype: bool

In [33]:
selectedItems = overlappingUsersRatings.loc[toDrop, toDrop]
display(selectedItems)

Unnamed: 0,247,338,238,350,264,139,9,20,288,373,...,300,27,167,290,76,28,23,121,128,100
247,45,1,0,0,2,0,2,0,0,0,...,1,0,1,0,0,1,0,1,1,2
338,1,33,0,1,2,2,3,0,1,0,...,2,0,0,1,0,0,0,0,0,0
238,0,0,21,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
350,0,1,0,28,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
264,2,2,0,0,29,0,0,0,1,0,...,0,0,1,0,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,14,1,0,0,0
23,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,1,19,0,0,0
121,1,0,0,1,1,1,0,1,1,0,...,0,0,0,0,0,0,0,22,0,0
128,1,0,0,0,0,1,1,1,0,1,...,0,0,1,0,0,0,0,0,12,1


Compute item-item similarity

In [34]:
selectedIndex = selectedItems.index
uiMatrixSelection = uiMatrix[selectedIndex]
uiMatrixSelection = uiMatrixSelection.sub(uiMatrixSelection.mean(axis=1), axis=0)
iiSimMatrix = pd.DataFrame().reindex_like(selectedItems)
for item1 in selectedIndex:
    item1Ratings = uiMatrixSelection[item1]
    for item2 in selectedIndex:
        item2Ratings = uiMatrixSelection[item2]
        if overlappingUsersRatings.at[item1, item2] > 1:
          iiSimMatrix.at[item1, item2] = cosine_sim(item1Ratings, item2Ratings)


In [35]:
display(iiSimMatrix)

Unnamed: 0,247,338,238,350,264,139,9,20,288,373,...,300,27,167,290,76,28,23,121,128,100
247,1.000000,,,,0.902221,,0.555371,,,,...,,,,,,,,,,-0.874059
338,,1.000000,,,0.346208,-0.859938,-0.588258,,,,...,-0.105315,,,,,,,,,
238,,,1.0,,,,,,,,...,,,,,,,,,,
350,,,,1.0,,,,,,,...,,,,,,,,,,
264,0.902221,0.346208,,,1.000000,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,,,,,,,,,,,...,,,,,,1.0,,,,
23,,,,,,,,,,,...,,,,,,,1.0,,,
121,,,,,,,,,,,...,,,,,,,,1.0,,
128,,,,,,,,,,,...,,,,,,,,,1.0,


## Item score predict function 
based on precomputed similarity matrix

In [52]:
def item_predict(df, currentUser, currentItem, simMatrix, k):
    # df: the UI matrix dataframe ; currentUser currentItem: the current user and item for whom the prediction is to be made
    # simMatrix: the precomputed simMatrix ; k: for top-k
    
    uiMatrixNorm = df.astype(float)
    uiMatrixSelection = uiMatrixNorm.dropna(subset = [currentItem])
    cuAvgRating = uiMatrixNorm.loc[[currentUser]].dropna(axis=1).mean(axis=1)

    # Subtract the average rating of each user from that user's rating to eliminate the effect of different rating scales for different users
    uiMatrixSelection = uiMatrixSelection.sub(uiMatrixSelection.mean(axis=1), axis=0)
    uiMatrixSelection.drop(columns=[currentItem], inplace=True)
    iiSimilarity = simMatrix.loc[currentItem].dropna().sort_values(ascending=False).drop(index=currentItem)

    # Select the top k items with the highest similarity
    itemsToCompare = uiMatrixSelection[iiSimilarity.head(k).index]
    predictedRatings = itemsToCompare.mean(axis=0)
    predictedRatings.sort_values(ascending=False, inplace=True)

    # Add the average user rating back to the predicted rating
    predictedRatings += cuAvgRating.iloc[0]

    # The average of all predicted ratings is used as the final predicted rating of the current user for the current item
    return predictedRatings.mean()


## Item-based Recommand Function


In [47]:
def itemCF_recommend(df, currentUser, simMatrix, numItems, k):
    # df: the UI matrix dataframe ; currentUser: the current user for whom the recommand is to be made
    # simMatrix: the precomputed simMatrix ; numItems: the number of items to be recommanded ; k: for top-k

    # Get items not rated by the current user
    cuRatedItems = df.loc[currentUser].dropna().sort_values(ascending=False)
    unRated = df.loc[currentUser]
    unRated.drop(labels=cuRatedItems.index, inplace=True)

    toRec = []
    # Iterate through the unrated items and call the `item_predict` function for each item to predict the rating
    for item in unRated.index:
        if item in simMatrix.index:
            preRate = item_predict(df, currentUser, item, simMatrix,k)
            if not np.isnan(preRate):
                toRec.append((preRate, item))

    sortedRec = sorted(toRec, key=lambda x: x[0], reverse=True)
    return sortedRec[0:numItems]


## Tests and parameter change tests

Recommand for user 429, with numItems is 10 and k is 5 :

In [48]:
recSer = itemCF_recommend(uiMatrixNorm,429,iiSimMatrix,10,5)
for item in recSer:
    print(str(item[1])+'\t'+str(item[0]))

110	2.8883366731423927
28	2.716957202728407
2	2.6940880130655303
162	2.6549257534199766
158	2.652896392271386
147	2.6360268022238467
333	2.6142919136136564
35	2.6016886299274313
317	2.5994868883929616
220	2.5993010627957878


Recommand for user 429, with numItems is 10 and k is 10 :

In [50]:
recSer = itemCF_recommend(uiMatrixNorm,429,iiSimMatrix,10,10)
for item in recSer:
    print(str(item[1])+'\t'+str(item[0]))

2	2.6940880130655303
333	2.578588791694405
63	2.534185053305283
74	2.4220558731144015
45	2.3926993687419325
377	2.389226418767394
365	2.377949521253248
60	2.351251444610313
309	2.3421352988707778
382	2.335101655122364


Recommand for user 347 with numItems is 10 and k is 5 :

In [49]:
recSer = itemCF_recommend(uiMatrixNorm,347,iiSimMatrix,10,5)
for item in recSer:
    print(str(item[1])+'\t'+str(item[0]))

110	2.4316732398110568
32	2.3933770195350594
283	2.306522929299524
28	2.260293769397071
2	2.2374245797341943
162	2.1982623200886406
158	2.19623295894005
147	2.1793633688925107
333	2.1576284802823205
35	2.1450251965960954


Recommand for user 347 with numItems is 10 and k is 10 :

In [53]:
recSer = itemCF_recommend(uiMatrixNorm,347,iiSimMatrix,10,10)
for item in recSer:
    print(str(item[1])+'\t'+str(item[0]))

2	2.2374245797341943
333	2.121925358363069
63	2.0775216199739472
74	1.9653924397830658
377	1.932562985436058
32	1.921545671570162
365	1.9212860879219125
60	1.894588011278977
309	1.8854718655394418
382	1.8784382217910278
