In [1]:
import os
import pandas as pd

In [2]:
main_dir = r'C:\Users\AshwinR\Documents\project\datascience\data'

user_df = pd.read_csv(os.path.join(main_dir, 'raw_User.csv'))
department_df = pd.read_csv(os.path.join(main_dir, 'raw_Department.csv'))
course_df = pd.read_csv(os.path.join(main_dir, 'raw_Course.csv'))
course_department_df = pd.read_csv(os.path.join(main_dir, 'raw_CourseDepartment.csv'))
course_users_df = pd.read_csv(os.path.join(main_dir, 'raw_CourseUser.csv'))
skill_df = pd.read_csv(os.path.join(main_dir, 'raw_Skill.csv'))
skill_department_df = pd.read_csv(os.path.join(main_dir, 'raw_SkillDepartment.csv'))
skill_user_df = pd.read_csv(os.path.join(main_dir, 'raw_SkillUSers.csv'))


# Collabrative Filtering

In [3]:
# Data handling

user_df = user_df.drop(columns=['first_name', 'last_name', 'email', 'password', 'account_type', 'createdAt', 'updatedAt', 'data_loaded_at'])
user_df = user_df[user_df['dept_id'] != 1]

skill_user_df = skill_user_df.drop(columns=['data_loaded_at'])
course_users_df = course_users_df.drop(columns=['data_loaded_at'])


In [4]:
user_skills = skill_user_df.groupby('user_id')['skill_id'].apply(list).reset_index()
user_courses = course_users_df.groupby('user_id')['course_id'].apply(list).reset_index()

user_skills.columns = ['user_id', 'skills']
user_courses.columns = ['user_id', 'courses']

In [5]:
user_combined = pd.merge(user_df, user_skills, on='user_id', how='left')
user_combined = pd.merge(user_combined, user_courses, on='user_id', how='left')

In [6]:
user_combined['skills'] = user_combined['skills'].apply(lambda x: x if isinstance(x, list) else [])
user_combined['courses'] = user_combined['courses'].apply(lambda x: x if isinstance(x, list) else [])


In [7]:
user_combined['skills'] = user_combined['skills'].apply(lambda skills: ['skill_' + str(skill) for skill in skills])
user_combined['courses'] = user_combined['courses'].apply(lambda courses: ['course_' + str(course) for course in courses])

user_combined['features'] = user_combined.apply(lambda x: x['skills'] + x['courses'], axis=1)

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
user_features = mlb.fit_transform(user_combined['features'])

In [9]:
user_feature_df = pd.DataFrame(user_features, columns=mlb.classes_)
user_feature_df['user_id'] = user_combined['user_id'].values
user_feature_df.set_index('user_id', inplace=True)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(user_feature_df)

similarity_df = pd.DataFrame(similarity_matrix, index=user_feature_df.index, columns=user_feature_df.index)


In [11]:
def get_top_n_similar_users(similarity_df, user_id, n=5):
    """
    Returns the top N most similar users for a given user_id, without department filtering.
    
    Args:
    - similarity_df: DataFrame containing the similarity matrix.
    - user_id: The user for whom we are finding similar users.
    - n: The number of top similar users to return.
    
    Returns:
    - A Series containing the top N similar users and their similarity scores.
    """
    similar_users = similarity_df[user_id].sort_values(ascending=False).drop(user_id)
    return similar_users.head(n)

In [12]:
def recommend_courses_for_user(user_id, top_similar_users, user_combined, course_df, top_n=5):
    """
    Recommends courses to a user based on the courses of their similar users, without department filtering.
    
    Args:
    - user_id: The target user for whom recommendations are made.
    - top_similar_users: Series of top similar users for the target user.
    - user_combined: DataFrame with users and their enrolled courses.
    - course_df: DataFrame containing course details (course_id, course_name).
    - top_n: Number of top recommended courses to return.
    
    Returns:
    - A DataFrame with the top N recommended courses (course_id, course_name).
    """
    target_user_courses = set(user_combined[user_combined['user_id'] == user_id]['courses'].values[0])
    
    similar_users_courses = set()
    for similar_user in top_similar_users.index:
        courses = user_combined[user_combined['user_id'] == similar_user]['courses'].values[0]
        similar_users_courses.update(courses)
    
    recommended_courses = similar_users_courses - target_user_courses
    
    recommended_df = pd.DataFrame({
        'course_id': [int(course.split('_')[1]) for course in recommended_courses],  
        'score': [1] * len(recommended_courses)  
    })
    
    recommended_df = recommended_df.merge(course_df[['course_id', 'course_name']], on='course_id', how='left')
    
    return recommended_df[['course_id', 'course_name']].head(top_n)


# Content Filtering

In [13]:
# Loading dataset again
main_dir = r'C:\Users\AshwinR\Documents\project\datascience\data'

course_df = pd.read_csv(os.path.join(main_dir, 'raw_Course.csv'))

In [14]:
course_df = course_df[['course_id', 'course_name', 'course_desc', 'course_creator']]

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

course_df['course_features'] = course_df['course_name'] + " " + course_df['course_desc']

tfidf = TfidfVectorizer(stop_words='english')
course_tfidf_matrix = tfidf.fit_transform(course_df['course_features'])

cosine_sim = cosine_similarity(course_tfidf_matrix, course_tfidf_matrix)

In [16]:
def recommend_courses(course_ids, num_recommendations=5):
    if not course_ids:
        return pd.DataFrame(columns=['course_id', 'course_name'])

    score_dict = {}
    
    for course_id in course_ids:
        idx = course_df.index[course_df['course_id'] == course_id].tolist()
        
        if idx:  
            idx = idx[0]
            sim_scores = list(enumerate(cosine_sim[idx]))
            
            for i, score in sim_scores:
                if i in score_dict:
                    score_dict[i] += score
                else:
                    score_dict[i] = score

    sorted_scores = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
    top_courses = sorted_scores[:num_recommendations]

    course_indices = [i[0] for i in top_courses]
    
    return course_df.iloc[course_indices][['course_id', 'course_name']]

In [19]:
def recommend_courses_for_user_combined(user_id, course_ids, user_combined, course_df, num_recommendations=10):
    """
    Recommends courses to a user by combining collaborative filtering and content-based filtering.
    
    Args:
    - user_id: The target user for whom recommendations are made.
    - course_ids: List of course_ids that the user is interested in.
    - user_combined: DataFrame with users and their enrolled courses.
    - course_df: DataFrame containing course details (course_id, course_name).
    - top_similar_users: Series of top similar users for the target user from collaborative filtering.
    - num_recommendations: Total number of recommendations to return.
    
    Returns:
    - A DataFrame with combined course recommendations (course_id, course_name).
    """
    similar_users = get_top_n_similar_users(similarity_df, user_id, n=5)

    collaborative_recommendations = recommend_courses_for_user(user_id, similar_users, user_combined, course_df, top_n=num_recommendations)

    content_recommendations = recommend_courses(course_ids, num_recommendations)
    
    combined_recommendations = pd.concat([collaborative_recommendations, content_recommendations]).drop_duplicates().reset_index(drop=True)
    
    if len(combined_recommendations) > num_recommendations:
        return combined_recommendations.head(num_recommendations)
    
    return combined_recommendations

user_id = 678
course_ids = [1, 2, 3]  
final_recommendations = recommend_courses_for_user_combined(user_id, course_ids, user_combined, course_df, num_recommendations=10)
print(final_recommendations)

   course_id                                        course_name
0       1132         Practical Introduction to the Command Line
1        404      Blockchain Opportunities Beyond Crypto Assets
2        755  Elastic Google Cloud Infrastructure: Scaling a...
3         33  Programming Mobile Applications for Android Ha...
4       2355  Innovation Through Design: Think, Make, Break,...
5          7       Agile Projects:  Developing Tasks with Taiga
6       1404    Story and Narrative Development for Video Games
7       1937              Introduction to Meteor.js Development
8        133           Master Class for Corporate Entrepreneurs
9        869            Design a Client Welcome Kit using Canva
