In [55]:
import os
import pandas as pd

In [56]:
main_dir = r'C:\Users\ashwi\Documents\datascience\data'

user_df = pd.read_csv(os.path.join(main_dir, 'raw_User.csv'))
department_df = pd.read_csv(os.path.join(main_dir, 'raw_Department.csv'))
course_df = pd.read_csv(os.path.join(main_dir, 'raw_Course.csv'))
course_department_df = pd.read_csv(os.path.join(main_dir, 'raw_CourseDepartment.csv'))
course_users_df = pd.read_csv(os.path.join(main_dir, 'raw_CourseUser.csv'))
skill_df = pd.read_csv(os.path.join(main_dir, 'raw_Skill.csv'))
skill_department_df = pd.read_csv(os.path.join(main_dir, 'raw_SkillDepartment.csv'))
skill_user_df = pd.read_csv(os.path.join(main_dir, 'raw_SkillUSers.csv'))


In [57]:
# Data handling

user_df = user_df.drop(columns=['first_name', 'last_name', 'email', 'password', 'account_type', 'createdAt', 'updatedAt', 'data_loaded_at'])
user_df = user_df[user_df['dept_id'] != 1]


In [58]:
user_df.head()

Unnamed: 0,user_id,dept_id
1,2,2
2,3,2
3,4,2
4,5,2
5,6,2


In [59]:
skill_user_df = skill_user_df.drop(columns=['data_loaded_at'])
course_users_df = course_users_df.drop(columns=['data_loaded_at'])


In [60]:
user_skills = skill_user_df.groupby('user_id')['skill_id'].apply(list).reset_index()
user_courses = course_users_df.groupby('user_id')['course_id'].apply(list).reset_index()

In [61]:
user_skills.columns = ['user_id', 'skills']
user_courses.columns = ['user_id', 'courses']

In [62]:
user_combined = pd.merge(user_df, user_skills, on='user_id', how='left')
user_combined = pd.merge(user_combined, user_courses, on='user_id', how='left')

In [63]:
null_columns = user_combined.isnull().sum()
null_columns

user_id     0
dept_id     0
skills     29
courses    29
dtype: int64

In [64]:
user_combined['skills'] = user_combined['skills'].apply(lambda x: x if isinstance(x, list) else [])
user_combined['courses'] = user_combined['courses'].apply(lambda x: x if isinstance(x, list) else [])


In [65]:
null_columns = user_combined.isnull().sum()
null_columns

user_id    0
dept_id    0
skills     0
courses    0
dtype: int64

In [66]:
user_combined['skills'] = user_combined['skills'].apply(lambda skills: ['skill_' + str(skill) for skill in skills])
user_combined['courses'] = user_combined['courses'].apply(lambda courses: ['course_' + str(course) for course in courses])

In [67]:
user_combined['features'] = user_combined.apply(lambda x: x['skills'] + x['courses'], axis=1)

In [68]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

In [69]:
mlb = MultiLabelBinarizer()
user_features = mlb.fit_transform(user_combined['features'])


In [71]:
user_feature_df = pd.DataFrame(user_features, columns=mlb.classes_)
user_feature_df['user_id'] = user_combined['user_id'].values
user_feature_df.set_index('user_id', inplace=True)

Unnamed: 0_level_0,course_1,course_10,course_100,course_1000,course_1001,course_1002,course_1003,course_1004,course_1005,course_1006,...,skill_90,skill_91,skill_92,skill_93,skill_94,skill_95,skill_96,skill_97,skill_98,skill_99
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
796,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
797,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(user_feature_df)

similarity_df = pd.DataFrame(similarity_matrix, index=user_feature_df.index, columns=user_feature_df.index)

similarity_df.head()

user_id,2,3,4,5,6,7,8,9,10,11,...,790,791,792,793,794,795,796,797,798,799
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1.0,0.917646,0.924054,0.915975,0.929553,0.836123,0.886506,0.913296,0.829684,0.919815,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.917646,1.0,0.956664,0.951145,0.961771,0.867487,0.911647,0.946953,0.866637,0.951335,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.924054,0.956664,1.0,0.964215,0.977497,0.879371,0.924925,0.965009,0.8874,0.972423,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.915975,0.951145,0.964215,1.0,0.969238,0.871828,0.920298,0.956379,0.878157,0.957382,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.929553,0.961771,0.977497,0.969238,1.0,0.881871,0.930414,0.968411,0.886324,0.97092,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
def get_top_n_similar_users_by_dept(similarity_df, user_id, user_combined, n=5):
    """
    Returns the top N most similar users for a given user_id, filtered by department.
    
    Args:
    - similarity_df: DataFrame containing the similarity matrix.
    - user_id: The user for whom we are finding similar users.
    - user_combined: DataFrame with users and their department IDs.
    - n: The number of top similar users to return.
    
    Returns:
    - A Series containing the top N similar users from the same department and their similarity scores.
    """
    # Get the department ID of the target user
    user_dept_id = user_combined[user_combined['user_id'] == user_id]['dept_id'].values[0]
    
    # Get similar users and their similarity scores
    similar_users = similarity_df[user_id].sort_values(ascending=False).drop(user_id)
    
    # Filter similar users by department
    similar_users_in_dept = similar_users[similar_users.index.isin(user_combined[user_combined['dept_id'] == user_dept_id]['user_id'])]
    
    return similar_users_in_dept.head(n)

# Example usage: Get the top similar users from the same department for user_id 678
top_similar_users_by_dept = get_top_n_similar_users_by_dept(similarity_df, 678, user_combined, n=5)
print(top_similar_users_by_dept)

user_id
71    0.620563
72    0.608130
81    0.606655
70    0.591882
74    0.587923
Name: 678, dtype: float64


In [88]:
def recommend_courses_for_user(user_id, top_similar_users, user_combined, course_department_df):
    """
    Recommends courses to a user based on the courses of their similar users.
    
    Args:
    - user_id: The target user for whom recommendations are made.
    - top_similar_users: Series of top similar users for the target user.
    - user_combined: DataFrame with users and their enrolled courses.
    - course_department_df: DataFrame with course-department mapping.
    
    Returns:
    - A list of recommended courses.
    """
    # Get the courses the target user is already enrolled in
    target_user_courses = set(user_combined[user_combined['user_id'] == user_id]['courses'].values[0])
    
    # Get the department the user belongs to
    user_dept_id = user_combined[user_combined['user_id'] == user_id]['dept_id'].values[0]
    
    # Get courses enrolled by similar users
    similar_users_courses = set()
    for similar_user in top_similar_users.index:
        courses = user_combined[user_combined['user_id'] == similar_user]['courses'].values[0]
        similar_users_courses.update(courses)
    
    # Remove courses the target user is already enrolled in
    recommended_courses = similar_users_courses - target_user_courses
    
    # Filter by department access (only recommend courses the user has access to)
    accessible_courses = course_department_df[course_department_df['dept_id'] == user_dept_id]['course_id'].values
    recommended_courses = [course for course in recommended_courses if course in accessible_courses]
    
    return recommended_courses

# Example usage: Recommend courses for user_id 2 based on their top similar users
recommended_courses = recommend_courses_for_user(678, top_similar_users_by_dept, user_combined, course_department_df)
print(recommended_courses)

[]


In [89]:
# Initialize a counter for users who received recommendations
users_with_recommendations_count = 0

# Loop through user IDs from 1 to 799
for user_id in range(1, 800):
    try:
        # Get top similar users for the current user
        top_similar_users_by_dept = get_top_n_similar_users_by_dept(similarity_df, user_id, user_combined, n=5)
        
        # Recommend courses for the current user based on their top similar users
        recommended_courses = recommend_courses_for_user(user_id, top_similar_users_by_dept, user_combined, course_department_df)
        
        # Check if any courses were recommended
        if recommended_courses:
            users_with_recommendations_count += 1
    except Exception as e:
        # Handle the case where user_id may not be present
        print(f"User ID {user_id} not found or an error occurred: {e}")

# Print the total number of users that received recommendations
print(f"Total number of users who received recommendations: {users_with_recommendations_count}")


User ID 1 not found or an error occurred: index 0 is out of bounds for axis 0 with size 0
User ID 100 not found or an error occurred: index 0 is out of bounds for axis 0 with size 0
User ID 200 not found or an error occurred: index 0 is out of bounds for axis 0 with size 0
User ID 300 not found or an error occurred: index 0 is out of bounds for axis 0 with size 0
User ID 400 not found or an error occurred: index 0 is out of bounds for axis 0 with size 0
User ID 500 not found or an error occurred: index 0 is out of bounds for axis 0 with size 0
User ID 600 not found or an error occurred: index 0 is out of bounds for axis 0 with size 0
User ID 700 not found or an error occurred: index 0 is out of bounds for axis 0 with size 0
Total number of users who received recommendations: 0
