In [None]:
# Mount Google Drive at /content/drive. The dataset path read later in this
# notebook lives under this mount point, so this cell must run first.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import random
import re

import numpy as np
import pandas as pd

# Load the dataset
file_path = '/content/drive/MyDrive/Colab Notebooks/Online_courses_final1_updated.csv'
data = pd.read_csv(file_path)
# Missing instructors become "" so later membership tests never see NaN.
data['Instructors'] = data['Instructors'].fillna("")

# Seed both generators: `random` drives the user attributes and ratings,
# NumPy's global state drives DataFrame.sample(). Without this the generated
# dataset differs on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# The category universe does not change between users, so compute it once
# instead of rebuilding it from the DataFrame on every iteration.
all_categories = data['Category'].dropna().unique().tolist()
# Never request more interests than there are categories (random.sample
# raises ValueError when k exceeds the population size).
max_interests = min(3, len(all_categories))

users = []
for i in range(1000):
    # Randomly assign 1-3 user interests (none if the catalogue has no categories)
    if all_categories:
        interests = random.sample(all_categories, k=random.randint(1, max_interests))
    else:
        interests = []
    interest_courses = data[data['Category'].isin(interests)]

    if random.random() < 0.75:  # 75% probability of English
        preferred_language = 'English'
    else:
        # Otherwise use the most common language among the user's interest
        # courses. value_counts() ignores NaN, so it can be empty even when
        # interest_courses is not; fall back to English in that case.
        language_counts = interest_courses['Language'].value_counts()
        preferred_language = language_counts.idxmax() if not language_counts.empty else 'English'

    user = {
        'user_id': i,
        'interests': interests,
        'skill_level': random.choice(['Beginner', 'Intermediate', 'Advanced']),
        'language': preferred_language  # Set the preferred language
    }

    # Skip the "" placeholder introduced by fillna above: a course with no
    # recorded instructor must not count as taught by a preferred instructor.
    available_instructors = [name for name in interest_courses['Instructors'].unique() if name]
    user['preferred_instructors'] = random.sample(available_instructors, k=min(2, len(available_instructors)))

    users.append(user)

def assign_courses(user, df):
    """Simulate course enrolments and ratings for a single synthetic user.

    Up to 10 distinct courses are drawn from ``df``. Each draw prefers courses
    whose dominant attribute (``Category`` or ``Skills``, chosen once per user)
    matches one of the user's interests, then prefers the user's language.

    Args:
        user: dict providing 'user_id', 'interests' (list of str),
            'language' (str) and 'preferred_instructors' (list).
        df: course catalogue with the columns 'CourseID', 'Category',
            'Skills', 'Language' and 'Instructors'.

    Returns:
        pandas.DataFrame with one row per assigned course and the columns
        user_id, course_id, rating, language, preferred_instructor. Fewer
        than 10 rows are returned when the catalogue runs out of courses.
    """
    user_courses = []
    assigned_courses = set()  # Track assigned courses to avoid duplicates
    dominant_attr = random.choices(['Category', 'Skills'], weights=[0.6, 0.4], k=1)[0]

    # Interests are literal names, not regular expressions. Escape them so
    # names such as "C++" or "R (language)" neither raise re.error nor match
    # unintended text when used with str.contains below.
    interest_pattern = '|'.join(re.escape(interest) for interest in user['interests'])

    for _ in range(10):  # Assuming each user enrolls in 10 courses
        # Compute the not-yet-assigned pool once per iteration and reuse it.
        unassigned = df[~df['CourseID'].isin(assigned_courses)]
        if unassigned.empty:
            break  # catalogue exhausted for this user

        dominant_courses = unassigned[unassigned[dominant_attr].isin(user['interests'])]
        dominant_prob = random.uniform(0.5, 0.7)

        if random.random() < dominant_prob and not dominant_courses.empty:
            course = dominant_courses.sample(1).iloc[0]
        else:
            course = unassigned.sample(1).iloc[0]

        # NOTE(review): when any interest-matching course also mentions an
        # interest in its Skills text, it replaces the pick above, including
        # the random "exploration" pick. Behaviour kept as-is; confirm intent.
        if not dominant_courses.empty:
            secondary_courses = dominant_courses[
                dominant_courses['Skills'].str.contains(interest_pattern, na=False)
            ]
            if not secondary_courses.empty:
                course = secondary_courses.sample(1).iloc[0]

        # Language preference check. A missing Language is NaN (a float), for
        # which the substring test would raise TypeError; treat it as a
        # non-match instead.
        course_language = course['Language']
        if isinstance(course_language, str) and user['language'] in course_language:
            selected_course = course
        else:
            language_courses = unassigned[unassigned['Language'] == user['language']]
            selected_course = language_courses.sample(1).iloc[0] if not language_courses.empty else course

        # Mark course as assigned to this user
        assigned_courses.add(selected_course['CourseID'])

        # Assign rating based on course relevance: preferred instructor first,
        # then a dominant-attribute match, otherwise a low rating.
        is_preferred = selected_course['Instructors'] in user['preferred_instructors']
        if is_preferred:
            rating = round(random.uniform(4.5, 5.0), 1)
        elif selected_course[dominant_attr] in user['interests']:
            rating = round(random.uniform(3.5, 4.5), 1)
        else:
            rating = round(random.uniform(2.0, 3.5), 1)

        user_courses.append({
            'user_id': user['user_id'],
            'course_id': selected_course['CourseID'],
            'rating': rating,
            'language': selected_course['Language'],
            'preferred_instructor': is_preferred
        })

    return pd.DataFrame(user_courses)

# Collect each user's assignments first and concatenate once. Growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on
# every iteration (quadratic), and concatenating with an empty seed frame
# triggers a pandas deprecation warning.
per_user_frames = []
for user in users:
    user_frame = assign_courses(user, data)
    if not user_frame.empty:
        per_user_frames.append(user_frame)

# pd.concat raises ValueError on an empty list, so keep an empty frame then.
if per_user_frames:
    user_courses_df = pd.concat(per_user_frames, ignore_index=True)
else:
    user_courses_df = pd.DataFrame()

# Persist the generated ratings to the working directory of the runtime.
user_courses_df.to_csv('user_course_ratings1.csv', index=False)
print("Dataset generated with user-specific course assignments and ratings for 1000 users.")



In [None]:
# Hand the generated 'user_course_ratings1.csv' (written by the generation
# cell of this notebook) to Colab's files helper for download.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import files
files.download('user_course_ratings1.csv')