In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta

In [2]:
# Never truncate long text cells (headlines, URLs) when a DataFrame is displayed.
pd.options.display.max_colwidth = None

In [3]:
# Load the Udemy course catalogue and discard every row that has a missing field.
df_courses = pd.read_csv("Udemy.csv").dropna()
# Store course ids as integers.
df_courses['id'] = df_courses['id'].astype(int)
# Preview the first rows of the cleaned catalogue.
df_courses.head()

Unnamed: 0,id,title,is_paid,price,headline,num_subscribers,avg_rating,num_reviews,num_comments,num_lectures,content_length_min,published_time,last_update_date,category,subcategory,topic,language,course_url,instructor_name,instructor_url
0,4715,Online Vegan Vegetarian Cooking School,True,24.99,"Learn to cook delicious vegan recipes. Filmed over 15 years ago, watch the first 2hrs FREE to see if it's right for you.",2231.0,3.75,134.0,42.0,37.0,1268.0,2010-08-05T22:06:13Z,2020-11-06,Lifestyle,Food & Beverage,Vegan Cooking,English,/course/vegan-vegetarian-cooking-school/,Angela Poch,/user/angelapoch/
2,5664,"How To Become a Vegan, Vegetarian, or Flexitarian",True,19.99,Get the tools you need for a lifestyle change that will bring you health and a clear conscience.,1713.0,4.4,41.0,13.0,14.0,82.0,2010-10-13T18:07:17Z,2019-10-09,Lifestyle,Other Lifestyle,Vegan Cooking,English,/course/see-my-personal-motivation-for-becoming-vegetarian/,Angela Poch,/user/angelapoch/
3,7723,How to Train a Puppy,True,199.99,"Train your puppy the right way with Dr. Ian Dunbar. Includes 13 videos, 4 books, and 16 behavior blueprints.",4988.0,4.8,395.0,88.0,36.0,1511.0,2011-06-20T20:08:38Z,2016-01-13,Lifestyle,Pet Care & Training,Pet Training,English,/course/complete-dunbar-collection/,Ian Dunbar,/user/ian-dunbar/
5,8139,14-Day Yoga Detox and Empowerment Course,True,29.99,"Lose weight, get healthier and fit on all levels in just 14 days with Sadie Nardini",20505.0,4.53012,796.0,135.0,31.0,1163.0,2011-07-15T04:13:24Z,2018-05-22,Health & Fitness,Yoga,Yoga,English,/course/yoga-for-weight-loss-and-core-strength-with-sadie-nardini/,Sadie Nardini,/user/sadienardini/
6,2762,Simple Strategy for Swing Trading the Stock Market,True,39.99,Use my favorite Technical Indicator and the Trading Strategy I've developed for Swing Trading Stocks,3309.0,3.85,958.0,241.0,8.0,80.0,2010-04-14T16:32:46Z,2019-03-07,Finance & Accounting,Investing & Trading,Swing Trading,English,/course/swing-trading-the-stock-market/,Tom Watson,/user/tomwatson/


In [4]:
# Distinct course categories, in order of first appearance in df_courses.
# These double as the pool of interests a generated user can have.
all_interests = df_courses['category'].unique().tolist()

# Pool of fake user names. Every name appears exactly once so that
# random.choice() draws each of them with equal probability.
fake_names = [
    'Alyssa Perez', 'Justin Peters', 'Brittany Brown', 'Robert Morris', 'John Smith',
    'Tara Mitchell', 'Megan Garcia', 'Heather Perry', 'Brenda Gonzales', 'Erik Johnson',
    'Samantha Thomas', 'Cynthia Kelly', 'Stephanie Hernandez', 'Chad Lewis', 'Sara Ortiz',
    'Kara Johnson', 'Michael Anderson', 'Ryan Nguyen', 'Gabriel Rodriguez', 'David Wright',
    'Mariah Johnson', 'Jasmine Moore', 'Jessica Martinez', 'Jordan Williams', 'Melissa Ortiz',
    'Carla Rodriguez', 'Lindsay Brown', 'Teresa Hernandez', 'Mason Anderson', 'Mark Thompson',
    'Makayla Wilson', 'Dustin Williams', 'Nicholas Rodriguez', 'Jeffrey Perez', 'Jeremy Johnson',
    'Emily Scott', 'Brooke Jones', 'Justin Lewis', 'Jacob Martin', 'Rebecca Wilson',
    'Anthony Wright', 'Daniel Robinson', 'Diana Martinez', 'Derek Smith', 'Katie Jones',
    'Caleb Taylor', 'Kevin Williams', 'Cassandra Ramirez', 'Erica Smith', 'Matthew Martin',
    'Lauren Hernandez', 'Olivia Perez', 'Joshua Clark', 'Trevor Gonzalez', 'Lauren Ramirez',
    'Richard Wilson', 'Karen Lee', 'Samantha Wilson', 'Victoria Smith', 'Brandon Gonzalez',
    'Brianna Taylor', 'Jennifer Lee', 'Randy Hill', 'Rachel Perez', 'Kaitlyn Taylor',
    'Nathan Rodriguez', 'Lily Allen', 'Adam Davis', 'Haley Gonzalez', 'Jacob Taylor',
    'Erin Smith', 'Kristen Lee', 'Sarah Lee', 'Rebecca Walker', 'Chloe Taylor',
    'Gabrielle Clark', 'Jacob Hernandez', 'Courtney Martinez', 'Alexander Walker', 'Mariah Garcia',
    'Sabrina Davis', 'Travis Hill', 'Emily Ramirez', 'Maggie Perez', 'Victoria Davis',
    'Steven Johnson', 'Tara Hill', 'Taylor Rodriguez', 'Eric Brown', 'Matthew Allen',
    'David Allen', 'Eva Mitchell', 'Isabella Lewis', 'Nicole Thomas', 'Samantha Jones',
    'Brooke Wright', 'Danielle Thompson',
]

# Pool of fake ages: one random age between 20 and 60 (both inclusive) per name.
fake_ages = [random.randint(20, 60) for _ in range(len(fake_names))]

In [5]:
def generate_rating(mean=4.5, std_dev=0.5, min_rating=3.0, max_rating=5.0, step=0.5):
    """Draw a random course rating.

    A value is sampled from a normal distribution, snapped to the nearest
    multiple of ``step`` and then clipped to ``[min_rating, max_rating]``.
    With the defaults the result is one of 3.0, 3.5, 4.0, 4.5 or 5.0.

    Args:
        mean: Mean of the normal distribution.
        std_dev: Standard deviation of the normal distribution.
        min_rating: Lowest rating that can be returned.
        max_rating: Highest rating that can be returned.
        step: Rating granularity; must be positive.

    Returns:
        The rating, always as a ``float`` (also when it was clipped to a bound).

    Raises:
        ValueError: If ``step`` is not positive or ``min_rating > max_rating``.
    """
    if step <= 0:
        raise ValueError("step must be positive")
    if min_rating > max_rating:
        raise ValueError("min_rating must not exceed max_rating")

    raw = random.gauss(mean, std_dev)
    # Snap to the rating grid (for step=0.5 this is exactly round(raw * 2) / 2).
    snapped = round(raw / step) * step
    # float() keeps the return type uniform even if the bounds are given as ints.
    return float(min(max(snapped, min_rating), max_rating))

In [6]:
def generate_timestamp():
    """Return a random timestamp string from between two years and one year ago.

    The instant is drawn uniformly between 730 and 365 days before the current
    local time and rendered as ``YYYY-MM-DD HH:MM:SS``.
    """
    current = datetime.now()
    window_start = (current - timedelta(days=365*2)).timestamp()
    window_end = (current - timedelta(days=365)).timestamp()

    # Pick a point in the window as epoch seconds, then convert back to a datetime.
    epoch_seconds = random.uniform(window_start, window_end)
    return datetime.fromtimestamp(epoch_seconds).strftime("%Y-%m-%d %H:%M:%S")

## Generates fake user profiles and user ratings

In [7]:
def generate_user_profiles(num_users):
    """Generate fake user profiles together with their course ratings.

    Every user gets a random name and age, one preferred category and, with
    probability 1/2, one additional category. The user then rates 3-6 randomly
    sampled courses of the preferred category and 1-3 courses of the additional
    one. Both counts are capped by the number of courses in that category.

    Relies on the notebook-level names ``df_courses``, ``all_interests``,
    ``fake_names``, ``fake_ages``, ``generate_rating`` and ``generate_timestamp``.

    Args:
        num_users: Number of profiles to create; user ids are ``0 .. num_users - 1``.

    Returns:
        A tuple ``(users, user_ratings, rated_courses)`` where

        * ``users`` is a list of dicts with keys ``id``, ``name``, ``age``,
          ``interests`` and ``courses`` (a list of
          ``(course id, rating, timestamp)`` tuples),
        * ``user_ratings`` is a list of dicts with keys ``userId``,
          ``courseId``, ``rating`` and ``timestamp``,
        * ``rated_courses`` is a DataFrame with the columns of ``df_courses``
          holding one row per generated rating (a course can appear repeatedly).
    """
    users = []
    user_ratings = []
    # Sampled course frames are collected here and concatenated once at the end;
    # concatenating inside the loop copies the growing frame on every iteration.
    sampled_frames = []

    # Loop-invariant: distinct categories in first-appearance order. Building a
    # set per user would be wasteful and would make the draw below depend on
    # string-hash ordering, which differs between interpreter runs.
    all_categories = df_courses['category'].unique().tolist()
    # Cache of the course rows per category, filled on first use.
    courses_by_category = {}

    for i in range(num_users):
        # Randomly select a name and an age for the user (drawn independently)
        name, age = random.choice(fake_names), random.choice(fake_ages)

        # Randomly select the preferred interest and zero or one additional interest
        preferred_interest = random.choice(all_interests)
        other_interests = [c for c in all_categories if c != preferred_interest]
        # Cap at the number of available categories so a single-category catalogue works
        extra_count = min(random.randint(0, 1), len(other_interests))
        user_interests = [preferred_interest] + random.sample(other_interests, extra_count)

        # Select and rate courses for each of the user's interests
        user_courses = []
        for interest in user_interests:
            if interest not in courses_by_category:
                courses_by_category[interest] = df_courses[df_courses['category'] == interest]
            category_courses = courses_by_category[interest]
            if category_courses.empty:
                continue

            # 3-6 courses for the preferred interest, 1-3 for an additional one,
            # never more than the category contains (sampling is without replacement).
            low, high = (3, 6) if interest == preferred_interest else (1, 3)
            course_count = min(random.randint(low, high), len(category_courses))
            courses = category_courses.sample(course_count)
            sampled_frames.append(courses)

            for course_id in courses['id'].tolist():
                rating = generate_rating()
                timestamp = generate_timestamp()
                user_ratings.append({'userId': i, 'courseId': course_id, 'rating': rating, 'timestamp': timestamp})
                user_courses.append((course_id, rating, timestamp))

        # Compile the user profile into a dictionary
        user_profile = {'id':i, 'name': name, 'age': age, 'interests': user_interests, 'courses': user_courses}
        users.append(user_profile)

    if sampled_frames:
        rated_courses = pd.concat(sampled_frames, ignore_index=True)
    else:
        # Nothing was sampled (e.g. num_users == 0): keep the expected columns
        rated_courses = pd.DataFrame(columns=df_courses.columns)
    return users, user_ratings, rated_courses

In [None]:
# Number of fake users to generate (lower this for a quick trial run)
NUM_USERS = 50000

# Generate the fake user profiles, their ratings and the sampled course rows
users, user_ratings, rated_courses = generate_user_profiles(NUM_USERS)

# Convert the user profiles and the ratings to DataFrames
df_users = pd.DataFrame(users, columns=['id','name', 'age','interests','courses'])
df_user_ratings = pd.DataFrame(user_ratings, columns=['userId','courseId','rating','timestamp'])

# Preview the generated ratings
df_user_ratings.head()

In [None]:
# Persist the generated user profiles and their ratings as Excel workbooks.
for frame, filename in ((df_users, 'User_Profiles.xlsx'), (df_user_ratings, 'User_Ratings.xlsx')):
    frame.to_excel(filename)

In [None]:
# Export the course rows returned by generate_user_profiles, written with the
# xlsxwriter engine instead of pandas' default Excel writer.
rated_courses.to_excel('Rated_Courses.xlsx', engine='xlsxwriter')