In [30]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
data = pd.read_csv('/Users/antropravin/Desktop/Bezohminds/Task/Movie Recommendation System/Cleaned Movie Data.csv')

In [31]:
num_users = 5000
min_watched = 10
max_watched = 50
like_threshold = 7


In [32]:
def generate_user_profiles(movies, num_users):
    user_data = []
    
    for user_id in range(1, num_users + 1):
        # Randomly select number of watched movies
        num_watched = random.randint(min_watched, max_watched)
        watched_movies = movies.sample(num_watched)

        # Determine liked and disliked movies based on vote_average
        liked_movies = watched_movies[watched_movies['vote_average'] > like_threshold]
        disliked_movies = watched_movies[watched_movies['vote_average'] <= like_threshold]
        liked_movie_ids = liked_movies['movie_id'].tolist()
        disliked_movie_ids = disliked_movies['movie_id'].tolist()

        # Extract genres from liked and disliked movies
        all_liked_genres = ','.join(liked_movies['genres'].tolist()).split(',')
        liked_genres = list(set([genre.strip() for genre in all_liked_genres if genre]))
        
        all_disliked_genres = ','.join(disliked_movies['genres'].tolist()).split(',')
        disliked_genres = list(set([genre.strip() for genre in all_disliked_genres if genre]))

        # Determine preferred language (most frequent one from watched movies)
        preferred_language = watched_movies['original_language'].mode()[0]

        # Generate synthetic timestamps for when movies were watched
        watch_timestamps = [
            (datetime.now() - timedelta(days=random.randint(1, 365))).strftime("%Y-%m-%d")
            for _ in range(num_watched)
        ]

        # Generate synthetic user ratings, higher for liked movies
        user_ratings = [
            random.randint(7, 10) if movie in liked_movie_ids else random.randint(1, 6)
            for movie in watched_movies['movie_id'].tolist()
        ]

        # Define viewing frequency based on number of movies watched
        viewing_frequency = "High" if num_watched > 40 else "Medium" if num_watched > 20 else "Low"

        # Extract favorite production companies from liked movies
        fav_production_companies = (
            liked_movies['production_companies']
            .str.split(',')
            .explode()
            .mode()[0]
            if not liked_movies.empty else None
        )

        # Store user data
        user_data.append({
            'user_id': user_id,
            'watched_movies': watched_movies['movie_id'].tolist(),
            'liked_movies': liked_movie_ids,
            'disliked_movies': disliked_movie_ids,
            'liked_genres': liked_genres,
            'disliked_genres': disliked_genres,
            'language_preferred': preferred_language,
            'watch_timestamps': watch_timestamps,
            'user_ratings': user_ratings,
            'viewing_frequency': viewing_frequency,
            'fav_production_companies': fav_production_companies
        })

    return pd.DataFrame(user_data)

In [33]:
user_profiles = generate_user_profiles(movies, num_users)
user_profiles.head(10)

Unnamed: 0,user_id,watched_movies,liked_movies,disliked_movies,liked_genres,disliked_genres,language_preferred,watch_timestamps,user_ratings,viewing_frequency,fav_production_companies
0,1,"[707, 1563, 9085, 970308, 606236, 4281, 711017...","[1563, 9441, 29592, 664416, 309809, 84892, 107...","[707, 9085, 970308, 606236, 4281, 711017, 6912...","[Music, War, Fantasy, Documentary, Comedy, Dra...","[Fantasy, Comedy, Drama, Family, Horror, Anima...",en,"[2024-03-26, 2024-10-31, 2024-11-23, 2024-07-0...","[2, 7, 5, 6, 1, 6, 3, 1, 3, 8, 7, 3, 8, 1, 10,...",Medium,Agencja Produkcji Filmowej
1,2,"[196867, 10710, 346910, 9605, 39217, 554022, 8...","[1417, 745881, 838209, 1050035, 79, 643, 21575...","[196867, 10710, 346910, 9605, 39217, 554022, 8...","[War, Fantasy, Comedy, Western, Drama, Horror,...","[War, Fantasy, Comedy, Drama, Family, Science ...",en,"[2025-01-09, 2024-06-15, 2024-07-06, 2025-03-1...","[4, 6, 6, 6, 1, 3, 3, 10, 5, 10, 10, 3, 7, 2, ...",Medium,AOI Pro.
2,3,"[294016, 425, 1263421, 21191, 6466, 964877, 12...","[294016, 425, 1263421, 21191, 26963, 23483, 96...","[6466, 964877, 1257, 988762, 1443168, 2698, 22...","[Music, Fantasy, Comedy, Drama, Family, Thrill...","[Fantasy, Comedy, Drama, Family, Science Ficti...",en,"[2024-04-25, 2024-03-22, 2024-12-14, 2025-03-1...","[10, 10, 8, 7, 4, 5, 1, 6, 4, 5, 5, 6, 9, 9, 3...",Medium,20th Century Fox
3,4,"[5516, 11615, 1165466, 673309, 11673, 2293, 64...","[11615, 1165466, 673309, 11673, 64682, 11209, ...","[5516, 2293, 479, 892515, 1300962, 43074, 9456...","[Music, War, Western, Comedy, Drama, Family, S...","[Fantasy, Comedy, Drama, Family, Horror, Anima...",en,"[2024-05-18, 2024-09-20, 2024-11-12, 2024-11-0...","[3, 9, 10, 9, 10, 3, 7, 8, 1, 8, 6, 9, 1, 1, 2...",Low,A+E Global Media
4,5,"[13005, 11599, 1019836, 58595, 335797, 600479,...","[11599, 335797, 11209, 467909, 661914, 770, 10...","[13005, 1019836, 58595, 600479, 589761, 635237...","[Music, War, Western, Comedy, Documentary, Fan...","[Music, War, Fantasy, Comedy, Documentary, Dra...",en,"[2024-10-04, 2024-09-26, 2025-03-04, 2025-03-0...","[3, 9, 4, 3, 9, 3, 1, 4, 6, 5, 6, 6, 6, 9, 1, ...",Medium,5000 Broadway Productions
5,6,"[12103, 273248, 955, 1434457, 844185, 487670, ...","[273248, 487670, 697843, 576, 2270, 803, 47555...","[12103, 955, 1434457, 844185, 10168, 827931, 2...","[Adventure, Western, Fantasy, Documentary, Com...","[Fantasy, Documentary, Comedy, Drama, TV Movie...",en,"[2024-10-13, 2024-03-25, 2025-01-21, 2025-02-0...","[3, 8, 2, 4, 6, 8, 7, 3, 6, 10, 3, 9, 8, 3, 4,...",High,Illumination
6,7,"[15877, 986280, 11072, 11797, 7511, 894246, 96...","[986280, 11072, 11797, 609, 587092, 877957, 68...","[15877, 7511, 894246, 9621, 522444, 10219, 490...","[War, Western, Comedy, Fantasy, Drama, TV Movi...","[Music, War, Fantasy, Comedy, Drama, Family, S...",en,"[2025-02-06, 2024-08-19, 2024-09-01, 2024-05-1...","[5, 8, 7, 7, 4, 5, 5, 4, 1, 8, 4, 6, 3, 5, 6, ...",High,Abano Productions
7,8,"[1104171, 964426, 895659, 7450, 280092, 7548, ...","[11524, 861072, 297270, 3176, 457, 146, 799583...","[1104171, 964426, 895659, 7450, 280092, 7548, ...","[Music, War, Comedy, Documentary, Drama, Famil...","[Music, War, Fantasy, Comedy, Drama, Family, S...",en,"[2025-01-09, 2024-05-03, 2025-03-15, 2024-05-2...","[2, 2, 6, 3, 2, 1, 8, 1, 9, 7, 7, 2, 10, 7, 1,...",High,20th Century Fox
8,9,"[432011, 632617, 5851, 500268, 5481, 606, 9604...","[432011, 632617, 606, 960481, 20352, 504253, 8...","[5851, 500268, 5481, 13346, 628922, 455957, 43...","[Music, Fantasy, Comedy, Drama, Family, Histor...","[War, Fantasy, Comedy, Documentary, Drama, Fam...",en,"[2024-03-22, 2024-08-07, 2024-11-06, 2024-09-0...","[8, 9, 3, 5, 5, 8, 9, 2, 6, 2, 8, 9, 2, 1, 4, ...",High,jeki
9,10,"[2086, 683340, 739643, 27053, 42006, 876969, 2...","[27053, 42006]","[2086, 683340, 739643, 876969, 2454, 987490, 2...","[History, Drama, Crime, Romance]","[Fantasy, Comedy, Documentary, Drama, Family, ...",en,"[2024-03-29, 2024-03-22, 2024-04-27, 2024-05-0...","[1, 1, 3, 8, 7, 6, 3, 4, 3, 3, 2, 4, 2, 1]",Low,Productions Sigma


In [34]:
user_profiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   user_id                   5000 non-null   int64 
 1   watched_movies            5000 non-null   object
 2   liked_movies              5000 non-null   object
 3   disliked_movies           5000 non-null   object
 4   liked_genres              5000 non-null   object
 5   disliked_genres           5000 non-null   object
 6   language_preferred        5000 non-null   object
 7   watch_timestamps          5000 non-null   object
 8   user_ratings              5000 non-null   object
 9   viewing_frequency         5000 non-null   object
 10  fav_production_companies  4987 non-null   object
dtypes: int64(1), object(10)
memory usage: 429.8+ KB


In [35]:
user_profiles.to_csv('/Users/antropravin/Desktop/Bezohminds/Task/Movie Recommendation System/User_Profiles.csv', index=False)