In [2]:
import pandas as pd
import numpy as np

import os
import google.generativeai as genai
import time
import json
from tqdm import tqdm
from google.colab import files

from sklearn.metrics.pairwise import cosine_similarity

os.environ["GEMINI_API_KEY"] = 'AIzaSyBtu2nvgatyStbeyYtO1VzocG3tKPrmhv0'
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
model = genai.GenerativeModel('gemini-2.0-flash')

In [None]:
movies = pd.read_csv('/content/movies_with_summaries_final_3.csv')
print(movies.info())
movies.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10342 entries, 0 to 10341
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   movieId        10342 non-null  int64  
 1   vectorID       10342 non-null  int64  
 2   title          10342 non-null  object 
 3   genres         10342 non-null  object 
 4   tags           10342 non-null  object 
 5   weight_rating  10332 non-null  float64
 6   imdbId         10342 non-null  object 
 7   page_content   10342 non-null  object 
 8   summary        10342 non-null  object 
 9   popularity     10342 non-null  float64
dtypes: float64(2), int64(2), object(6)
memory usage: 808.1+ KB
None


Unnamed: 0,movieId,vectorID,title,genres,tags,weight_rating,imdbId,page_content,summary,popularity
0,1,0,Toy Story (1995),"Adventure,Animation,Children,Comedy,Fantasy","adventure,animated,animation,cartoon,cgi,child...",3.904516,tt0114709,Movie's title: Toy Story (1995)\ngenres: Adven...,"Genres: Adventure,Animation,Children,Comedy,Fa...",0.701299
1,2,1,Jumanji (1995),"Adventure,Children,Fantasy","adventure,animals,big budget,childhood,childre...",3.214918,tt0113497,Movie's title: Jumanji (1995)\ngenres: Adventu...,"Genres: Adventure,Children,Fantasy | Themes: C...",0.285714
2,3,2,Grumpier Old Men (1995),"Comedy,Romance","comedy,good sequel,original,sequel,sequels",3.161619,tt0113228,Movie's title: Grumpier Old Men (1995)\ngenres...,"Genres: Comedy,Romance | Themes: Aging, Rivalr...",0.103896
3,4,3,Waiting to Exhale (1995),"Comedy,Drama,Romance","chick flick,girlie movie,romantic,unlikely fri...",2.990833,tt0114885,Movie's title: Waiting to Exhale (1995)\ngenre...,"Genres: Comedy,Drama,Romance | Themes: Female ...",0.038961
4,5,4,Father of the Bride Part II (1995),Comedy,"comedy,destiny,family,father daughter relation...",3.08397,tt0113041,Movie's title: Father of the Bride Part II (19...,"Genres: Comedy | Themes: Family, Parenthood, C...",0.103896


# Movie Summarize

In [None]:
# 1. Batch Prompt Construction (No Text Feature)
def create_batch_prompt(df_batch):
    """Create a batch prompt using title, genres, and tags"""
    prompt = """Analyze these movies and summarize each in this EXACT format:
    "Genres: [genres] | Themes: [3-5 themes] | Style: [2-3 style words] | Notable: [1-3 standout elements]"

    Input Data:\n"""

    for idx, row in df_batch.iterrows():
        prompt += (
            f"\nMovie ID: {idx}\n"
            f"Title: {row['title']}\n"
            f"Genres: {row['genres']}\n"
            f"User Tags: {row['tags'] if pd.notna(row['tags']) else 'None'}\n"
            "---"
        )
    return prompt + f"\n\nProvide exactly {len(df_batch)} summaries, one per line:"

def parse_response(text, expected_count):
    """Extract and validate summaries"""
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    return [
        line for line in lines
        if line.startswith("Genres:")
        and "| Themes:" in line
        and "| Style:" in line
    ][:expected_count]

# 1. Progress Tracking Setup
def setup_checkpoint():
    """Initialize or load checkpoint file"""
    checkpoint_file = 'progress_checkpoint.json'
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, 'r') as f:
            return json.load(f)
    return {
        'last_completed_batch': -1,
        'processed_indices': []
    }

def save_checkpoint(batch_idx, processed_indices):
    """Save progress to checkpoint file"""
    with open('progress_checkpoint.json', 'w') as f:
        json.dump({
            'last_completed_batch': batch_idx,
            'processed_indices': processed_indices
        }, f)

# 2. Batch Processing with Persistence
def process_with_persistence(df, batch_size=20, delay=10):
    """Process movies with persistent progress tracking"""
    checkpoint = setup_checkpoint()
    df['summary'] = pd.NA  # Initialize all as null

    # Create temp file for incremental saves
    temp_output = 'movies_temp_results.csv'
    if os.path.exists(temp_output):
        df = pd.read_csv(temp_output)

    # Determine where to resume
    start_idx = (checkpoint['last_completed_batch'] + 1) * batch_size
    processed_indices = set(checkpoint['processed_indices'])

    for i in tqdm(range(start_idx, len(df), batch_size),
                 desc="Processing",
                 initial=start_idx//batch_size,
                 total=len(df)//batch_size):
        batch = df.iloc[i:i+batch_size]

        try:
            prompt = create_batch_prompt(batch)
            response = model.generate_content(prompt)

            if response.text:
                summaries = parse_response(response.text, len(batch))
                if len(summaries) == len(batch):
                    # Update DataFrame
                    df.loc[batch.index, 'summary'] = summaries
                    processed_indices.update(batch.index.tolist())

                    # Save progress after each successful batch
                    df.to_csv(temp_output, index=False)
                    save_checkpoint(i//batch_size, list(processed_indices))
                else:
                    print(f"Partial batch {i//batch_size}, saving progress")
            time.sleep(delay)

        except Exception as e:
            print(f"Error in batch {i//batch_size}: {str(e)[:100]}...")
            # Save progress even after errors
            df.to_csv(temp_output, index=False)
            time.sleep(delay*2)  # Longer delay after errors
            continue

    # Final cleanup
    if os.path.exists('progress_checkpoint.json'):
        os.remove('progress_checkpoint.json')
    if os.path.exists(temp_output):
        os.rename(temp_output, 'movies_with_summaries_final.csv')
        files.download('movies_with_summaries_final.csv')

    return df

# 3. Resume Functionality
def resume_processing(df):
    """Resume from last checkpoint"""
    if os.path.exists('progress_checkpoint.json'):
        checkpoint = setup_checkpoint()
        print(f"Resuming from batch {checkpoint['last_completed_batch'] + 1}")
        return process_with_persistence(df)
    return process_with_persistence(df)

# Execution
try:
    result_df = resume_processing(movies)
except KeyboardInterrupt:
    print("\nProcess interrupted. Progress saved. Run again to resume.")
except Exception as e:
    print(f"Fatal error: {str(e)}")
finally:
    # Ensure temp files are cleaned up if completed
    if os.path.exists('movies_with_summaries_final.csv'):
        print("Processing completed successfully!")

Processing:  11%|█         | 58/517 [13:51<1:52:43, 14.73s/it]

Partial batch 58, saving progress


Processing:  19%|█▉        | 100/517 [23:48<1:38:16, 14.14s/it]

Partial batch 100, saving progress


Processing:  36%|███▌      | 187/517 [44:23<1:17:46, 14.14s/it]

Partial batch 187, saving progress


Processing:  58%|█████▊    | 302/517 [1:11:37<51:17, 14.31s/it]

Partial batch 302, saving progress


Processing: 518it [2:03:01, 14.25s/it]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing completed successfully!


### Null processing after summarizer (Optional)

In [None]:
null_movies_id = movies['movieId'].values
movies[movies['summary'].isnull()]

Unnamed: 0,movieId,vectorID,title,genres,tags,weight_rating,imdbId,page_content,summary


In [None]:
null_movies = movies[movies['summary'].isnull()]
movies_with_colon_in_tags = null_movies[null_movies['tags'].str.contains(':', na=False)]
movies_with_colon_in_tags

Unnamed: 0,movieId,vectorID,title,genres,tags,weight_rating,imdbId,page_content,summary


In [None]:
# # prompt: get all movie in null_movies_id has  ':' symbol in tags feature and replace it all to ' '

null_movies_id = movies[movies['summary'].isnull()]['movieId'].values
for movie_id in null_movies_id:
  movie_index = movies[movies['movieId'] == movie_id].index[0]
  if ':' in str(movies.loc[movie_index, 'tags']):
    movies.loc[movie_index, 'tags'] = str(movies.loc[movie_index, 'tags']).replace(':', ' ')


In [None]:


def parse_response(text, expected_count):
    """Extract and validate summaries"""
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    return [
        line for line in lines
        if line.startswith("Genres:")
    ][:expected_count]

def create_batch_prompt(df_batch):
    """Create a batch prompt using title, genres, and tags"""
    prompt = """Analyze 20 these movies and summarize each in this EXACT format:
    "Genres: [genres] | Themes: [3-5 themes] | Style: [2-3 style words] | Notable: [1-3 elements]"

    Input Data:\n"""

    for idx, row in df_batch.iterrows():
        prompt += (
            f"Title: {row['title']}\n"
            f"Genres: {row['genres']}\n"
            f"Tags: {row['tags'] if pd.notna(row['tags']) else 'None'}\n"
            "---"
        )
    #     print(f"\nMovie ID: {idx}\n")
    # print( f"\n\nProvide exactly {len(df_batch)} summaries, one per line:")
    return prompt + f"\n\nProvide EXACTLY {len(df_batch)} summaries, one per line:"

def process_null_batch(df, batch_size=20, delay=10):
    """Process movies in batches with error handling"""
    null_df = df[df['summary'].isnull()].copy()

    for i in tqdm(range(0, len(null_df), batch_size), desc="Processing batches"):
        print(f'[{i} - {i + batch_size} )')
        batch = null_df.iloc[i:i+batch_size]

        try:
            prompt = create_batch_prompt(batch)
            response = model.generate_content(prompt)

            # print(prompt)
            print("Result:")
            # print(response.text)
            if response.text:
                summaries = parse_response(response.text, len(batch))
                print(f'[{len(batch)} - {len(summaries)} )')
                if len(summaries) == len(batch):
                    df.loc[batch.index, 'summary'] = summaries
                else:
                    print(f"Warning: Got {len(summaries)} summaries for batch {i//batch_size}, setting to null")
                    df.loc[batch.index, 'summary'] = None
            else:
                print(f"Empty response for batch {i//batch_size}, setting to null")
                df.loc[batch.index, 'summary'] = None

        except Exception as e:
            print(f"Error processing batch {i//batch_size}: {str(e)[:100]}..., setting to null")
            df.loc[batch.index, 'summary'] = None

        time.sleep(delay)

    return df

movies = process_null_batch(movies, batch_size=20, delay=10)
movies[movies['movieId'].isin(null_movies_id)]

Processing batches:   0%|          | 0/4 [00:00<?, ?it/s]

[0 - 20 )
Result:
[20 - 20 )


Processing batches:  25%|██▌       | 1/4 [00:15<00:45, 15.15s/it]

[20 - 40 )
Result:
[20 - 20 )


Processing batches:  50%|█████     | 2/4 [00:29<00:28, 14.45s/it]

[40 - 60 )
Result:
[20 - 20 )


Processing batches:  75%|███████▌  | 3/4 [00:43<00:14, 14.24s/it]

[60 - 80 )
Result:
[20 - 20 )


Processing batches: 100%|██████████| 4/4 [00:57<00:00, 14.30s/it]


Unnamed: 0,movieId,vectorID,title,genres,tags,weight_rating,imdbId,page_content,summary
1160,1314,1160,Breathing Room (1996),Romance,"criterion,nocturnal,romantic comedy",3.264001,tt0115754,Movie's title: Breathing Room (1996)\ngenres: ...,"Genres: Romance | Themes: Love, Relationships,..."
1161,1315,1161,Paris Was a Woman (1995),Documentary,"criterion,documentary,gay,gay character,glbt,i...",3.270196,tt0114093,Movie's title: Paris Was a Woman (1995)\ngenre...,"Genres: Documentary | Themes: Female artists, ..."
1162,1317,1162,I'm Not Rappaport (1996),Comedy,life philosophy,3.251737,tt0116601,Movie's title: I'm Not Rappaport (1996)\ngenre...,"Genres: Comedy | Themes: Aging, Friendship, Li..."
1163,1318,1163,Blue Juice (1995),"Comedy,Drama","cult classic,destiny,fun movie,good soundtrack...",3.277918,tt0112537,Movie's title: Blue Juice (1995)\ngenres: Come...,"Genres: Comedy, Drama | Themes: Destiny, Relat..."
1164,1320,1164,Alien³ (a.k.a. Alien 3) (1992),"Action,Horror,Sci-Fi,Thriller","action,alien,alien invasion,aliens,allegory,bl...",3.108348,tt0103644,Movie's title: Alien³ (a.k.a. Alien 3) (1992)\...,"Genres: Action, Horror, Sci-Fi, Thriller | The..."
...,...,...,...,...,...,...,...,...,...
6055,7249,6055,Plaza Suite (1971),Comedy,"adultery,based on a play,comedy,dialogue,good ...",3.256371,tt0067589,Movie's title: Plaza Suite (1971)\ngenres: Com...,"Genres: Comedy | Themes: Marriage, Relationshi..."
6056,7250,6056,"Out of Towners, The (1970)",Comedy,"comedy,ethnic conflict,funniest movies,funny,m...",3.289030,tt0066193,"Movie's title: Out of Towners, The (1970)\ngen...","Genres: Comedy | Themes: Urban Chaos, Misfortu..."
6057,7252,6057,"Three Stooges in Orbit, The (1962)","Comedy,Sci-Fi","aliens,alter ego,cartoon,comedy,cool,dumb but ...",3.224684,tt0056580,"Movie's title: Three Stooges in Orbit, The (19...","Genres: Comedy, Sci-Fi | Themes: Space Explora..."
6058,7253,6058,It (1927),"Comedy,Romance","fight scenes,highly quotable,original,pornogra...",3.264183,tt0018033,"Movie's title: It (1927)\ngenres: Comedy,Roman...","Genres: Comedy, Romance | Themes: Attraction, ..."


In [None]:
movies.to_csv('movies_with_summaries_final_3.csv', index=False)

# User Summarize

In [None]:
ratings = pd.read_csv('/content/rating.csv')
print(ratings.info())
ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 610.4+ MB
None


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [None]:
# prompt: get rating of user_id 1 - 99

# some_user_ratings = ratings[(ratings['userId'] >= 1) & (ratings['userId'] <= 1000)]
# print(len(some_user_ratings['userId'].unique()))
# some_user_ratings

370


Unnamed: 0,userId,movieId,rating
0,1,29,3.5
1,1,32,3.5
2,1,47,3.5
3,1,50,3.5
4,1,112,3.5
...,...,...,...
96588,998,4857,5.0
96589,998,4995,5.0
96590,998,5008,5.0
96591,998,5120,4.0


In [None]:
def get_user_preferences(user_id, ratings_df, movies_df, top_n=100):
    # Get user's ratings sorted by most recent first
    user_ratings = ratings_df[ratings_df['userId'] == user_id]

    # Take last 100 ratings (or all if <100)
    recent_ratings = user_ratings.head(min(top_n, len(user_ratings)))

    # Merge with movie summaries
    rated_movies = recent_ratings.merge(movies_df, on='movieId')

    # Prepare prompt
    prompt = f"""
    Analyze the following movie ratings and summaries to infer this user's preferences EXACTLY this format:
    Genres: [5-6 favorite genres] | Themes: [4-5 favorite themes] | Style: [3-4 stylistic preferences] | Notable: [2-3 standout elements they seem to enjoy]

    Rated Movies:
    """

    for _, row in rated_movies.iterrows():
        prompt += (
            f"\n- {row['title']} (Rating: {row['rating']}/5)\n"
            f"  Summary: {row['summary']}\n"
        )
    prompt += """
    Based on the above, write a one-line summary of the user's preferences.
    """

    return prompt

In [None]:
# Initialize DataFrame to store preferences
user_prefs_df = pd.DataFrame(columns=['userId', 'preference'])

# Process users in batches with rate limiting
BATCH_SIZE = 10
DELAY_SECONDS = 10

for user_id in tqdm(ratings['userId'].unique(), desc="Generating preferences"):
    try:
        # Generate prompt
        prompt = get_user_preferences(user_id, ratings, movies).strip()

        # Get LLM response (replace with your actual LLM call)
        response = model.generate_content(prompt)
        pref_summary = response.text.strip() if response.text else None

        # Store result
        user_prefs_df = pd.concat([
            user_prefs_df,
            pd.DataFrame({'userId': [user_id], 'preference': [pref_summary]})
        ], ignore_index=True)

        # Save progress every 10 users
        if user_id % 10 == 0:
            user_prefs_df.to_csv('user_preferences_checkpoint.csv', index=False)

        time.sleep(DELAY_SECONDS)

    except Exception as e:
        print(f"Error processing user {user_id}: {str(e)}")
        continue

# Final save
user_prefs_df.to_csv('user_preferences_final.csv', index=False)
files.download('user_preferences_final.csv')
print(f"Generated preferences for {len(user_prefs_df)} users")

Generating preferences:   0%|          | 0/115 [05:24<?, ?it/s]


KeyboardInterrupt: 

In [None]:
user_summaries = pd.read_csv('/content/user_preferences_final_2.csv')
print(user_summaries.info())
user_summaries

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115 entries, 0 to 114
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   userId      115 non-null    int64 
 1   preference  115 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.9+ KB
None


Unnamed: 0,userId,preference
0,1,"Genres: Action, Adventure, Sci-Fi, Fantasy, Th..."
1,3,"Genres: Crime, Drama, Sci-Fi, Thriller, Action..."
2,7,"Genres: Action, Adventure, Comedy, Drama, Roma..."
3,11,"Genres: Action, Sci-Fi, Adventure, Thriller, F..."
4,14,"Genres: Comedy, Drama, Romance, Adventure, Ani..."
...,...,...
110,331,"Genres: Drama, Thriller, Action, Romance, Crim..."
111,334,"Genres: Action, Thriller, Crime, Comedy, Drama..."
112,335,"Genres: Crime, Drama, Thriller, Mystery, Comed..."
113,337,"Genres: Action, Crime, Drama, Thriller, Comedy..."


# Movie Infor Generate

### Movie Score Generate

In [None]:
movie_stats = ratings.groupby('movieId').agg(
    vote_count=('rating', 'count'),
    vote_average=('rating', 'mean')
).reset_index()

# 2. Join with movie titles
movie_stats = movie_stats.merge(movies[['movieId', 'title']], on='movieId')

# 3. Compute C and m for weighted rating
C = movie_stats['vote_average'].mean()
# m = movie_stats['vote_count'].quantile(0.80)
m = 100

# 4. Filter movies with sufficient votes
qualified = movie_stats[movie_stats['vote_count'] >= m].copy()

# 5. Compute weighted average
v = qualified['vote_count']
R = qualified['vote_average']
qualified['weighted_average'] = (R * v + C * m) / (v + m)

# 6. Sort results
qualified = qualified.sort_values('weighted_average', ascending=False)

In [None]:
m

np.float64(1954.0)

In [None]:
qualified

Unnamed: 0,movieId,vote_count,vote_average,title,weighted_average
301,318,63366,4.446990,"Shawshank Redemption, The (1994)",4.445128
759,858,41355,4.364732,"Godfather, The (1972)",4.362080
49,50,47006,4.334372,"Usual Suspects, The (1995)",4.332103
499,527,50054,4.310175,Schindler's List (1993),4.308092
1072,1221,27398,4.275641,"Godfather: Part II, The (1974)",4.271966
...,...,...,...,...,...
7265,31698,467,1.252677,Son of the Mask (2005),1.607627
1562,1826,419,1.163484,Barney's Great Adventure (1998),1.568448
5559,6587,701,1.164051,Gigli (2003),1.426373
5486,6483,426,0.973005,From Justin to Kelly (2003),1.408792


## Popularity Generate

In [None]:
ratings = pd.read_csv('/content/rating.csv')
print(ratings.info())
ratings

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 610.4+ MB
None


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Step 1: Count the number of ratings per movie
popularity_df = ratings.groupby('movieId').size().reset_index(name='rating_count')

# popularity_df['log_count'] = np.log1p(popularity_df['rating_count']) # Add 1 to avoid log(0)

# Step 2: Normalize the rating count to a 0-1 scale
scaler = MinMaxScaler()
popularity_df['popularity'] = scaler.fit_transform(popularity_df[['rating_count']])

# Step 3: Merge back with the movies dataframe
movies = movies.merge(popularity_df[['movieId', 'popularity']], on='movieId', how='left')

# Optional: Fill NaN values with 0 (movies with no ratings)
movies['popularity'] = movies['popularity'].fillna(0)
movies

Unnamed: 0,movieId,vectorID,title,genres,tags,weight_rating,imdbId,page_content,summary,popularity
0,1,0,Toy Story (1995),"Adventure,Animation,Children,Comedy,Fantasy","adventure,animated,animation,cartoon,cgi,child...",3.904516,tt0114709,Movie's title: Toy Story (1995)\ngenres: Adven...,"Genres: Adventure,Animation,Children,Comedy,Fa...",0.701299
1,2,1,Jumanji (1995),"Adventure,Children,Fantasy","adventure,animals,big budget,childhood,childre...",3.214918,tt0113497,Movie's title: Jumanji (1995)\ngenres: Adventu...,"Genres: Adventure,Children,Fantasy | Themes: C...",0.285714
2,3,2,Grumpier Old Men (1995),"Comedy,Romance","comedy,good sequel,original,sequel,sequels",3.161619,tt0113228,Movie's title: Grumpier Old Men (1995)\ngenres...,"Genres: Comedy,Romance | Themes: Aging, Rivalr...",0.103896
3,4,3,Waiting to Exhale (1995),"Comedy,Drama,Romance","chick flick,girlie movie,romantic,unlikely fri...",2.990833,tt0114885,Movie's title: Waiting to Exhale (1995)\ngenre...,"Genres: Comedy,Drama,Romance | Themes: Female ...",0.038961
4,5,4,Father of the Bride Part II (1995),Comedy,"comedy,destiny,family,father daughter relation...",3.083970,tt0113041,Movie's title: Father of the Bride Part II (19...,"Genres: Comedy | Themes: Family, Parenthood, C...",0.103896
...,...,...,...,...,...,...,...,...,...,...
10337,130578,10337,The Gunman (2015),"Action,Thriller","action,assassin,assassination,good action,real...",3.264228,tt2515034,Movie's title: The Gunman (2015)\ngenres: Acti...,"Genres: Action,Thriller | Themes: Conspiracy, ...",0.000000
10338,130840,10338,Spring (2015),"Horror,Romance,Sci-Fi","cinematography,creepy,horror,immortality,love ...",3.265785,tt3395184,"Movie's title: Spring (2015)\ngenres: Horror,R...","Genres: Horror,Romance,Sci-Fi | Themes: Immort...",0.000000
10339,131013,10339,Get Hard (2015),Comedy,"buddy movie,coen bros,comedy,crude humor,foul ...",3.262312,tt2561572,Movie's title: Get Hard (2015)\ngenres: Comedy...,"Genres: Comedy | Themes: Stereotypes, Prison, ...",0.000000
10340,131168,10340,Phoenix (2014),Drama,"betrayal,camp,cinematography,criterion,dramati...",3.265425,tt2764784,Movie's title: Phoenix (2014)\ngenres: Drama\n...,"Genres: Drama | Themes: identity, loss, surviv...",0.000000


In [None]:
popularity_df

Unnamed: 0,movieId,rating_count,popularity
0,1,55,0.701299
1,2,23,0.285714
2,3,9,0.103896
3,4,4,0.038961
4,5,9,0.103896
...,...,...,...
4883,115149,2,0.012987
4884,115569,1,0.000000
4885,118696,1,0.000000
4886,125916,1,0.000000


In [None]:
movies.to_csv('movies_with_popularity.csv', index=False)

In [None]:
movies1 = pd.read_csv('/content/movies_with_popularity.csv')
movies1

Unnamed: 0,movieId,vectorID,title,genres,tags,weight_rating,imdbId,page_content,summary,popularity
0,1,0,Toy Story (1995),"Adventure,Animation,Children,Comedy,Fantasy","adventure,animated,animation,cartoon,cgi,child...",3.904516,tt0114709,Movie's title: Toy Story (1995)\ngenres: Adven...,"Genres: Adventure,Animation,Children,Comedy,Fa...",0.701299
1,2,1,Jumanji (1995),"Adventure,Children,Fantasy","adventure,animals,big budget,childhood,childre...",3.214918,tt0113497,Movie's title: Jumanji (1995)\ngenres: Adventu...,"Genres: Adventure,Children,Fantasy | Themes: C...",0.285714
2,3,2,Grumpier Old Men (1995),"Comedy,Romance","comedy,good sequel,original,sequel,sequels",3.161619,tt0113228,Movie's title: Grumpier Old Men (1995)\ngenres...,"Genres: Comedy,Romance | Themes: Aging, Rivalr...",0.103896
3,4,3,Waiting to Exhale (1995),"Comedy,Drama,Romance","chick flick,girlie movie,romantic,unlikely fri...",2.990833,tt0114885,Movie's title: Waiting to Exhale (1995)\ngenre...,"Genres: Comedy,Drama,Romance | Themes: Female ...",0.038961
4,5,4,Father of the Bride Part II (1995),Comedy,"comedy,destiny,family,father daughter relation...",3.083970,tt0113041,Movie's title: Father of the Bride Part II (19...,"Genres: Comedy | Themes: Family, Parenthood, C...",0.103896
...,...,...,...,...,...,...,...,...,...,...
10337,130578,10337,The Gunman (2015),"Action,Thriller","action,assassin,assassination,good action,real...",3.264228,tt2515034,Movie's title: The Gunman (2015)\ngenres: Acti...,"Genres: Action,Thriller | Themes: Conspiracy, ...",0.000000
10338,130840,10338,Spring (2015),"Horror,Romance,Sci-Fi","cinematography,creepy,horror,immortality,love ...",3.265785,tt3395184,"Movie's title: Spring (2015)\ngenres: Horror,R...","Genres: Horror,Romance,Sci-Fi | Themes: Immort...",0.000000
10339,131013,10339,Get Hard (2015),Comedy,"buddy movie,coen bros,comedy,crude humor,foul ...",3.262312,tt2561572,Movie's title: Get Hard (2015)\ngenres: Comedy...,"Genres: Comedy | Themes: Stereotypes, Prison, ...",0.000000
10340,131168,10340,Phoenix (2014),Drama,"betrayal,camp,cinematography,criterion,dramati...",3.265425,tt2764784,Movie's title: Phoenix (2014)\ngenres: Drama\n...,"Genres: Drama | Themes: identity, loss, surviv...",0.000000


In [None]:
movies1[movies1['weight_rating'] == 0]

Unnamed: 0,movieId,vectorID,title,genres,tags,weight_rating,imdbId,page_content,summary,popularity


## User Summary Embedding

In [5]:
user_summary_df = pd.read_csv('/content/user_preferences_final_2.csv')
print(user_summary_df.info())
user_summary_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115 entries, 0 to 114
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   userId      115 non-null    int64 
 1   preference  115 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.9+ KB
None


Unnamed: 0,userId,preference
0,1,"Genres: Action, Adventure, Sci-Fi, Fantasy, Th..."
1,3,"Genres: Crime, Drama, Sci-Fi, Thriller, Action..."
2,7,"Genres: Action, Adventure, Comedy, Drama, Roma..."
3,11,"Genres: Action, Sci-Fi, Adventure, Thriller, F..."
4,14,"Genres: Comedy, Drama, Romance, Adventure, Ani..."
...,...,...
110,331,"Genres: Drama, Thriller, Action, Romance, Crim..."
111,334,"Genres: Action, Thriller, Crime, Comedy, Drama..."
112,335,"Genres: Crime, Drama, Thriller, Mystery, Comed..."
113,337,"Genres: Action, Crime, Drama, Thriller, Comedy..."


In [6]:
def get_embedding(text):
    result = genai.embed_content(
      model="models/text-embedding-004", content=text,
      output_dimensionality=384
    )
    return result["embedding"]

In [8]:
# Embed and store in a new column
user_summary_df['embedding'] = user_summary_df['preference'].apply(get_embedding)
print(len(user_summary_df['embedding'][0]))

384


In [9]:
user_summary_df.to_csv('user_summary_embedding_2.csv', index=False)

In [11]:
user_summary_df[user_summary_df['userId'] == 1]['embedding'].values[0]


[-0.033861753,
 0.002185061,
 -0.02641135,
 0.0029701823,
 0.049203467,
 0.013682993,
 0.0817075,
 0.016345555,
 -0.023417251,
 0.031280182,
 -0.008884915,
 -0.007828238,
 0.10023846,
 0.015181078,
 0.0025741747,
 -0.025564479,
 -0.006269446,
 0.04131116,
 -0.05866827,
 -0.01496015,
 -0.002400761,
 -0.022931192,
 0.054499637,
 -0.034187127,
 -0.020968929,
 -0.012619067,
 -0.016255163,
 0.021056749,
 0.00202811,
 -0.0047017373,
 -0.0033264894,
 0.024647666,
 0.020097807,
 0.03648563,
 -0.023145165,
 -0.078616455,
 -0.0040596067,
 -0.022034883,
 0.05059964,
 -0.038352497,
 -0.03931639,
 0.013082364,
 -0.069995195,
 0.023429748,
 -0.0057229465,
 0.008242547,
 -0.070920706,
 -0.008951158,
 -0.03257941,
 0.018480007,
 -0.019696249,
 -0.004066818,
 -0.051038954,
 0.0033499203,
 0.020397265,
 -0.02580657,
 -0.033857014,
 -0.046508964,
 0.020518133,
 -0.019644972,
 -0.034570836,
 -0.038418725,
 -0.013835007,
 0.02450948,
 -0.013438923,
 -0.039030407,
 -0.093631946,
 0.031785283,
 0.018868443,
