In [52]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import sys
movies=pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")


In [53]:
def load_data(movies_df, ratings_df):
    # Load the data from CSV files
    movies = movies_df.copy()
    ratings = ratings_df.copy()

    # Print the columns to debug
    print("Movies DataFrame columns:", movies.columns)

    # Check if 'title' exists in movies DataFrame
    if 'title' not in movies.columns:
        raise KeyError("The 'title' column is missing in the movies DataFrame.")

    # Create a list of genres for binary encoding
    genres = [
        'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Horror',
        'Mystery', 'Romance', 'Sci-Fi', 'Thriller'
    ]

    # Prepare the movies DataFrame
    # Extract year from title and compute average ratings
    movies['year'] = movies['title'].str.extract(r'\((\d{4})\)')
    movies['title'] = movies['title'].str.replace(r'\s*\(\d{4}\)', '', regex=True).str.strip()

    # Calculate average ratings for each movie
    avg_ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
    avg_ratings.columns = ['movieId', 'avg_rating']

    # Merge the average ratings with movies DataFrame
    movies = movies.merge(avg_ratings, on='movieId', how='left')

    # One-hot encode genres into binary columns
    for genre in genres:
        movies[genre] = movies['genres'].str.contains(genre).astype(int)

    # Drop the original genres column
    movies = movies.drop(columns=['genres', 'title'], axis=1)

    # Prepare the user DataFrame
    user_data = ratings.groupby('userId').agg(
        rating_count=('rating', 'count'),
        avg_rating=('rating', 'mean')
    ).reset_index()

    # Calculate average ratings for each genre per user
    for genre in genres:
        user_data[genre + '_avg'] = \
        ratings[ratings['movieId'].isin(movies[movies[genre] == 1]['movieId'])].groupby('userId')[
            'rating'].mean().fillna(0)

    # Merge user data with the binary genre data
    user_data = user_data.fillna(0)
    # Return user data, movies, and the original movies DataFrame
    return user_data, movies, movies





In [54]:
def get_user_data():
    ratings_df = pd.read_csv('ratings.csv')  # Adjust the filename as needed
    lastid = ratings_df['userId'].iloc[-1]
    user_data = []

    # Collecting user input for the ratings
    user_data.append(int(lastid + 1))
    user_data.append(float(input("Enter the Average rating: ")))
    user_data.append(float(input("Enter the rating of Action: ")))
    user_data.append(float(input("Enter the rating of Adventure: ")))
    user_data.append(float(input("Enter the rating of Animation: ")))
    user_data.append(float(input("Enter the rating of Children: ")))
    user_data.append(float(input("Enter the rating of Comedy: ")))
    user_data.append(float(input("Enter the rating of Crime: ")))
    user_data.append(float(input("Enter the rating of Documentary: ")))
    user_data.append(float(input("Enter the rating of Drama: ")))
    user_data.append(float(input("Enter the rating of Fantasy: ")))
    user_data.append(float(input("Enter the rating of Horror: ")))
    user_data.append(float(input("Enter the rating of Mystery: ")))
    user_data.append(float(input("Enter the rating of Romance: ")))

    return user_data

In [55]:
def train_model(X_train, y_train, X_test, y_test):
    # Neural networks for user and movie vectors
    user_NN = tf.keras.models.Sequential([
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(32),
    ])

    movie_NN = tf.keras.models.Sequential([
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(32),
    ])

    # Inputs for users and movies
    user_input = tf.keras.Input(shape=(X_train.shape[1],))  # Shape (features,)
    movie_input = tf.keras.Input(shape=(y_train.shape[1],))  # Shape (features,)

    # Get user and movie vectors
    user_vector = user_NN(user_input)
    movie_vector = movie_NN(movie_input)

    # Compute dot product
    output = tf.keras.layers.Dot(axes=1)([user_vector, movie_vector])

    # Build and compile the model (Don't redefine it later)
    model = tf.keras.Model(inputs=[user_input, movie_input], outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss=tf.keras.losses.MeanSquaredError())

    # Train the model
    model.fit([X_train, y_train], y_train, epochs=10, batch_size=32)

    # Evaluate the model (Fix the input format)
    loss = model.evaluate([X_test, y_test], y_test)
    print(f"Test Loss: {loss}")

    return model


In [56]:
def recommend_movie(model, user_data, movies_scaled, movies_original):
    user_vector = np.expand_dims(user_data, axis=0)  # Shape (1, 17)
    
    # Repeat user_vector to match movies_scaled shape
    user_vector_repeated = np.repeat(user_vector, movies_scaled.shape[0], axis=0)  # Shape (86537, 17)
    print("User vector shape:", user_vector_repeated.shape)  # Should be (86537, 17)
    print("Movies scaled shape:", movies_scaled.shape)  # Should be (86537, 17)
    if user_vector_repeated.shape[1] != 17:
        missing_columns = 17 - user_vector_repeated.shape[1]
        user_vector_repeated = np.pad(user_vector_repeated, ((0, 0), (0, missing_columns)), mode='constant')
    if movies_scaled.shape[1] != 17:
        missing_columns = 17 - movies_scaled.shape[1]
        movies_scaled = np.pad(movies_scaled, ((0, 0), (0, missing_columns)), mode='constant')

    # Predict ratings
    predicted_ratings = model.predict([user_vector_repeated, movies_scaled])

    # Get the index of the highest predicted rating
    top_movie_index = np.argmax(predicted_ratings)

    # Get the recommended movie
    recommended_movie = movies_original.iloc[top_movie_index]
    

    return recommended_movie


In [57]:
def train_test(movies_scaled,user_scaled):
    # Ensure movies_scaled and user_scaled have the same length
    min_samples = min(len(movies_scaled), len(user_scaled))

    movies_scaled = movies_scaled[:min_samples]
    user_scaled = user_scaled[:min_samples]
    X_train, X_test = train_test_split(user_scaled, test_size=0.4, random_state=42)
#Movies data is used for both training and testing
    y_train, y_test = movies_scaled[:len(X_train)], movies_scaled[len(X_train):]
# Re-split correctly
    print("X_train:", X_train.shape, "y_train:", y_train.shape)
    print("X_test:", X_test.shape, "y_test:", y_test.shape)
    X_train = X_train.iloc[:-17307]
    y_train=y_train.iloc[:-17307]
    return X_train, y_train, X_test, y_test




In [58]:
#running
user_data, movies_scaled, movies_original=load_data(movies,ratings)
X_train, y_train, X_test, y_test=train_test(movies_scaled,user_data)
model = train_model(X_train, y_train, X_test, y_test)
# Get user input data
user_data = get_user_data()
# Recommend movie based on trained model
recommend_movie(model, user_data, movies_scaled, movies_original)

Movies DataFrame columns: Index(['movieId', 'title', 'genres'], dtype='object')
X_train: (51922, 17) y_train: (51922, 17)
X_test: (34615, 17) y_test: (34615, 17)
Epoch 1/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: nan
Epoch 2/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: nan
Epoch 3/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: nan
Epoch 4/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: nan
Epoch 5/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: nan
Epoch 6/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: nan
Epoch 7/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: nan
Epoch 8/10
[1m1082/1082[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: nan
Epoch 9/10
[1m1082/1082[0m [32m━━━━

Enter the Average rating:  4
Enter the rating of Action:  4
Enter the rating of Adventure:  4
Enter the rating of Animation:  2
Enter the rating of Children:  2
Enter the rating of Comedy:  2
Enter the rating of Crime:  2
Enter the rating of Documentary:  2
Enter the rating of Drama:  4
Enter the rating of Fantasy:  3
Enter the rating of Horror:  4
Enter the rating of Mystery:  4
Enter the rating of Romance:  2


User vector shape: (86537, 14)
Movies scaled shape: (86537, 17)
[1m2705/2705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step


movieId               1
year               1995
avg_rating     3.893508
Action                0
Adventure             1
Animation             1
Children              1
Comedy                1
Crime                 0
Documentary           0
Drama                 0
Fantasy               1
Horror                0
Mystery               0
Romance               0
Sci-Fi                0
Thriller              0
Name: 0, dtype: object

In [63]:
# Find the movie name
movie_name = movies.loc[movies['movie_id'] == movie_id, 'movie_name']
print("Recommended Movie:", movie_name)

KeyError: 'movie_id'