In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("/kaggle/input/iisc-assignment/p1_movie_metadata.csv")

In [3]:
duplicates = data.duplicated() #this will give true for the duplicate rows and false for the remianing
duplicated_rows = data[duplicates] # select the rows wit True
# print(len(duplicated_rows))
df = data.drop_duplicates(keep="first")  # dropping the duplicates from ou data
df = df.dropna(subset=["director_name","title_year"])


In [4]:
dataframe = df[["director_name","genres","title_year"]].copy()

In [5]:
dataframe.head()

Unnamed: 0,director_name,genres,title_year
0,James Cameron,Action|Adventure|Fantasy|Sci-Fi,2009.0
1,Gore Verbinski,Action|Adventure|Fantasy,2007.0
2,Sam Mendes,Action|Adventure|Thriller,2015.0
3,Christopher Nolan,Action|Thriller,2012.0
5,Andrew Stanton,Action|Adventure|Sci-Fi,2012.0


In [6]:
sorted_df = dataframe.sort_values(by=['director_name', 'title_year'])
sorted_df['next_title_year'] = sorted_df.groupby('director_name')['title_year'].shift(-1)
sorted_df['next_genres'] = sorted_df.groupby('director_name')['genres'].shift(-1)
sorted_df.dropna(inplace=True,axis=0)

In [7]:
sorted_df= sorted_df.reset_index(drop=True)

In [8]:
sorted_df.head()

Unnamed: 0,director_name,genres,title_year,next_title_year,next_genres
0,Adam McKay,Comedy,2004.0,2006.0,Action|Comedy|Sport
1,Adam McKay,Action|Comedy|Sport,2006.0,2008.0,Comedy
2,Adam McKay,Comedy,2008.0,2010.0,Action|Comedy|Crime
3,Adam McKay,Action|Comedy|Crime,2010.0,2013.0,Comedy
4,Adam McKay,Comedy,2013.0,2015.0,Biography|Comedy|Drama|History


In [9]:
sorted_df["next_genres"] = sorted_df["next_genres"].apply(lambda x:x.split("|") )
unique_genres = set(genre for genre_list in sorted_df["next_genres"] for genre in genre_list)
# Define a function to mark genres with 1 or 0
def mark_genres(row):
    genre_marked = {genre: 1 if genre in row['next_genres'] else 0 for genre in unique_genres}
    return pd.Series(genre_marked)

# Apply the function to each row and concatenate with the original DataFrame
genre_marked_df = sorted_df.apply(mark_genres, axis=1)
encoded_df = pd.concat([sorted_df, genre_marked_df], axis=1)

In [10]:
encoded_df.head()

Unnamed: 0,director_name,genres,title_year,next_title_year,next_genres,War,Musical,News,Thriller,Documentary,...,Biography,Adventure,Western,Horror,Drama,Sport,Animation,Fantasy,Sci-Fi,Mystery
0,Adam McKay,Comedy,2004.0,2006.0,"[Action, Comedy, Sport]",0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,Adam McKay,Action|Comedy|Sport,2006.0,2008.0,[Comedy],0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Adam McKay,Comedy,2008.0,2010.0,"[Action, Comedy, Crime]",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Adam McKay,Action|Comedy|Crime,2010.0,2013.0,[Comedy],0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Adam McKay,Comedy,2013.0,2015.0,"[Biography, Comedy, Drama, History]",0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [11]:
encoded_df.drop(["genres","next_genres"], inplace=True, axis=1)

In [12]:
encoded_df.head()

Unnamed: 0,director_name,title_year,next_title_year,War,Musical,News,Thriller,Documentary,History,Music,...,Biography,Adventure,Western,Horror,Drama,Sport,Animation,Fantasy,Sci-Fi,Mystery
0,Adam McKay,2004.0,2006.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,Adam McKay,2006.0,2008.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Adam McKay,2008.0,2010.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Adam McKay,2010.0,2013.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Adam McKay,2013.0,2015.0,0,0,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0


In [13]:
encoded_df.columns[3:-2]

Index(['War', 'Musical', 'News', 'Thriller', 'Documentary', 'History', 'Music',
       'Comedy', 'Crime', 'Family', 'Action', 'Romance', 'Film-Noir',
       'Biography', 'Adventure', 'Western', 'Horror', 'Drama', 'Sport',
       'Animation', 'Fantasy'],
      dtype='object')

In [22]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Assuming 'genres' is a list of genre columns in your DataFrame
genres = list(encoded_df.columns[3:-2])  # Exclude the last two columns which are not genre flags

# Normalize 'title_year' and 'next_title_year'
max_year = encoded_df['title_year'].max()
max_next_year = encoded_df['next_title_year'].max()
encoded_df['title_year_normalized'] = encoded_df['title_year'] / max_year
encoded_df['next_title_year_normalized'] = encoded_df['next_title_year'] / max_next_year

# Tokenize the director names
tokenizer = Tokenizer()
tokenizer.fit_on_texts(encoded_df['director_name'])
sequences = tokenizer.texts_to_sequences(encoded_df['director_name'])
director_sequences = pad_sequences(sequences, maxlen=100)  # Adjust maxlen as needed

# Prepare the data
X_title_year = encoded_df['title_year_normalized'].values.reshape(-1, 1)
X_director = director_sequences
Y_genres = encoded_df[genres].values
y_year = encoded_df['next_title_year_normalized'].values.reshape(-1, 1)

# Split the data into training and testing sets
X_train_director, X_test_director, X_train_title_year, X_test_title_year, Y_train_genres, Y_test_genres, y_train_year, y_test_year = train_test_split(
    X_director, X_title_year, Y_genres, y_year, test_size=0.2, random_state=42
)

# Shared embedding layer for director's name
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because tokens are 1-indexed
embedding_dim = 50  # Example embedding dimension

input_director = Input(shape=(X_train_director.shape[1],))
embedding_director = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_director)
flatten_director = Flatten()(embedding_director)

# Input layer for title year (continuous variable)
input_title_year = Input(shape=(1,))

# Concatenate the outputs of the shared embedding layers and the title year input
concatenated = concatenate([flatten_director, input_title_year])

# Dense layers for each task
dense_genres = Dense(128, activation='relu')(concatenated)
output_genres = Dense(len(genres), activation='sigmoid', name='genres_output')(dense_genres)

dense_year = Dense(128, activation='relu')(concatenated)
output_year = Dense(1, activation='linear', name='year_output')(dense_year)

# Combined model
model = Model(inputs=[input_director, input_title_year], outputs=[output_genres, output_year])

# Compile the model with multiple losses for each output
model.compile(optimizer='adam',
              loss={'genres_output': 'binary_crossentropy', 'year_output': 'mean_squared_error'},
              metrics={'genres_output': 'accuracy', 'year_output': 'mean_absolute_error'})

# Train the model
model.fit([X_train_director, X_train_title_year], [Y_train_genres, y_train_year],
          validation_data=([X_test_director, X_test_title_year], [Y_test_genres, y_test_year]),
          epochs=10, batch_size=10)

Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - genres_output_accuracy: 0.0498 - loss: 0.4351 - year_output_mean_absolute_error: 0.1136 - val_genres_output_accuracy: 0.0480 - val_loss: 0.3154 - val_year_output_mean_absolute_error: 0.0130
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - genres_output_accuracy: 0.0923 - loss: 0.3117 - year_output_mean_absolute_error: 0.0201 - val_genres_output_accuracy: 0.2380 - val_loss: 0.3083 - val_year_output_mean_absolute_error: 0.0519
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - genres_output_accuracy: 0.2691 - loss: 0.2921 - year_output_mean_absolute_error: 0.0438 - val_genres_output_accuracy: 0.3000 - val_loss: 0.2931 - val_year_output_mean_absolute_error: 0.0237
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - genres_output_accuracy: 0.3598 - loss: 0.2662 - year_output_mean_absolute_

<keras.src.callbacks.history.History at 0x7cf800d71f60>

In [16]:
# Evaluate the model
evaluation_results = model.evaluate(
    [X_test_director, X_test_title_year],
    [Y_test_genres, y_test_year]
)

# Extract the individual losses and metrics
genres_acc = evaluation_results[0]
genres_loss = evaluation_results[1]
year_mae = evaluation_results[2]

# Print the individual losses and metrics
print(f'Genres Loss: {genres_loss}, Genres Accuracy: {genres_acc}')
print(f'Year Mean Absolute Error: {year_mae}')

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - genres_output_accuracy: 0.3190 - loss: 0.3082 - year_output_mean_absolute_error: 0.0374
Genres Loss: 0.3059999942779541, Genres Accuracy: 0.2976096570491791
Year Mean Absolute Error: 0.03816793113946915


In [20]:
import numpy as np

# Given director's name
director_name = 'James Cameron'

# Tokenize the director's name
director_sequence = tokenizer.texts_to_sequences([director_name])
director_sequence = pad_sequences(director_sequence, maxlen=100)

# Given title year
title_year = 2022

# Normalize the title year
title_year_normalized = title_year / max_year

# Reshape the title year to match the input shape of the model
title_year_normalized = np.array([title_year_normalized]).reshape(1, 1)

# Make a prediction
genres_predictions, year_predictions = model.predict([director_sequence, title_year_normalized])

# Convert genres predictions to labels
genres_predicted_labels = [i for i, prob in enumerate(genres_predictions[0]) if prob > 0.5]

# Convert the labels back to genre names
genres_predicted_names = [genres[i] for i in genres_predicted_labels]

# Print the predicted genres and year
print("Predicted Genres:")
print(genres_predicted_names)
# print("\nPredicted Year:")
# print(year_predictions[0][0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Predicted Genres:
['Action', 'Adventure']


(96,)