# Predict popularity based on genre and keywords

In [71]:
import tmdb15k.dataset as dataset
import tmdb15k.util as util

df = dataset.load()
df = util.clean(df)
df = util.remove_non_english(df)
df = util.keep_columns(df, ['popularity', 'genres', 'keywords'])
df = df[df['popularity'] <= 100] # Anything above 100 is an outlier.

In [72]:
import pandas as pd
import json

# Step 1: Parse JSON into Python list
df['genres'] = df['genres'].apply(lambda x: json.loads(x.replace("'", '"')))

# Step 2: Convert Python list into string of genre names separated by '|'
df['genres'] = df['genres'].apply(lambda x: '|'.join([i['name'].lower().replace(' ', '_') for i in x]))

# Step 3: Convert genre string into one-hot encoding
genres_df = df['genres'].str.get_dummies(sep='|')

# Step 4: Add prefix 'genre_' to each column
genres_df.columns = ['genre_' + str(col) for col in genres_df.columns]

# Step 5: Concatenate the one-hot encoded genres back to the original dataframe
df = pd.concat([df, genres_df], axis=1)

In [73]:
from sklearn.preprocessing import MultiLabelBinarizer

# Convert strings to lists of keywords
df['keywords'] = df['keywords'].apply(lambda x: [i.lower().replace(' ', '_') for i in x])

# Initialize MultiLabelBinarizer
keywords_mlb = MultiLabelBinarizer()

# Transform the keywords
keywords_encoded = keywords_mlb.fit_transform(df['keywords'])

# Get the keyword names
keyword_names = keywords_mlb.classes_

# Create a DataFrame
keywords_df = pd.DataFrame(keywords_encoded, columns=["keyword_"+name for name in keyword_names])

# Concatenate the keywords DataFrame with the original DataFrame
df = pd.concat([df, keywords_df], axis=1)


In [74]:
df = df.fillna(0)

In [75]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Assuming your DataFrame is named df, and the 'popularity' column is the target

# Features and target
X = df.drop(['popularity', 'genres', 'keywords'], axis=1)
y = df['popularity']

# Train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize linear regression model
lm = LinearRegression()

# Fit the model on training data
lm.fit(X_train, y_train)

# Predict on the testing data
y_pred = lm.predict(X_test)

# Evaluate model using RMSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 1312679115.918834
