In [1]:
import pandas as pd
import numpy as np
from google.colab import drive

#Mount google drive to terminal

drive.mount('/content/gdrive')
%cd "gdrive/MyDrive/CIS 3920"

df_1 = pd.read_csv("tmdb_5000_movies.csv")
df_2 = pd.read_csv("TMDB_movie_dataset_v11.csv")

#print("Movie Dataset 1:\n")
#print(df_1.head())
#print("Movie Dataset 2:\n")
#print(df_2.head())

# Clean each dataset to only include relevant info

df_1 = df_1[['title', 'budget', 'original_language', 'release_date', 'revenue', 'runtime']]
#print("\nMovie Dataset 1:\n")
#print(df_1.head())
df_2 = df_2[['title', 'genres', 'adult']]
#print("\nMovie Dataset 2:\n")
#print(df_2.head())

# Merge datasets and clean

movies_df = pd.merge(df_1, df_2, on='title', how='left')
movies_df = movies_df.drop_duplicates(subset=['title'])
movies_df['is_adult?'] = np.where(
    movies_df['adult'].astype(str).str.lower() == 'True',
    1,
    0
)
movies_df.drop(columns=['adult'], inplace=True)
movies_df['release_date'] = pd.to_datetime(movies_df['release_date'], errors='coerce')
movies_df['release_date'] = movies_df['release_date'].dt.month_name()
print("\nMovies Dataframe:\n")
#print(movies_df)

# Create new "success/failure" column

movies_df['is_success?'] = (movies_df['revenue'] >= (movies_df['budget'] * 2)).astype(int)
print(movies_df)




Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/CIS 3920

Movies Dataframe:

                                          title     budget original_language  \
0                                        Avatar  237000000                en   
6      Pirates of the Caribbean: At World's End  300000000                en   
7                                       Spectre  245000000                en   
17                        The Dark Knight Rises  250000000                en   
18                                  John Carter  260000000                en   
...                                         ...        ...               ...   
17769                               El Mariachi     220000                es   
17770                                 Newlyweds       9000                en   
17772                 Signed, Sealed, Delivered          0                en   
17773           

# Split all columns into 0/1 categories

## Genre into 0/1

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer

# Created a function to make all the genres atttrubutes a 'list' datatype in order for MLB^^^ to work

def clean_genres(x):
    if isinstance(x, str):
        return [g.strip() for g in x.split(',') if g.strip()]
    else:
        return []

movies_df['genres'] = movies_df['genres'].apply(clean_genres)

movies_df['genres'].apply(type).value_counts()
movies_df



Unnamed: 0,title,budget,original_language,release_date,revenue,runtime,genres,is_adult?,is_success?
0,Avatar,237000000,en,December,2787965087,162.0,"[Action, Adventure, Fantasy, Science Fiction]",0,1
6,Pirates of the Caribbean: At World's End,300000000,en,May,961000000,169.0,"[Adventure, Fantasy, Action]",0,1
7,Spectre,245000000,en,October,880674609,148.0,"[Action, Adventure, Thriller]",0,1
17,The Dark Knight Rises,250000000,en,July,1084939099,165.0,"[Action, Crime, Drama, Thriller]",0,1
18,John Carter,260000000,en,March,284139100,132.0,"[Action, Adventure, Science Fiction]",0,0
...,...,...,...,...,...,...,...,...,...
17769,El Mariachi,220000,es,September,2040920,81.0,"[Action, Crime, Thriller]",0,1
17770,Newlyweds,9000,en,December,0,85.0,"[Comedy, Romance, Drama]",0,0
17772,"Signed, Sealed, Delivered",0,en,October,0,120.0,"[Comedy, Drama, Romance, TV Movie]",0,1
17773,Shanghai Calling,0,en,May,0,98.0,"[Romance, Comedy, Drama]",0,1


In [3]:
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(movies_df['genres'])

genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)
movies_df = movies_df.join(genres_df)
movies_df = movies_df.drop(columns=['genres'])
movies_df.head()

Unnamed: 0,title,budget,original_language,release_date,revenue,runtime,is_adult?,is_success?,Action,Adventure,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Avatar,237000000,en,December,2787965087,162.0,0,1,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,Pirates of the Caribbean: At World's End,300000000,en,May,961000000,169.0,0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Spectre,245000000,en,October,880674609,148.0,0,1,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
17,The Dark Knight Rises,250000000,en,July,1084939099,165.0,0,1,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,John Carter,260000000,en,March,284139100,132.0,0,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## original_language 0/1

In [4]:
movies_df.insert(loc=8, column='is_english?', value=(movies_df['original_language'] == 'en').astype(int))
movies_df.drop(columns=['original_language'], inplace=True)
movies_df.head()

Unnamed: 0,title,budget,release_date,revenue,runtime,is_adult?,is_success?,is_english?,Action,Adventure,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Avatar,237000000,December,2787965087,162.0,0,1,1,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,Pirates of the Caribbean: At World's End,300000000,May,961000000,169.0,0,1,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Spectre,245000000,October,880674609,148.0,0,1,1,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
17,The Dark Knight Rises,250000000,July,1084939099,165.0,0,1,1,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,John Carter,260000000,March,284139100,132.0,0,0,1,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## Release