## Predicting Movie Success

In [3]:
#Dependencies
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder


In [4]:
# Path to the CSV export, resolved relative to the current working directory.
data = Path('movies_dataset_0.csv')

# Parse the file into a DataFrame and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer=data)
df.head(n=10)

Unnamed: 0,imdb_id,title,runtime,overview,rated,imdb_votes,popularity,imdb_rating,rotten_tomatoes_rating,metacritic_rating,...,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2,release_year,outcome
0,tt0094721,Beetlejuice,92 min,A newly dead New England couple seeks help fro...,PG,348874,941.557,7.5,86.0,70.0,...,Alec Baldwin,Geena Davis,Michael Keaton,Comedy,Fantasy,,Tim Burton,,1988,True
1,tt27682129,Prey,100 min,A young couple is compelled to leave their Chr...,R,233550,436.919,7.1,94.0,71.0,...,Amber Midthunder,Dakota Beavers,Dane DiLiegro,Action,Adventure,Horror,Dan Trachtenberg,,2024,False
2,tt0295701,xXx,124 min,Xander Cage is your standard adrenaline junkie...,PG-13,187525,369.083,5.8,48.0,48.0,...,Vin Diesel,Asia Argento,Marton Csokas,Action,Adventure,Thriller,Rob Cohen,,2002,False
3,tt4154756,Avengers: Infinity War,149 min,As the Avengers and their allies have continue...,PG-13,1226533,270.163,8.4,85.0,68.0,...,Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo,Action,Adventure,Sci-Fi,Anthony Russo,Joe Russo,2018,True
4,tt23778968,Thelma,116 min,When 93-year-old Thelma Post gets duped by a p...,Not Rated,37145,263.208,7.0,92.0,74.0,...,Eili Harboe,Kaya Wilkins,Henrik Rafaelsen,Drama,Fantasy,Horror,Joachim Trier,,2024,True
5,tt0078748,Alien,117 min,"During its return to the earth, commercial spa...",R,964356,231.091,8.5,93.0,89.0,...,Sigourney Weaver,Tom Skerritt,John Hurt,Horror,Sci-Fi,,Ridley Scott,,1979,True
6,tt27534307,Speak No Evil,97 min,When an American family is invited to spend th...,Not Rated,42374,226.591,6.6,84.0,78.0,...,Morten Burian,Sidsel Siem Koch,Fedja van Huêt,Drama,Horror,Thriller,Christian Tafdrup,,2024,False
7,tt2096673,Inside Out,95 min,"When 11-year-old Riley moves to a new city, he...",PG,818922,245.63,8.1,98.0,94.0,...,Amy Poehler,Bill Hader,Lewis Black,Animation,Adventure,Comedy,Pete Docter,Ronnie Del Carmen,2015,True
8,tt0816692,Interstellar,169 min,The adventures of a group of explorers who mak...,PG-13,2153343,217.639,8.7,73.0,74.0,...,Matthew McConaughey,Anne Hathaway,Jessica Chastain,Adventure,Drama,Sci-Fi,Christopher Nolan,,2014,True
9,tt0389722,30 Days of Night,113 min,This is the story of an isolated Alaskan town ...,R,190890,178.579,6.6,51.0,53.0,...,Josh Hartnett,Melissa George,Danny Huston,Action,Horror,Thriller,David Slade,,2007,False


In [3]:
# Show the column labels of the loaded DataFrame.
df.columns

Index(['imdb_id', 'title', 'release_year', 'runtime', 'genre', 'overview',
       'director', 'actors', 'rated', 'imdb_votes', 'popularity', 'budget',
       'revenue', 'star_1', 'star_2', 'star_3', 'genre_1', 'genre_2',
       'genre_3', 'director_1', 'director_2'],
      dtype='object')

In [4]:
# Remove the columns that are not label-encoded in the cells below. Every label
# appears exactly once, and the result is rebound to `df` rather than mutating
# the frame with inplace=True.
# NOTE(review): 'actors', 'director' and 'genre' look like the combined forms of
# the star_*/director_*/genre_* columns that are kept -- confirm.
columns_to_drop = [
    'imdb_id', 'title', 'actors', 'director', 'overview', 'genre',
    'revenue', 'imdb_votes', 'popularity',
]
df = df.drop(columns=columns_to_drop)

In [5]:
# Preview the first rows of the frame after the column drop.
df.head()

Unnamed: 0,release_year,runtime,rated,budget,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2
0,2020,116 min,R,65000000,Chris Hemsworth,Bryon Lerum,Ryder Lerum,Action,Crime,Thriller,Sam Hargrave,
1,2018,124 min,R,19800000,Sandra Bullock,Trevante Rhodes,John Malkovich,Horror,Mystery,Sci-Fi,Susanne Bier,
2,2004,146 min,R,70000000,Denzel Washington,Christopher Walken,Dakota Fanning,Action,Crime,Drama,Tony Scott,
3,2016,120 min,PG-13,110000000,Shailene Woodley,Theo James,Jeff Daniels,Action,Adventure,Mystery,Robert Schwentke,
4,1987,103 min,R,28000000,Eddie Murphy,Judge Reinhold,Jürgen Prochnow,Action,Comedy,Crime,Tony Scott,


In [6]:
def runtime_cleaner(string):
    """Convert a runtime value such as ``"116 min"`` to whole minutes.

    Parameters
    ----------
    string : str, int, float or None
        Raw runtime. Strings are expected to start with the number of minutes,
        optionally followed by a unit (``"92 min"``). Values that are already
        numeric are accepted so the cleaning step can be applied more than once.

    Returns
    -------
    int or float
        The minutes as an ``int``. ``float("nan")`` is returned when the value
        is missing (``None``/NaN) or does not start with a whole number
        (for example ``"N/A"`` or an empty string).
    """
    if isinstance(string, str):
        # split() without an argument tolerates leading, trailing and repeated
        # whitespace, unlike split(" ") which yields an empty first token.
        tokens = string.split()
        candidate = tokens[0] if tokens else ""
    else:
        candidate = string

    try:
        return int(candidate)
    except (TypeError, ValueError, OverflowError):
        # None -> TypeError; NaN, "" and non-numeric text -> ValueError;
        # infinity -> OverflowError. All are treated as "no runtime available".
        return float("nan")

# Run runtime_cleaner over every entry of 'runtime' and store the converted
# values back in the same column.
runtime_minutes = df['runtime'].map(runtime_cleaner)
df['runtime'] = runtime_minutes

In [7]:
# List the distinct values present in the 'rated' column.
df['rated'].unique()

array(['R', 'PG-13', 'TV-14', 'G', 'PG', 'Not Rated', 'Approved', 'TV-MA',
       'NC-17', 'Unrated', nan], dtype=object)

In [8]:
# Label-encode 'rated': every distinct value (NaN included) gets the position at
# which it first appears in the column.
rated_list = df['rated'].unique().tolist()
rated_codes = {rating: code for code, rating in enumerate(rated_list)}

# Map the whole column in one pass and assign it back to the frame. Calling
# .replace(..., inplace=True) on df['rated'] acts on an intermediate Series,
# which pandas does not guarantee to write through to `df`.
df['rated'] = df['rated'].map(rated_codes)


In [9]:
# The rating codes were already written by the previous cell, so repeating that
# loop here would change nothing; this cell only previews the result.
df.head()

Unnamed: 0,release_year,runtime,rated,budget,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2
0,2020,116,0.0,65000000,Chris Hemsworth,Bryon Lerum,Ryder Lerum,Action,Crime,Thriller,Sam Hargrave,
1,2018,124,0.0,19800000,Sandra Bullock,Trevante Rhodes,John Malkovich,Horror,Mystery,Sci-Fi,Susanne Bier,
2,2004,146,0.0,70000000,Denzel Washington,Christopher Walken,Dakota Fanning,Action,Crime,Drama,Tony Scott,
3,2016,120,1.0,110000000,Shailene Woodley,Theo James,Jeff Daniels,Action,Adventure,Mystery,Robert Schwentke,
4,1987,103,0.0,28000000,Eddie Murphy,Judge Reinhold,Jürgen Prochnow,Action,Comedy,Crime,Tony Scott,


In [10]:
# Label-encode the three star columns with one shared code table, so a given
# name receives the same integer whichever column it appears in.
star_columns = ['star_1', 'star_2', 'star_3']
all_stars = pd.concat([df[column] for column in star_columns], ignore_index=True)
stars_list = all_stars.tolist()

# Series.unique() keeps first-appearance order and reports NaN only once, so
# codes are numbered in the order names occur in star_1, then star_2, then
# star_3 -- without the quadratic "if star not in list" scan.
unique_stars_list = all_stars.unique().tolist()

print(len(unique_stars_list))

# One dict lookup per column replaces a full-column replace() call per distinct
# name; assigning the mapped column back avoids relying on inplace=True writing
# through an intermediate Series.
star_codes = {star: code for code, star in enumerate(unique_stars_list)}
for column in star_columns:
    df[column] = df[column].map(star_codes)

df.head()


1623


Unnamed: 0,release_year,runtime,rated,budget,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2
0,2020,116,0.0,65000000,0,474,1091,Action,Crime,Thriller,Sam Hargrave,
1,2018,124,0.0,19800000,1,475,982,Horror,Mystery,Sci-Fi,Susanne Bier,
2,2004,146,0.0,70000000,2,476,1092,Action,Crime,Drama,Tony Scott,
3,2016,120,1.0,110000000,3,477,531,Action,Adventure,Mystery,Robert Schwentke,
4,1987,103,0.0,28000000,4,478,1093,Action,Comedy,Crime,Tony Scott,


In [11]:
# Label-encode the three genre columns with one shared code table, so a given
# genre receives the same integer whichever column it appears in.
genre_columns = ['genre_1', 'genre_2', 'genre_3']
all_genres = pd.concat([df[column] for column in genre_columns], ignore_index=True)
genre_list = all_genres.tolist()

# Series.unique() keeps first-appearance order and reports NaN only once, so
# codes follow the order genres occur in genre_1, then genre_2, then genre_3.
unique_genre_list = all_genres.unique().tolist()

print(len(unique_genre_list))

# Map each column through the code table in a single pass and assign it back,
# instead of calling replace(..., inplace=True) once per genre on an
# intermediate Series.
genre_codes = {genre: code for code, genre in enumerate(unique_genre_list)}
for column in genre_columns:
    df[column] = df[column].map(genre_codes)

df.head()

32


Unnamed: 0,release_year,runtime,rated,budget,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2
0,2020,116,0.0,65000000,0,474,1091,0,12,24,Sam Hargrave,
1,2018,124,0.0,19800000,1,475,982,1,13,20,Susanne Bier,
2,2004,146,0.0,70000000,2,476,1092,0,12,18,Tony Scott,
3,2016,120,1.0,110000000,3,477,531,0,14,13,Robert Schwentke,
4,1987,103,0.0,28000000,4,478,1093,0,15,12,Tony Scott,


In [12]:
# Label-encode both director columns with one shared code table, so a given
# name receives the same integer whichever column it appears in.
director_columns = ['director_1', 'director_2']
all_directors = pd.concat([df[column] for column in director_columns], ignore_index=True)
director_list = all_directors.tolist()

# Series.unique() keeps first-appearance order and reports NaN only once, so a
# missing second director is encoded as one ordinary code of its own.
unique_director_list = all_directors.unique().tolist()

print(len(unique_director_list))

# Map each column through the code table in a single pass and assign it back,
# instead of calling replace(..., inplace=True) once per name on an
# intermediate Series.
director_codes = {director: code for code, director in enumerate(unique_director_list)}
for column in director_columns:
    df[column] = df[column].map(director_codes)

df.head()

594


Unnamed: 0,release_year,runtime,rated,budget,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2
0,2020,116,0.0,65000000,0,474,1091,0,12,24,0,527
1,2018,124,0.0,19800000,1,475,982,1,13,20,1,527
2,2004,146,0.0,70000000,2,476,1092,0,12,18,2,527
3,2016,120,1.0,110000000,3,477,531,0,14,13,3,527
4,1987,103,0.0,28000000,4,478,1093,0,15,12,2,527


## Separate the Features (X) from the Target (y)


 ## Split our data into training and testing

 ## Create a Logistic Regression Model

 ## Fit (train) our model using the training data

 ## Score the model using the test data

 ## Make predictions

## Calculate the Accuracy Score