In [1]:
# Data Science Libs
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# model preprocessing & metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

# algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

In [45]:
# Import the data - Ratings small
rat_df = pd.read_csv("moviedata/ratings_small.csv")
rat_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
rat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [49]:
rat_df["movieId"].unique().size

9066

In [101]:
df1 = rat_df.groupby("movieId").agg({"movieId" : "first", "rating":"mean"})
df1

Unnamed: 0_level_0,movieId,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,3.872470
2,2,3.401869
3,3,3.161017
4,4,2.384615
5,5,3.267857
...,...,...
161944,161944,5.000000
162376,162376,4.500000
162542,162542,5.000000
162672,162672,3.000000


In [57]:
avg_rat_9066mov = df1.iloc[:, [0, 2]]
avg_rat_9066mov 

Unnamed: 0,movieId,rating
0,1,3.872470
1,2,3.401869
2,3,3.161017
3,4,2.384615
4,5,3.267857
...,...,...
9061,161944,5.000000
9062,162376,4.500000
9063,162542,5.000000
9064,162672,3.000000


In [58]:
avg_rat_9066mov.to_csv("avg_rat_9066mov.csv", index=False)

In [79]:
avg_rat_9066mov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9066 entries, 0 to 9065
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9066 non-null   int64  
 1   rating   9066 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 141.8 KB


In [97]:
avg_rat_9066mov["movieId"].value_counts()

1         1
31963     1
31903     1
31921     1
31923     1
         ..
3770      1
3769      1
3768      1
3766      1
163949    1
Name: movieId, Length: 9066, dtype: int64

In [4]:
# Import the data - Credits
cred_df = pd.read_csv("moviedata/credits.csv")

cred_df = pd.DataFrame(cred_df)
cred_df.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [5]:
#credits file is too large to upload to git hub. 
#Extract the string in the columns to get data, then reduce size/take sample of file
cred_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [6]:
cred_df.cast.value_counts()

[]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [7]:
cred_df.crew.value_counts()

[]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [8]:
cred_df.iloc[0,:]

cast    [{'cast_id': 14, 'character': 'Woody (voice)',...
crew    [{'credit_id': '52fe4284c3a36847f8024f49', 'de...
id                                                    862
Name: 0, dtype: object

In [None]:
cred_df

In [None]:
# cred_df["cast_new"] = cred_df["cast"].apply(lambda x: [key for key in x if key != ['order', 'profile_path']])
# cred_df

In [None]:
# df = pd.concat([df.drop('cast', axis=1), df['cast'].apply(pd.Series)], axis=1).drop(0,1)
# df

In [None]:
# cred_sample = cred_df.sample(n=28000, random_state=42)
# cred_sample

In [None]:
# cred_sample.info()

In [None]:
# #Save to new csv file
# cred_sample.to_csv("credits_small.csv", index=False)

In [9]:
# Import the data - Keywords
keywordf = pd.read_csv("moviedata/keywords.csv")
keywordf.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [10]:
keywordf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


In [11]:
# Import the data - Links small
linkdf = pd.read_csv("moviedata/links_small.csv")
linkdf.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [12]:
linkdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9125 entries, 0 to 9124
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9125 non-null   int64  
 1   imdbId   9125 non-null   int64  
 2   tmdbId   9112 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 214.0 KB


In [25]:
# Import the data - Movies Metadata
#genre[0], budget, popularity, production company, original language, title, production country, 
# release date, revenue runtime, director, production[0]one hot encoding


movdf = pd.read_csv("moviedata/movies_metadata.csv")
movdf.head()

  movdf = pd.read_csv("moviedata/movies_metadata.csv")


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [14]:
movdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [26]:
movdf.drop(columns=["adult", "homepage", "poster_path", "status", "video"], inplace=True)
movdf.head()

Unnamed: 0,belongs_to_collection,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",,Toy Story,7.7,5415.0
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0


In [72]:
movdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45465 entries, 0 to 45465
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  4493 non-null   object 
 1   budget                 45465 non-null  object 
 2   genres                 45465 non-null  object 
 3   id                     45465 non-null  object 
 4   imdb_id                45448 non-null  object 
 5   original_language      45454 non-null  object 
 6   original_title         45465 non-null  object 
 7   overview               44511 non-null  object 
 8   popularity             45461 non-null  object 
 9   production_companies   45462 non-null  object 
 10  production_countries   45462 non-null  object 
 11  release_date           45378 non-null  object 
 12  revenue                45460 non-null  float64
 13  runtime                45203 non-null  float64
 14  spoken_languages       45460 non-null  object 
 15  ta

In [None]:
# #Save to new csv file
# movdf.to_csv("movies_clean.csv", index=False)

In [74]:
movdf["id"].unique().size

45435

In [77]:
movdf["id"].drop_duplicates()

0           862
1          8844
2         15602
3         31357
4         11862
          ...  
45461    439050
45462    111109
45463     67758
45464    227506
45465    461257
Name: id, Length: 45435, dtype: object

In [78]:
movdf["id"].astype(int)
movdf.info()

ValueError: invalid literal for int() with base 10: '2012-09-29'

In [66]:
movdf = movdf[movdf["id"] != '1997-08-20'] 

In [80]:
movdf["movieId"]=movdf["id"]
movdf.head()

Unnamed: 0,belongs_to_collection,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count,movieId
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",,Toy Story,7.7,5415.0,862
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,8844
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,15602
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,31357
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,11862


In [88]:
movdf["movieId"] = movdf["movieId"].apply(lambda x: int(x) if x.isdigit() else 0)

In [90]:
movdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45465 entries, 0 to 45465
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  4493 non-null   object 
 1   budget                 45465 non-null  object 
 2   genres                 45465 non-null  object 
 3   id                     45465 non-null  object 
 4   imdb_id                45448 non-null  object 
 5   original_language      45454 non-null  object 
 6   original_title         45465 non-null  object 
 7   overview               44511 non-null  object 
 8   popularity             45461 non-null  object 
 9   production_companies   45462 non-null  object 
 10  production_countries   45462 non-null  object 
 11  release_date           45378 non-null  object 
 12  revenue                45460 non-null  float64
 13  runtime                45203 non-null  float64
 14  spoken_languages       45460 non-null  object 
 15  ta

In [94]:
movdf2 = movdf.merge(avg_rat_9066mov, on="movieId", how="inner")
movdf2

In [95]:
movdf2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2831 entries, 0 to 2830
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  539 non-null    object 
 1   budget                 2831 non-null   object 
 2   genres                 2831 non-null   object 
 3   id                     2831 non-null   object 
 4   imdb_id                2831 non-null   object 
 5   original_language      2831 non-null   object 
 6   original_title         2831 non-null   object 
 7   overview               2810 non-null   object 
 8   popularity             2831 non-null   object 
 9   production_companies   2831 non-null   object 
 10  production_countries   2831 non-null   object 
 11  release_date           2830 non-null   object 
 12  revenue                2831 non-null   float64
 13  runtime                2828 non-null   float64
 14  spoken_languages       2831 non-null   object 
 15  tagl

In [103]:
movdf2.head()

Unnamed: 0,belongs_to_collection,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,production_companies,...,release_date,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count,movieId,rating
0,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",17.924927,"[{'name': 'Regency Enterprises', 'id': 508}, {...",...,1995-12-15,187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",A Los Angeles Crime Saga,Heat,7.7,1886.0,949,3.59375
1,"{'id': 645, 'name': 'James Bond Collection', '...",58000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,14.686036,"[{'name': 'United Artists', 'id': 60}, {'name'...",...,1995-11-16,352194034.0,130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",No limits. No fears. No substitutes.,GoldenEye,6.6,1194.0,710,1.5
2,,98000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",1408,tt0112760,en,Cutthroat Island,"Morgan Adams and her slave, William Shaw, are ...",7.284477,"[{'name': 'Le Studio Canal+', 'id': 183}, {'na...",...,1995-12-22,10017322.0,119.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",The Course Has Been Set. There Is No Turning B...,Cutthroat Island,5.7,137.0,1408,3.616279
3,,52000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",524,tt0112641,en,Casino,The life of the gambling paradise – Las Vegas ...,10.137389,"[{'name': 'Universal Pictures', 'id': 33}, {'n...",...,1995-11-22,116112375.0,178.0,"[{'iso_639_1': 'en', 'name': 'English'}]",No one stays at the top forever.,Casino,7.8,1343.0,524,3.555556
4,,16500000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",4584,tt0114388,en,Sense and Sensibility,"Rich Mr. Dashwood dies, leaving his second wif...",10.673167,"[{'name': 'Columbia Pictures Corporation', 'id...",...,1995-12-13,135000000.0,136.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Lose your heart and come to your senses.,Sense and Sensibility,7.2,364.0,4584,5.0


In [117]:
movdf2.genres.value_counts()

[{'id': 18, 'name': 'Drama'}]                                                                                                  279
[{'id': 35, 'name': 'Comedy'}]                                                                                                 160
[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]                                                                116
[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}]                                                                     72
[{'id': 35, 'name': 'Comedy'}, {'id': 10749, 'name': 'Romance'}]                                                                66
                                                                                                                              ... 
[{'id': 35, 'name': 'Comedy'}, {'id': 10749, 'name': 'Romance'}, {'id': 10769, 'name': 'Foreign'}]                               1
[{'id': 12, 'name': 'Adventure'}, {'id': 16, 'name': 'Animation'}, {'id': 14, 'name

In [104]:
movdf2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2831 entries, 0 to 2830
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  539 non-null    object 
 1   budget                 2831 non-null   object 
 2   genres                 2831 non-null   object 
 3   id                     2831 non-null   object 
 4   imdb_id                2831 non-null   object 
 5   original_language      2831 non-null   object 
 6   original_title         2831 non-null   object 
 7   overview               2810 non-null   object 
 8   popularity             2831 non-null   object 
 9   production_companies   2831 non-null   object 
 10  production_countries   2831 non-null   object 
 11  release_date           2830 non-null   object 
 12  revenue                2831 non-null   float64
 13  runtime                2828 non-null   float64
 14  spoken_languages       2831 non-null   object 
 15  tagl

In [108]:
df3 = movdf2.loc[:, ["budget", "popularity", "revenue", "vote_average","vote_count", "rating"]]
df3

Unnamed: 0,budget,popularity,revenue,vote_average,vote_count,rating
0,60000000,17.924927,187436818.0,7.7,1886.0,3.593750
1,58000000,14.686036,352194034.0,6.6,1194.0,1.500000
2,98000000,7.284477,10017322.0,5.7,137.0,3.616279
3,52000000,10.137389,116112375.0,7.8,1343.0,3.555556
4,16500000,10.673167,135000000.0,7.2,364.0,5.000000
...,...,...,...,...,...,...
2826,0,0.083371,0.0,6.5,2.0,3.250000
2827,0,2.302582,0.0,5.9,33.0,3.965517
2828,0,0.528657,0.0,6.0,5.0,4.166667
2829,0,0.803588,0.0,4.6,6.0,4.000000


In [113]:
y=df3["rating"]
X=df3.iloc[:,:-1]

In [114]:
X

Unnamed: 0,budget,popularity,revenue,vote_average,vote_count
0,60000000,17.924927,187436818.0,7.7,1886.0
1,58000000,14.686036,352194034.0,6.6,1194.0
2,98000000,7.284477,10017322.0,5.7,137.0
3,52000000,10.137389,116112375.0,7.8,1343.0
4,16500000,10.673167,135000000.0,7.2,364.0
...,...,...,...,...,...
2826,0,0.083371,0.0,6.5,2.0
2827,0,2.302582,0.0,5.9,33.0
2828,0,0.528657,0.0,6.0,5.0
2829,0,0.803588,0.0,4.6,6.0


In [115]:
y

0       3.593750
1       1.500000
2       3.616279
3       3.555556
4       5.000000
          ...   
2826    3.250000
2827    3.965517
2828    4.166667
2829    4.000000
2830    5.000000
Name: rating, Length: 2831, dtype: float64

In [16]:
movdf.isna().sum()

belongs_to_collection    40972
budget                       0
genres                       0
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
tagline                  25054
title                        6
vote_average                 6
vote_count                   6
dtype: int64

In [17]:
movdf.genres.value_counts()

[{'id': 18, 'name': 'Drama'}]                                                                                                         5000
[{'id': 35, 'name': 'Comedy'}]                                                                                                        3621
[{'id': 99, 'name': 'Documentary'}]                                                                                                   2723
[]                                                                                                                                    2442
[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]                                                                       1301
                                                                                                                                      ... 
[{'id': 28, 'name': 'Action'}, {'id': 18, 'name': 'Drama'}, {'id': 35, 'name': 'Comedy'}, {'id': 99, 'name': 'Documentary'}]             1
[{'id': 10752, 'name': 'War

In [18]:
movdf.belongs_to_collection.value_counts()

{'id': 415931, 'name': 'The Bowery Boys', 'poster_path': '/q6sA4bzMT9cK7EEmXYwt7PNrL5h.jpg', 'backdrop_path': '/foe3kuiJmg5AklhtD3skWbaTMf2.jpg'}                 29
{'id': 421566, 'name': 'Totò Collection', 'poster_path': '/4ayJsjC3djGwU9eCWUokdBWvdLC.jpg', 'backdrop_path': '/jaUuprubvAxXLAY5hUfrNjxccUh.jpg'}                 27
{'id': 645, 'name': 'James Bond Collection', 'poster_path': '/HORpg5CSkmeQlAolx3bKMrKgfi.jpg', 'backdrop_path': '/6VcVl48kNKvdXOZfJPdarlUGOsk.jpg'}               26
{'id': 96887, 'name': 'Zatôichi: The Blind Swordsman', 'poster_path': '/8Q31DAtmFJjhFTwQGXghBUCgWK2.jpg', 'backdrop_path': '/bY8gLImMR5Pr9PaG3ZpobfaAQ8N.jpg'}    26
{'id': 37261, 'name': 'The Carry On Collection', 'poster_path': '/2P0HNrYgKDvirV8RCdT1rBSJdbJ.jpg', 'backdrop_path': '/38tF1LJN7ULeZAuAfP7beaPMfcl.jpg'}          25
                                                                                                                                                                  ..
{'id': 456

In [20]:
movdf.production_companies.value_counts()

[]                                                                                                                                                                                                                                                                                                                                11875
[{'name': 'Metro-Goldwyn-Mayer (MGM)', 'id': 8411}]                                                                                                                                                                                                                                                                                 742
[{'name': 'Warner Bros.', 'id': 6194}]                                                                                                                                                                                                                                                                                              540
[{'name': 'Param

In [21]:
movdf.production_countries.value_counts()

[{'iso_3166_1': 'US', 'name': 'United States of America'}]                                                                                                          17851
[]                                                                                                                                                                   6282
[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]                                                                                                                     2238
[{'iso_3166_1': 'FR', 'name': 'France'}]                                                                                                                             1654
[{'iso_3166_1': 'JP', 'name': 'Japan'}]                                                                                                                              1356
                                                                                                                                                      

In [22]:
movdf.spoken_languages.value_counts()

[{'iso_639_1': 'en', 'name': 'English'}]                                                                                                                                                                                         22395
[]                                                                                                                                                                                                                                3829
[{'iso_639_1': 'fr', 'name': 'Français'}]                                                                                                                                                                                         1853
[{'iso_639_1': 'ja', 'name': '日本語'}]                                                                                                                                                                                              1289
[{'iso_639_1': 'it', 'name': 'Italiano'}]                                   

In [30]:
#Recommender: Demographic Filtering
#we need a metric to score or rate movie. Calculate the score for every movie
#Sort the scores and recommend the best rated movie to the users.

In [35]:
#Weighted rating - mean
c = movdf["vote_average"].mean()
c

5.618207215133889

In [36]:
# minimum votes required to be listed in the chart 
m= movdf['vote_count'].quantile(0.9)
m

160.0

In [37]:
# filter out the movies that qualify for the chart
q_movies = movdf.copy().loc[movdf['vote_count'] >= m]
q_movies.shape

(4555, 19)

In [38]:
def weighted_rating(x, m=m, c=c):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * c)

In [39]:
# Define a new feature 'score' and calculate its value with 'weighted_rating()'
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)

In [40]:
#Sort movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

#Print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(10)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


In [44]:
# Trending Now -- sort the dataset by the popularity column

pop= movdf.sort_values("popularity", ascending=False)

plt.figure(figsize=(12,4))
plt.barh(pop['title'].head(10),pop["popularity"].head(6), align='center', color='green')
plt.gca().invert_yaxis()
plt.xlabel("Popularity")
plt.title("Popular Movies")

TypeError: '<' not supported between instances of 'float' and 'str'