In [1]:
# Import Dependencies
import pandas as pd

In [2]:
# Relative location of the movie scores CSV (resolved against the working directory)
csv_path = "Resources/movie_scores.csv"

# Load the file into a DataFrame, then preview its first rows
movies_df = pd.read_csv(filepath_or_buffer=csv_path)
movies_df.head()

Unnamed: 0,FILM,RottenTomatoes,RottenTomatoes_User,Metacritic,Metacritic_User,IMDB,Fandango_Stars,Fandango_Ratingvalue,RT_norm,RT_user_norm,...,IMDB_norm,RT_norm_round,RT_user_norm_round,Metacritic_norm_round,Metacritic_user_norm_round,IMDB_norm_round,Metacritic_user_vote_count,IMDB_user_vote_count,Fandango_votes,Fandango_Difference
0,Avengers: Age of Ultron (2015),74,86,66,7.1,7.8,5.0,4.5,3.7,4.3,...,3.9,3.5,4.5,3.5,3.5,4.0,1330,271107,14846,0.5
1,Cinderella (2015),85,80,67,7.5,7.1,5.0,4.5,4.25,4.0,...,3.55,4.5,4.0,3.5,4.0,3.5,249,65709,12640,0.5
2,Ant-Man (2015),80,90,64,8.1,7.8,5.0,4.5,4.0,4.5,...,3.9,4.0,4.5,3.0,4.0,4.0,627,103660,12055,0.5
3,Do You Believe? (2015),18,84,22,4.7,5.4,5.0,4.5,0.9,4.2,...,2.7,1.0,4.0,1.0,2.5,2.5,31,3136,1793,0.5
4,Hot Tub Time Machine 2 (2015),14,28,29,3.4,5.1,3.5,3.0,0.7,1.4,...,2.55,0.5,1.5,1.5,1.5,2.5,88,19560,1021,0.5


In [3]:
# Show the range of IMDB user vote counts (largest first, then smallest)
# so that sensible bin edges can be chosen
print(
    movies_df.loc[:, "IMDB_user_vote_count"].max(),
    movies_df.loc[:, "IMDB_user_vote_count"].min(),
    sep="\n",
)

334164
243


In [4]:
# Bin edges for IMDB vote counts: ten edges delimit nine consecutive ranges
bins = [
    0,
    2_499,
    4_999,
    9_999,
    14_999,
    19_999,
    29_999,
    49_999,
    99_999,
    350_000,
]

# Display names for the nine ranges, listed in the same order as the edges
group_labels = [
    "0 to 2.4k",
    "2.5k to 4.9k",
    "5k to 9k",
    "10k to 14k",
    "15k to 19k",
    "20k to 29k",
    "30k to 49k",
    "50k to 99k",
    "100k to 350k",
]

In [5]:
# Preview the binning: map each film's IMDB vote count to its labelled range.
# include_lowest=True closes the first interval on the left, so a count of
# exactly 0 falls into "0 to 2.4k" instead of becoming NaN.
pd.cut(
    movies_df["IMDB_user_vote_count"],
    bins,
    labels=group_labels,
    include_lowest=True,
).head()

0    100k to 350k
1      50k to 99k
2    100k to 350k
3    2.5k to 4.9k
4      15k to 19k
Name: IMDB_user_vote_count, dtype: category
Categories (9, object): ['0 to 2.4k' < '2.5k to 4.9k' < '5k to 9k' < '10k to 14k' ... '20k to 29k' < '30k to 49k' < '50k to 99k' < '100k to 350k']

In [6]:
# Store the binned vote counts as a new categorical column on the DataFrame.
# include_lowest=True closes the first interval on the left, so a count of
# exactly 0 falls into "0 to 2.4k" instead of becoming NaN.
movies_df["IMDB User Votes Group"] = pd.cut(
    movies_df["IMDB_user_vote_count"],
    bins,
    labels=group_labels,
    include_lowest=True,
)
movies_df.head()

Unnamed: 0,FILM,RottenTomatoes,RottenTomatoes_User,Metacritic,Metacritic_User,IMDB,Fandango_Stars,Fandango_Ratingvalue,RT_norm,RT_user_norm,...,RT_norm_round,RT_user_norm_round,Metacritic_norm_round,Metacritic_user_norm_round,IMDB_norm_round,Metacritic_user_vote_count,IMDB_user_vote_count,Fandango_votes,Fandango_Difference,IMDB User Votes Group
0,Avengers: Age of Ultron (2015),74,86,66,7.1,7.8,5.0,4.5,3.7,4.3,...,3.5,4.5,3.5,3.5,4.0,1330,271107,14846,0.5,100k to 350k
1,Cinderella (2015),85,80,67,7.5,7.1,5.0,4.5,4.25,4.0,...,4.5,4.0,3.5,4.0,3.5,249,65709,12640,0.5,50k to 99k
2,Ant-Man (2015),80,90,64,8.1,7.8,5.0,4.5,4.0,4.5,...,4.0,4.5,3.0,4.0,4.0,627,103660,12055,0.5,100k to 350k
3,Do You Believe? (2015),18,84,22,4.7,5.4,5.0,4.5,0.9,4.2,...,1.0,4.0,1.0,2.5,2.5,31,3136,1793,0.5,2.5k to 4.9k
4,Hot Tub Time Machine 2 (2015),14,28,29,3.4,5.1,3.5,3.0,0.7,1.4,...,0.5,1.5,1.5,1.5,2.5,88,19560,1021,0.5,15k to 19k


In [7]:
# Create a GroupBy object based upon "IMDB User Votes Group".
# The grouping column is categorical (it comes from pd.cut), so pass
# observed=False explicitly: every vote-count bin stays in the result even if
# no film falls into it, and newer pandas versions do not emit a FutureWarning
# about the changing default.
imdb_group = movies_df.groupby("IMDB User Votes Group", observed=False)

# Find how many rows fall into each bin
print(imdb_group["IMDB"].count())

# Get the average of each of the first 5 rating columns within the GroupBy object
imdb_group[["RottenTomatoes", "RottenTomatoes_User", "Metacritic", "Metacritic_User", "IMDB"]].mean()

IMDB User Votes Group
0 to 2.4k       18
2.5k to 4.9k    12
5k to 9k        16
10k to 14k      15
15k to 19k      18
20k to 29k      16
30k to 49k      19
50k to 99k      16
100k to 350k    16
Name: IMDB, dtype: int64


Unnamed: 0_level_0,RottenTomatoes,RottenTomatoes_User,Metacritic,Metacritic_User,IMDB
IMDB User Votes Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0 to 2.4k,81.166667,71.111111,70.722222,7.166667,7.027778
2.5k to 4.9k,46.416667,62.75,49.916667,6.341667,6.45
5k to 9k,61.5,67.125,60.125,7.35625,6.96875
10k to 14k,64.466667,60.6,61.466667,6.4,6.646667
15k to 19k,41.444444,49.555556,45.833333,5.411111,5.983333
20k to 29k,62.375,61.5625,60.6875,6.35625,6.7625
30k to 49k,55.421053,60.789474,54.842105,6.168421,6.626316
50k to 99k,67.875,69.375,63.125,6.71875,7.01875
100k to 350k,64.5,73.0,61.375,6.825,7.15
