In [3]:
import pandas as pd

In [4]:
# Read the CSV into a DataFrame
titles_df = pd.read_csv("./Data Files/titles_updated.csv")
titles_df

Unnamed: 0,Title,type,country,date_added,release_year,rating,description,IMDb,Genre
0,3%,TV Show,Brazil,2020-08-14,2020,TV-MA,In a future where the elite inhabit an island ...,7.4,Drama
1,7:19,Movie,Mexico,2016-12-23,2016,TV-MA,After a devastating earthquake hits Mexico Cit...,6.0,Drama
2,23:59,Movie,Singapore,2018-12-20,2011,R,"When an army recruit is found dead, his fellow...",4.7,Horror
3,9,Movie,United States,2017-11-16,2009,PG-13,"In a postapocalyptic world, rag-doll robots hi...",7.1,Action & Adventure
4,21,Movie,United States,2020-01-01,2008,PG-13,A brilliant group of students become card-coun...,6.8,Drama
...,...,...,...,...,...,...,...,...,...
3886,Zona Rosa,TV Show,Mexico,2019-11-26,2019,TV-MA,An assortment of talent takes the stage for a ...,5.8,Comedy
3887,Zoo,Movie,India,2018-07-01,2018,TV-MA,A drug dealer starts having doubts about his t...,6.8,Drama
3888,Zoom,Movie,United States,2020-01-11,2006,PG,"Dragged from civilian life, a former superhero...",4.4,Comedy
3889,Zubaan,Movie,India,2019-03-02,2015,TV-14,A scrappy but poor boy worms his way into a ty...,6.1,Drama


## Average ratings per genre

In [5]:
# Calculate IMDb rating averages per genre
genre_ratings = titles_df.groupby('Genre')['IMDb'].mean()
genre_ratings

Genre
Action & Adventure    6.144444
Anime                 7.664444
Children              6.468649
Comedy                6.343832
Crime                 7.410385
Documentaries         7.027664
Drama                 6.570829
Horror                5.521622
Musicals              7.375000
Mystery               8.650000
Reality               6.484416
Romance               7.078182
Sci-Fi & Fantasy      5.819048
Teen                  7.183333
Thrillers             5.850000
Name: IMDb, dtype: float64

In [6]:
# Create genre rating dataframe
genre_df = pd.DataFrame(genre_ratings)
genre_df

Unnamed: 0_level_0,IMDb
Genre,Unnamed: 1_level_1
Action & Adventure,6.144444
Anime,7.664444
Children,6.468649
Comedy,6.343832
Crime,7.410385
Documentaries,7.027664
Drama,6.570829
Horror,5.521622
Musicals,7.375
Mystery,8.65


In [7]:
# Rename the column
genre_df.rename(columns = {"IMDb":"Avg rating per genre"}, inplace = True)
genre_df

Unnamed: 0_level_0,Avg rating per genre
Genre,Unnamed: 1_level_1
Action & Adventure,6.144444
Anime,7.664444
Children,6.468649
Comedy,6.343832
Crime,7.410385
Documentaries,7.027664
Drama,6.570829
Horror,5.521622
Musicals,7.375
Mystery,8.65


In [8]:
# Merge the dataframes
titles_df = titles_df.merge(genre_df, how = 'right', on = 'Genre')
titles_df

Unnamed: 0,Title,type,country,date_added,release_year,rating,description,IMDb,Genre,Avg rating per genre
0,3%,TV Show,Brazil,2020-08-14,2020,TV-MA,In a future where the elite inhabit an island ...,7.4,Drama,6.570829
1,7:19,Movie,Mexico,2016-12-23,2016,TV-MA,After a devastating earthquake hits Mexico Cit...,6.0,Drama,6.570829
2,21,Movie,United States,2020-01-01,2008,PG-13,A brilliant group of students become card-coun...,6.8,Drama,6.570829
3,46,TV Show,Turkey,2017-07-01,2016,TV-MA,A genetics professor experiments with a treatm...,8.5,Drama,6.570829
4,1922,Movie,United States,2017-10-20,2017,TV-MA,A farmer pens a confession admitting to his wi...,6.3,Drama,6.570829
...,...,...,...,...,...,...,...,...,...,...
3886,The Disastrous Life of Saiki K.,TV Show,Japan,2019-03-01,2018,TV-14,High school sophomore Kusuo Saiki swore as a c...,8.3,Teen,7.183333
3887,The Irregular at Magic High School,TV Show,Japan,2017-11-15,2014,TV-14,Siblings Tatsuya and Miyuki enroll at Magic Hi...,7.2,Teen,7.183333
3888,Two Sentence Horror Stories,TV Show,United States,2019-09-20,2019,TV-14,This anthology series of terror features diver...,5.5,Teen,7.183333
3889,Monty Python's Flying Circus,TV Show,United Kingdom,2018-10-02,1974,TV-14,The Monty Python players make their mark with ...,8.8,Mystery,8.650000


In [9]:
# Convert column "Avg Rating" to integer
titles_df['Avg rating per genre'] = pd.to_numeric(titles_df['Avg rating per genre'])

## Average ratings per country

In [10]:
# Calculate IMDb rating averages per country
country_ratings = titles_df.groupby('country')['IMDb'].mean()
country_ratings

country
Argentina         6.409677
Australia         6.734848
Austria           6.285714
Bangladesh        7.400000
Belgium           6.977778
                    ...   
United Kingdom    6.860681
United States     6.524263
Uruguay           6.425000
Venezuela         7.000000
Vietnam           6.250000
Name: IMDb, Length: 71, dtype: float64

In [11]:
# Create genre rating dataframe
country_df = pd.DataFrame(country_ratings)
country_df

Unnamed: 0_level_0,IMDb
country,Unnamed: 1_level_1
Argentina,6.409677
Australia,6.734848
Austria,6.285714
Bangladesh,7.400000
Belgium,6.977778
...,...
United Kingdom,6.860681
United States,6.524263
Uruguay,6.425000
Venezuela,7.000000


In [12]:
# Rename the column
country_df.rename(columns = {"IMDb":"Avg rating per country"}, inplace = True)
country_df

Unnamed: 0_level_0,Avg rating per country
country,Unnamed: 1_level_1
Argentina,6.409677
Australia,6.734848
Austria,6.285714
Bangladesh,7.400000
Belgium,6.977778
...,...
United Kingdom,6.860681
United States,6.524263
Uruguay,6.425000
Venezuela,7.000000


In [13]:
# Merge the dataframes
titles_df = titles_df.merge(country_df, how = 'right', on = 'country')
titles_df

Unnamed: 0,Title,type,country,date_added,release_year,rating,description,IMDb,Genre,Avg rating per genre,Avg rating per country
0,3%,TV Show,Brazil,2020-08-14,2020,TV-MA,In a future where the elite inhabit an island ...,7.4,Drama,6.570829,6.494118
1,Kardec,Movie,Brazil,2019-08-29,2019,TV-14,"In Catholic 19th-century France, professor Léo...",6.2,Drama,6.570829,6.494118
2,Most Beautiful Thing,TV Show,Brazil,2020-06-19,2020,TV-MA,A 1950s housewife goes to Rio de Janeiro to me...,7.9,Drama,6.570829,6.494118
3,Nothing to Lose,Movie,Brazil,2018-07-20,2018,PG,This biographical drama traces the rise to fam...,5.5,Drama,6.570829,6.494118
4,Omniscient,TV Show,Brazil,2020-01-29,2020,TV-MA,"In a city where citizens are monitored 24/7, a...",6.3,Drama,6.570829,6.494118
...,...,...,...,...,...,...,...,...,...,...,...
3886,In Paradox,Movie,Kuwait,2020-05-07,2019,TV-14,"On the run from assailants, a man desperately ...",6.1,Sci-Fi & Fantasy,5.819048,6.750000
3887,The End,Movie,Kuwait,2020-05-17,2019,TV-MA,A wrongly accused man is pursued by a dogged i...,7.4,Comedy,6.343832,6.750000
3888,"Houston, We Have a Problem!",Movie,Slovenia,2017-07-20,2016,TV-14,"Blending fact with myth, this conspiracy-minde...",7.9,Comedy,6.343832,7.900000
3889,Servant of the People,TV Show,Ukraine,2017-03-01,2015,TV-14,After a Ukrainian high school teacher's tirade...,6.9,Comedy,6.343832,6.900000


In [14]:
# Convert column "Avg Rating" to integer
titles_df['Avg rating per country'] = pd.to_numeric(titles_df['Avg rating per country'])

## Average ratings per release year

In [15]:
# Calculate IMDb rating averages per release year
year_ratings = titles_df.groupby('release_year')['IMDb'].mean()
year_ratings

release_year
1942    6.100000
1943    7.100000
1944    6.250000
1945    6.933333
1946    7.400000
          ...   
2017    6.436285
2018    6.445727
2019    6.531196
2020    6.985106
2021    7.600000
Name: IMDb, Length: 63, dtype: float64

In [16]:
# Create year rating dataframe
year_df = pd.DataFrame(year_ratings)
year_df

Unnamed: 0_level_0,IMDb
release_year,Unnamed: 1_level_1
1942,6.100000
1943,7.100000
1944,6.250000
1945,6.933333
1946,7.400000
...,...
2017,6.436285
2018,6.445727
2019,6.531196
2020,6.985106


In [17]:
# Rename the column
year_df.rename(columns = {"IMDb":"Avg rating per release_year"}, inplace = True)
year_df

Unnamed: 0_level_0,Avg rating per release_year
release_year,Unnamed: 1_level_1
1942,6.100000
1943,7.100000
1944,6.250000
1945,6.933333
1946,7.400000
...,...
2017,6.436285
2018,6.445727
2019,6.531196
2020,6.985106


In [18]:
# Merge the dataframes
titles_df = titles_df.merge(year_df, how = 'right', on = 'release_year')
titles_df

Unnamed: 0,Title,type,country,date_added,release_year,rating,description,IMDb,Genre,Avg rating per genre,Avg rating per country,Avg rating per release_year
0,3%,TV Show,Brazil,2020-08-14,2020,TV-MA,In a future where the elite inhabit an island ...,7.4,Drama,6.570829,6.494118,6.985106
1,Most Beautiful Thing,TV Show,Brazil,2020-06-19,2020,TV-MA,A 1950s housewife goes to Rio de Janeiro to me...,7.9,Drama,6.570829,6.494118,6.985106
2,Omniscient,TV Show,Brazil,2020-01-29,2020,TV-MA,"In a city where citizens are monitored 24/7, a...",6.3,Drama,6.570829,6.494118,6.985106
3,Spectros,TV Show,Brazil,2020-02-20,2020,TV-MA,A group of teens get caught up in a supernatur...,4.4,Horror,5.521622,6.494118,6.985106
4,Rich in Love,Movie,Brazil,2020-04-30,2020,TV-14,Rich kid Teto is determined to prove himself t...,6.0,Comedy,6.343832,6.494118,6.985106
...,...,...,...,...,...,...,...,...,...,...,...,...
3886,Manoranjan,Movie,India,2017-09-01,1974,TV-14,"After getting fired, a police officer connects...",6.8,Comedy,6.343832,6.342549,7.066667
3887,Immoral Tales,Movie,France,2019-06-06,1974,UR,This anthology illustrates the timeless nature...,5.6,Drama,6.570829,6.472381,7.066667
3888,Monty Python's Flying Circus,TV Show,United Kingdom,2018-10-02,1974,TV-14,The Monty Python players make their mark with ...,8.8,Mystery,8.650000,6.860681,7.066667
3889,Professor,Movie,India,2017-07-01,1962,TV-PG,A college grad in need of money must disguise ...,7.0,Comedy,6.343832,6.342549,7.000000


In [19]:
# Convert column "Avg Rating" to integer
titles_df['Avg rating per release_year'] = pd.to_numeric(titles_df['Avg rating per release_year'])

## Combine all averages

In [20]:
titles_df['Combined Average Ratings'] = ((titles_df['Avg rating per genre'] +
                                        titles_df['Avg rating per country'] +
                                        titles_df['Avg rating per release_year'])/3)                               
titles_df

Unnamed: 0,Title,type,country,date_added,release_year,rating,description,IMDb,Genre,Avg rating per genre,Avg rating per country,Avg rating per release_year,Combined Average Ratings
0,3%,TV Show,Brazil,2020-08-14,2020,TV-MA,In a future where the elite inhabit an island ...,7.4,Drama,6.570829,6.494118,6.985106,6.683351
1,Most Beautiful Thing,TV Show,Brazil,2020-06-19,2020,TV-MA,A 1950s housewife goes to Rio de Janeiro to me...,7.9,Drama,6.570829,6.494118,6.985106,6.683351
2,Omniscient,TV Show,Brazil,2020-01-29,2020,TV-MA,"In a city where citizens are monitored 24/7, a...",6.3,Drama,6.570829,6.494118,6.985106,6.683351
3,Spectros,TV Show,Brazil,2020-02-20,2020,TV-MA,A group of teens get caught up in a supernatur...,4.4,Horror,5.521622,6.494118,6.985106,6.333615
4,Rich in Love,Movie,Brazil,2020-04-30,2020,TV-14,Rich kid Teto is determined to prove himself t...,6.0,Comedy,6.343832,6.494118,6.985106,6.607685
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3886,Manoranjan,Movie,India,2017-09-01,1974,TV-14,"After getting fired, a police officer connects...",6.8,Comedy,6.343832,6.342549,7.066667,6.584349
3887,Immoral Tales,Movie,France,2019-06-06,1974,UR,This anthology illustrates the timeless nature...,5.6,Drama,6.570829,6.472381,7.066667,6.703292
3888,Monty Python's Flying Circus,TV Show,United Kingdom,2018-10-02,1974,TV-14,The Monty Python players make their mark with ...,8.8,Mystery,8.650000,6.860681,7.066667,7.525783
3889,Professor,Movie,India,2017-07-01,1962,TV-PG,A college grad in need of money must disguise ...,7.0,Comedy,6.343832,6.342549,7.000000,6.562127


## Create the "Growth Outcomes" column

In [21]:
# Round to the 2nd decimal place
titles_df['Combined Average Ratings'] = titles_df['Combined Average Ratings'].apply(lambda x: round(x, 2))
titles_df

Unnamed: 0,Title,type,country,date_added,release_year,rating,description,IMDb,Genre,Avg rating per genre,Avg rating per country,Avg rating per release_year,Combined Average Ratings
0,3%,TV Show,Brazil,2020-08-14,2020,TV-MA,In a future where the elite inhabit an island ...,7.4,Drama,6.570829,6.494118,6.985106,6.68
1,Most Beautiful Thing,TV Show,Brazil,2020-06-19,2020,TV-MA,A 1950s housewife goes to Rio de Janeiro to me...,7.9,Drama,6.570829,6.494118,6.985106,6.68
2,Omniscient,TV Show,Brazil,2020-01-29,2020,TV-MA,"In a city where citizens are monitored 24/7, a...",6.3,Drama,6.570829,6.494118,6.985106,6.68
3,Spectros,TV Show,Brazil,2020-02-20,2020,TV-MA,A group of teens get caught up in a supernatur...,4.4,Horror,5.521622,6.494118,6.985106,6.33
4,Rich in Love,Movie,Brazil,2020-04-30,2020,TV-14,Rich kid Teto is determined to prove himself t...,6.0,Comedy,6.343832,6.494118,6.985106,6.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3886,Manoranjan,Movie,India,2017-09-01,1974,TV-14,"After getting fired, a police officer connects...",6.8,Comedy,6.343832,6.342549,7.066667,6.58
3887,Immoral Tales,Movie,France,2019-06-06,1974,UR,This anthology illustrates the timeless nature...,5.6,Drama,6.570829,6.472381,7.066667,6.70
3888,Monty Python's Flying Circus,TV Show,United Kingdom,2018-10-02,1974,TV-14,The Monty Python players make their mark with ...,8.8,Mystery,8.650000,6.860681,7.066667,7.53
3889,Professor,Movie,India,2017-07-01,1962,TV-PG,A college grad in need of money must disguise ...,7.0,Comedy,6.343832,6.342549,7.000000,6.56


In [22]:
# Dsiplay combined average ratings grouped in descending order of ratings
titles_df['Combined Average Ratings'].sort_values(ascending=True)

856     5.33
1523    5.71
3139    5.79
2402    5.81
3126    5.82
        ... 
205     7.33
204     7.33
3310    7.37
3536    7.51
3888    7.53
Name: Combined Average Ratings, Length: 3891, dtype: float64

In [41]:
# Create a new column called "Growth Outcomes" with a for loop to determine 0s and 1s for growth outcomes

result = [] 
for x in titles_df['Combined Average Ratings']: 
    if x >= 6.5: 
        result.append("1") 
    else: 
        result.append("0") 
        
titles_df["Growth Outcome"] = result

In [42]:
# Confirm that it worked
titles_df.groupby('Growth Outcome').size()

Growth Outcome
0    1823
1    2068
dtype: int64

In [43]:
# Drop the 5 average ratings and description columns so it doesn't overcrowd the machine learning model
titles_df = titles_df.drop(['Combined Average Ratings','Avg rating per genre','Avg rating per country',
                            'Avg rating per release_year', 'description'],axis = 'columns')
titles_df

Unnamed: 0,Title,type,country,date_added,release_year,rating,IMDb,Genre,Growth Outcome
0,3%,TV Show,Brazil,2020-08-14,2020,TV-MA,7.4,Drama,1
1,Most Beautiful Thing,TV Show,Brazil,2020-06-19,2020,TV-MA,7.9,Drama,1
2,Omniscient,TV Show,Brazil,2020-01-29,2020,TV-MA,6.3,Drama,1
3,Spectros,TV Show,Brazil,2020-02-20,2020,TV-MA,4.4,Horror,0
4,Rich in Love,Movie,Brazil,2020-04-30,2020,TV-14,6.0,Comedy,1
...,...,...,...,...,...,...,...,...,...
3886,Manoranjan,Movie,India,2017-09-01,1974,TV-14,6.8,Comedy,1
3887,Immoral Tales,Movie,France,2019-06-06,1974,UR,5.6,Drama,1
3888,Monty Python's Flying Circus,TV Show,United Kingdom,2018-10-02,1974,TV-14,8.8,Mystery,1
3889,Professor,Movie,India,2017-07-01,1962,TV-PG,7.0,Comedy,1


## Export final dataset for the machine learning model

In [44]:
# Use "Title" as index
titles_df = titles_df.set_index("Title")
titles_df

Unnamed: 0_level_0,type,country,date_added,release_year,rating,IMDb,Genre,Growth Outcome
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3%,TV Show,Brazil,2020-08-14,2020,TV-MA,7.4,Drama,1
Most Beautiful Thing,TV Show,Brazil,2020-06-19,2020,TV-MA,7.9,Drama,1
Omniscient,TV Show,Brazil,2020-01-29,2020,TV-MA,6.3,Drama,1
Spectros,TV Show,Brazil,2020-02-20,2020,TV-MA,4.4,Horror,0
Rich in Love,Movie,Brazil,2020-04-30,2020,TV-14,6.0,Comedy,1
...,...,...,...,...,...,...,...,...
Manoranjan,Movie,India,2017-09-01,1974,TV-14,6.8,Comedy,1
Immoral Tales,Movie,France,2019-06-06,1974,UR,5.6,Drama,1
Monty Python's Flying Circus,TV Show,United Kingdom,2018-10-02,1974,TV-14,8.8,Mystery,1
Professor,Movie,India,2017-07-01,1962,TV-PG,7.0,Comedy,1


In [45]:
# Export the final dataframe to a csv file
titles_df.to_csv("./Data Files/machine_learning_titles.csv")