# Setup

In [1]:
#dependecies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as sts

#links for project
# Brainstorming: https://docs.google.com/document/d/12Z4DCdVAte6-cMMAv_EZIdZdDpvUzxOuDvNcNFio4PM/edit

# Importing/Merging DFs

In [2]:
#variables for forloop
import_filenames = ['disney_plus_content_1.csv', 'disney_plus_content_2.csv', 'movies_per_streaming.csv', 'shows_per_streaming.csv']

#variable to store all df data
df_list = []

#importing data
for path in import_filenames:
    df_list.append(pd.read_csv(f"Resources/{path}"))

We did a manual review of all the columns in our 10+ datasets and decided which columns to keep:

- Netflix
- Disney
- Hulu
------
- Title
- Type (movie vs show)
- Genre
- Rated (PG13)
- Starring/Cast
- Directors
- Year relased
- Age
- Awards
- Language
- Country
- Date_Added
- Release_Year
- Rotten Tomatoes
- IMDb
- Imdb_Rating
- Imdb_Votes
- Metascore
- Box Office
- Budget
- Running Time

### Precleaning

In [3]:
#defining function for dropping columns
def dropper(df, drop_columns):
    df = df.drop(drop_columns, axis = 1)
    return df

In [4]:
#variable for loop
df_column_list = [
    ['Imdb_Id', 'Plot', 'Released_At', 'Writer'],
    ['Show_Id', 'Description'],
    ['Unnamed: 0', 'Id', 'Netflix', 'Hulu', 'Prime Video'],
    ['Unnamed: 0', 'Id', 'Netflix', 'Hulu', 'Prime Video']
]

In [5]:
#capitalizing and then dropping unnecessary columns
for i in range(len(df_list)):
    df_list[i].columns = map(str.title, df_list[i].columns)
    df_list[i] = dropper(df_list[i], df_column_list[i])

In [6]:
#manually slicing of df2,3
for i in range(2,4):
    df_list[i] = df_list[i].loc[df_list[i]['Disney+'] == 1]
    df_list[i] = dropper(df_list[i], ['Disney+'])

### Merging DFs

In [7]:
#merging all dfs
merged_df_01 = pd.merge(df_list[0], df_list[1], on = 'Title', how = 'outer')
merged_df_34 = pd.merge(df_list[2], df_list[3], on = 'Title', how = 'outer')
merged_df = pd.merge(merged_df_01, merged_df_34, on = 'Title', how = 'outer')
merged_df.head()

Unnamed: 0,Title,Type_x_x,Rated,Year,Added_At,Runtime_x,Genre,Director_x,Actors,Language_x,...,Directors,Genres,Country,Language_y,Runtime_y,Year_y,Age_y,Imdb_y,Rotten Tomatoes_y,Type_y_y
0,10 Things I Hate About You,movie,PG-13,1999,"November 12, 2019",97 min,"Comedy, Drama, Romance",Gil Junger,"Heath Ledger, Julia Stiles, Joseph Gordon-Levi...","English, French",...,,,,,,2009.0,16+,7.3/10,67/100,1.0
1,101 Dalmatian Street,series,,2018–,"February 28, 2020",,"Animation, Comedy, Family",,"Josh Brener, Michaela Dietz, Bert Davis, Abiga...",English,...,,,,,,2019.0,7+,6.3/10,44/100,1.0
2,101 Dalmatians,movie,G,1996,"November 12, 2019",103 min,"Adventure, Comedy, Crime, Family",Stephen Herek,"Glenn Close, Jeff Daniels, Joely Richardson, J...","English, Spanish",...,,,,,,1997.0,,,21/100,1.0
3,101 Dalmatians 2: Patch's London Adventure,movie,G,2002,"November 12, 2019",74 min,"Animation, Adventure, Comedy, Family, Musical","Jim Kammerud, Brian Smith","Barry Bostwick, Jason Alexander, Martin Short,...",English,...,,,,,,,,,,
4,102 Dalmatians,movie,G,2000,"November 12, 2019",100 min,"Adventure, Comedy, Family",Kevin Lima,"Glenn Close, Gérard Depardieu, Ioan Gruffudd, ...",English,...,Kevin Lima,"Adventure,Comedy,Family","United States,United Kingdom,France",English,100.0,,,,,


In [8]:
#saving file for manual check
merged_df.to_csv('Resources/raw_merge_df.csv')

# Cleaning DF

In [9]:
#checking columns
print(merged_df.columns)

Index(['Title', 'Type_x_x', 'Rated', 'Year', 'Added_At', 'Runtime_x', 'Genre',
       'Director_x', 'Actors', 'Language_x', 'Country_x', 'Awards',
       'Metascore', 'Imdb_Rating', 'Imdb_Votes', 'Type_y_x', 'Director_y',
       'Cast', 'Country_y', 'Date_Added', 'Release_Year', 'Rating', 'Duration',
       'Listed_In', 'Year_x', 'Age_x', 'Imdb_x', 'Rotten Tomatoes_x',
       'Type_x_y', 'Directors', 'Genres', 'Country', 'Language_y', 'Runtime_y',
       'Year_y', 'Age_y', 'Imdb_y', 'Rotten Tomatoes_y', 'Type_y_y'],
      dtype='object')


Columns to combine
- Title', 
- Type_x_x', 'Type_y_x', Type_x_y, Type_y_y'
- Year', 'Release_Year' 'Year_x', 'Year_y'
- Rated', 'Rating'
- Age_x', 'Age_y'
- Added_At', 'Date_Added'
- Runtime_x', 'Duration', Runtime_y'
- Genre','Listed_In', 'Genres'
- Director_x','Director_y', 'Directors'
- Actors', 'Cast'
- Language_x', 'Language_y
- Country_x', 'Country_y', 'Country'
- Awards',
- Metascore',
- Imdb_x', 'Imdb_y
- Imdb_Rating',
- Imdb_Votes',
- Rotten Tomatoes_x', 'Rotten Tomatoes_y

### Functions

In [10]:
#writing a function that cleanly combines 2 columns
def combined_2_columns(df, anchor_column, anchor_column_new_name, collapse_column, dropped_list, drop):
    '''
    anchor_column is the column series that you want everything to be compared to 
    anchor_column_new_name is the new name of that column
    collapse_column is the list of columns to combine
    dropped_list is an empty list that will track which values have NaN, used for debugging
    drop whether or not to drop the values
    '''
    # populate relevant column, and if no values then drop
    for i in range(len(df[anchor_column])):
        if (pd.isnull(df[anchor_column][i])):
            if (pd.isnull(df[collapse_column][i])):
                dropped_list.append(i)
                if (drop == True):
                    df = df.drop(i, axis = 0)
            else:
                df.loc[i, anchor_column] = df.loc[i, collapse_column]

    # fixing columns
    df.reset_index(inplace = True)
    df.drop([collapse_column, 'index'], axis = 1, inplace = True)
    df.rename(columns = {anchor_column : anchor_column_new_name}, inplace = True)
    if (drop == True):
        return print(f"Cleaning {anchor_column_new_name}, dropped {len(dropped_list)}.")
    else:
        return print(f"Cleaning {anchor_column_new_name}, has {len(dropped_list)} NaN's.")

In [11]:
#writing functions for 3 columns
def combined_3_columns(df, anchor_column, anchor_column_new_name, collapse_column, dropped_list, drop):
    '''
    anchor_column is the column series that you want everything to be compared to 
    anchor_column_new_name is the new name of that column
    collapse_column is the list of columns to combine
    dropped_list is an empty list that will track which values have NaN, used for debugging
    drop whether or not to drop the values
    '''
    # populate relevant column, and if no values then drop
    for i in range(len(df[anchor_column])):
        if (pd.isnull(df[anchor_column][i])):
            if (pd.isnull(df[collapse_column[0]][i])):
                if (pd.isnull(df[collapse_column[1]][i])):
                    dropped_list.append(i)
                    if (drop == True):
                        df = df.drop(i, axis = 0)
                else:
                    df.loc[i, anchor_column] = df.loc[i, collapse_column[1]]
            else:
                df.loc[i, anchor_column] = df.loc[i, collapse_column[0]]

    # fixing columns
    df.reset_index(inplace = True)
    collapse_column.append('index')
    df.drop(collapse_column, axis = 1, inplace = True)
    df.rename(columns = {anchor_column : anchor_column_new_name}, inplace = True)
    if (drop == True):
        return print(f"Cleaning {anchor_column_new_name}, dropped {len(dropped_list)}.")
    else:
        return print(f"Cleaning {anchor_column_new_name}, has {len(dropped_list)} NaN's.")

In [12]:
#writing functions for 4 columns
def combined_4_columns(df, anchor_column, anchor_column_new_name, collapse_column, dropped_list, drop):
    '''
    anchor_column is the column series that you want everything to be compared to 
    anchor_column_new_name is the new name of that column
    collapse_column is the list of columns to combine
    dropped_list is an empty list that will track which values have NaN, used for debugging
    drop whether or not to drop the values
    '''
    # populate relevant column, and if no values then drop
    for i in range(len(df[anchor_column])):
        if (pd.isnull(df[anchor_column][i])):
            if (pd.isnull(df[collapse_column[0]][i])):
                if (pd.isnull(df[collapse_column[1]][i])):
                    if (pd.isnull(df[collapse_column[2]][i])):
                        dropped_list.append(i)
                        if (drop == True):
                            df = df.drop(i, axis = 0)
                    else:
                        df.loc[i, anchor_column] = df.loc[i, collapse_column[2]]
                else:
                    df.loc[i, anchor_column] = df.loc[i, collapse_column[1]]
            else:
                df.loc[i, anchor_column] = df.loc[i, collapse_column[0]]

    # fixing columns
    df.reset_index(inplace = True)
    collapse_column.append('index')
    df.drop(collapse_column, axis = 1, inplace = True)
    df.rename(columns = {anchor_column : anchor_column_new_name}, inplace = True)
    if (drop == True):
        return print(f"Cleaning {anchor_column_new_name}, dropped {len(dropped_list)}.")
    else:
        return print(f"Cleaning {anchor_column_new_name}, has {len(dropped_list)} NaN's.")

### Condensing 2-Columns

In [13]:
#variables for for loop
anchor_ls = ['Age_x', 'Rated', 'Added_At', 'Rotten Tomatoes_x', 'Actors', 'Language_x']
anchor_name_ls = ['Age Advisory Rating', 'Advisory Rating', 'Available to Public on', 'Rotten Tomatoes Score', 'Cast', 'Available Languages']
column_ls = ['Age_y', 'Rating', 'Date_Added', 'Rotten Tomatoes_y', 'Cast', 'Language_y']
boolean_ls = [False, False, False, False, False, False]

In [14]:
#Age, Rating, Date Added, Rotten, IMDB, Cast, Languages
for i in range(len(anchor_ls)):
    dropped_list = [] #reseting for every df
    combined_2_columns(merged_df, anchor_ls[i], anchor_name_ls[i], column_ls[i], dropped_list, boolean_ls[i])

Cleaning Age Advisory Rating, has 873 NaN's.
Cleaning Advisory Rating, has 316 NaN's.
Cleaning Available to Public on, has 168 NaN's.
Cleaning Rotten Tomatoes Score, has 586 NaN's.
Cleaning Cast, has 387 NaN's.
Cleaning Available Languages, has 705 NaN's.


In [15]:
print(merged_df.columns)

Index(['Title', 'Type_x_x', 'Advisory Rating', 'Year',
       'Available to Public on', 'Runtime_x', 'Genre', 'Director_x', 'Cast',
       'Available Languages', 'Country_x', 'Awards', 'Metascore',
       'Imdb_Rating', 'Imdb_Votes', 'Type_y_x', 'Director_y', 'Country_y',
       'Release_Year', 'Duration', 'Listed_In', 'Year_x',
       'Age Advisory Rating', 'Imdb_x', 'Rotten Tomatoes Score', 'Type_x_y',
       'Directors', 'Genres', 'Country', 'Runtime_y', 'Year_y', 'Imdb_y',
       'Type_y_y'],
      dtype='object')


### Condensing 3-Columns

In [16]:
#variables for for loop
anchor_ls = ['Runtime_x', 'Genre', 'Director_x', 'Country_x', 'Imdb_Rating']
anchor_name_ls = ['Offering Duration', 'Genre', 'Director', 'Country', 'IMDB Score']
column_ls = [['Duration', 'Runtime_y'], ['Listed_In', 'Genres'], ['Director_y', 'Directors'], ['Country_y', 'Country'], ['Imdb_x', 'Imdb_y']]
boolean_ls = [False, True, False, False, False]

In [17]:
#Runtime, Genre, Director, Country
for i in range(len(anchor_ls)):
    dropped_list = [] #reseting for every df
    combined_3_columns(merged_df, anchor_ls[i], anchor_name_ls[i], column_ls[i], dropped_list, boolean_ls[i])

Cleaning Offering Duration, has 189 NaN's.
Cleaning Genre, dropped 169.
Cleaning Director, has 678 NaN's.
Cleaning Country, has 329 NaN's.
Cleaning IMDB Score, has 529 NaN's.


In [18]:
print(merged_df.columns)

Index(['Title', 'Type_x_x', 'Advisory Rating', 'Year',
       'Available to Public on', 'Offering Duration', 'Genre', 'Director',
       'Cast', 'Available Languages', 'Country', 'Awards', 'Metascore',
       'IMDB Score', 'Imdb_Votes', 'Type_y_x', 'Release_Year', 'Listed_In',
       'Year_x', 'Age Advisory Rating', 'Rotten Tomatoes Score', 'Type_x_y',
       'Genres', 'Year_y', 'Type_y_y'],
      dtype='object')


### Condensing 4-Columns

In [19]:
#finding unique categorical values for Type
print(f"Unique categorical values are {merged_df['Type_x_x'].append(merged_df['Type_y_x']).append(merged_df['Type_x_y']).append(merged_df['Type_y_y']).unique()}")

#standardizing values
merged_df['Type_x_y'].replace(0, 'Movie', inplace = True)
merged_df['Type_y_y'].replace(1, 'TV Show', inplace = True)
merged_df['Type_x_x'].replace(['Series', 'Episode'], ['TV Show','TV Show'], inplace = True)

Unique categorical values are ['movie' 'series' nan 'episode' 'Movie' 'TV Show' 0.0 1.0]


In [20]:
#variables for for loop
anchor_ls = ['Type_x_x', 'Year']
anchor_name_ls = ['Offering Type', 'Year Released']
column_ls = [['Type_y_x', 'Type_x_y', 'Type_y_y'], ['Release_Year', 'Year_x', 'Year_y']]
boolean_ls = [True, False]

In [21]:
#Type and Release Year
for i in range(len(anchor_ls)):
    dropped_list = [] #reseting for every df
    combined_4_columns(merged_df, anchor_ls[i], anchor_name_ls[i], column_ls[i], dropped_list, boolean_ls[i])

Cleaning Offering Type, dropped 98.
Cleaning Year Released, has 98 NaN's.


In [22]:
print(merged_df.columns)

Index(['Title', 'Type_x_x', 'Advisory Rating', 'Year Released',
       'Available to Public on', 'Offering Duration', 'Genre', 'Director',
       'Cast', 'Available Languages', 'Country', 'Awards', 'Metascore',
       'IMDB Score', 'Imdb_Votes', 'Type_y_x', 'Listed_In',
       'Age Advisory Rating', 'Rotten Tomatoes Score', 'Type_x_y', 'Genres',
       'Type_y_y'],
      dtype='object')


### Post Cleaning

In [None]:
#after manually reviewing file, still some cleaning to do
#manual adjustment
merged_df.drop(['Type_y_x', 'Listed_In', 'Type_x_y', 'Genres', 'Type_y_y'], axis = 1, inplace = True)
merged_df.rename(columns = {'Type_x_x' : 'Offering Medium', 'Imdb_Votes' : 'IMDB Votes'}, inplace = True)

In [None]:
#dropping the NaNs that didn't get picked up
for i in range(len(merged_df['Title'])):
    if (pd.isnull(merged_df.loc[i, 'Title']) | pd.isnull(merged_df.loc[i, 'Genre'])):
        merged_df.drop(i, axis = 0, inplace = True)

In [44]:
merged_df['Year Released'].values

array(['1999', '2018–', '1996', '2002', '2000', '2011', '1954', '1998',
       '2020', '1995', '1995', '1946', '2002', '1977', '2018', '2019',
       '1992', '1996', '1994', '1951', '2010', '2005', '1949', '2000',
       '1962', '1989–', '2004', '2015–', '1981', '2000', '2017–2019',
       '1999', '2019', '2004', '2017', '2003', '2001', '2018',
       '2011–2016', '2009', '1932', '1961', '1942', '2006', '2018',
       '1939', '1955', '1991', '1998', '1997', '2005', '1971', '2008',
       '1955', '2016', '1987', '2015–2016', '1988', '2018–', '2014',
       '2017–', '2009', '1968', '1994', '2018–', '1938', '2008',
       '1993–1994', '2003', '1993–2000', '2011–', '2012', '1997', '2017',
       '1998', '2003', '2006', '2005', '2018–', '2008', '2002', '1994',
       '2008', '2010', '1999', '1977', '1941', '2006', '2011', '2017',
       '2008', '2010', '2008', '2013', '2012', '2008', '2009', '2013',
       '1986', '2003', '1989', '1941', '2005', '1988–1990', '1947',
       '1950', '2001', '

In [32]:
merged_df['Year Released'].replace('–', '')

0       1999
1      2018–
2       1996
3       2002
4       2000
       ...  
987     2014
988     1999
989     2001
990     2004
991     2016
Name: Year Released, Length: 885, dtype: object

In [None]:
#saving file for manual check
merged_df.to_csv('clean_merge_df.csv')

# Summary Statistics
- total # offerings
- breadth of offering? # of subcategories etc.
- min/max/mean of duration of
- seasons
- then per episode
- min/max/mean of ... 
- customer rating
- critical reception

# BY SUBCATEGORY ()
- total # offerings
- breadth of offering? # of subcategories etc.
- min/max/mean of duration of
- seasons
- then per episode
- min/max/mean of ... 
- customer rating
- critical reception