In [1]:
import pandas as pd
import os

### data preview

In [2]:
# Root directory holding the project's CSV inputs. The default is the original
# local checkout; set the MGT4187_DATA_DIR environment variable to point the
# notebook at another location without editing this cell.
data_path = os.environ.get(
    'MGT4187_DATA_DIR',
    '/Users/ryan/Documents/GitHub/MGT4187-Project/data',
)

In [3]:
# Switch the working directory to the data folder so the CSV files below
# (and in the following cells) can be opened by bare file name.
os.chdir(data_path)

# The first CSV column is used as the row index.
profit = pd.read_csv('movie_profit.csv', index_col=0)

print(profit.columns)

# count the missing entries in the production budget column
missing_budget = profit['production_budget'].isna().sum()
print('nan value in production budget: ', missing_budget)

# random preview of ten rows
profit.sample(n=10)

Index(['release_date', 'movie', 'production_budget', 'domestic_gross',
       'worldwide_gross', 'distributor', 'mpaa_rating', 'genre'],
      dtype='object')
nan value in production budget:  0


Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross,distributor,mpaa_rating,genre
1637,7/2/1999,Summer of Sam,22000000.0,19288130.0,19288130.0,Walt Disney,R,Drama
3120,2/9/2007,The Last Sin Eater,2200000.0,388390.0,388390.0,20th Century Fox,PG-13,Drama
840,12/22/2017,Pitch Perfect 3,45000000.0,104897530.0,185736412.0,Universal,PG-13,Comedy
1213,11/10/2000,Men of Honor,32000000.0,48814909.0,82339483.0,20th Century Fox,R,Drama
2703,10/21/2011,The Mighty Macs,7000000.0,1891936.0,1891936.0,Quaker Media,G,Drama
957,1/25/2002,The Count of Monte Cristo,40000000.0,54228104.0,75389090.0,Walt Disney,PG-13,Drama
403,7/23/1999,Inspector Gadget,75000000.0,97387965.0,97387965.0,Walt Disney,PG,Adventure
401,6/7/1996,The Rock,75000000.0,134069511.0,336069511.0,Walt Disney,R,Action
2202,3/1/2013,21 and Over,13000000.0,25682380.0,42195766.0,Relativity,R,Comedy
2135,12/20/1989,Born on the Fourth of July,14000000.0,70001698.0,70001698.0,Universal,R,Drama


In [4]:
# Load the Disney+ catalogue; the file is resolved against the current
# working directory.
disney_plus = pd.read_csv('disney_plus_shows.csv')

print(disney_plus.columns)

# random preview of five rows
disney_plus.sample(n=5)

Index(['imdb_id', 'title', 'plot', 'type', 'rated', 'year', 'released_at',
       'added_at', 'runtime', 'genre', 'director', 'writer', 'actors',
       'language', 'country', 'awards', 'metascore', 'imdb_rating',
       'imdb_votes'],
      dtype='object')


Unnamed: 0,imdb_id,title,plot,type,rated,year,released_at,added_at,runtime,genre,director,writer,actors,language,country,awards,metascore,imdb_rating,imdb_votes
952,tt0361089,Valiant,"At the height of World War II, a tiny wood pig...",movie,G,2005.0,19 Aug 2005,"November 12, 2019",76 min,"Animation, Adventure, Comedy, Family, War",Gary Chapman,"George Webster (story), Jordan Katz (screenpla...","Ewan McGregor, Ricky Gervais, Tim Curry, Jim B...",English,"UK, USA",2 nominations.,45.0,5.5,20118.0
761,tt0033356,The Art of Skiing,"Goofy, staying at the Sugar Bowl resort, demon...",movie,Approved,1941.0,14 Nov 1941,"November 12, 2019",8 min,"Animation, Short, Comedy, Family, Sport","Jack Kinney, Bill Roberts",,"George Johnson, John McLeish, Hannes Schroll",English,USA,,,7.3,1061.0
281,,,,,,,,"November 12, 2019",,,,,,,,,,,
298,tt7631348,Freaky Friday,A musical about what happens when a mom and da...,movie,TV-G,2018.0,10 Aug 2018,"November 12, 2019",90 min,"Comedy, Fantasy, Musical",Steve Carr,"Bridget Carpenter (musical book), Bridget Carp...","Cozi Zuehlsdorff, Heidi Blickenstaff, Jason Ma...",English,USA,,,3.7,916.0
354,tt0107120,Hocus Pocus,"A curious youngster moves to Salem, where he s...",movie,PG,1993.0,16 Jul 1993,"November 12, 2019",96 min,"Comedy, Family, Fantasy",Kenny Ortega,"David Kirschner (story), Mick Garris (story), ...","Bette Midler, Sarah Jessica Parker, Kathy Naji...","English, French",USA,2 wins & 11 nominations.,,6.9,89713.0


### match with profit dataset

In [5]:
# The profit table names its title column 'movie'; rename it so both frames
# share the join key 'title'. A new frame is bound instead of mutating in place.
profit = profit.rename(columns={'movie': 'title'})

# Columns carried over from the profit table (join key first).
profit_cols = ['title', 'production_budget', 'domestic_gross',
               'worldwide_gross', 'distributor', 'mpaa_rating']

# Capture the row count BEFORE merging: a left merge repeats a catalogue row
# once per profit row sharing its title, so the post-merge length is not
# guaranteed to equal the original length.
n_original = len(disney_plus)

disney_plus = pd.merge(disney_plus, profit[profit_cols], on='title', how='left')

# check the matched pairs
# production_budget has no missing values in the profit table (checked in the
# preview cell), so a non-null budget after the left merge marks a matched row.
disney_plus['matched_profit_indicator'] = disney_plus['production_budget'].notnull().astype(int)
print(f'the original length: {n_original}; the matched pairs: {disney_plus.matched_profit_indicator.sum()}')


the original length: 992; the matched pairs: 123


In [6]:
# random preview of five rows of the catalogue after the profit merge
disney_plus.sample(n=5)

Unnamed: 0,imdb_id,title,plot,type,rated,year,released_at,added_at,runtime,genre,...,awards,metascore,imdb_rating,imdb_votes,production_budget,domestic_gross,worldwide_gross,distributor,mpaa_rating,matched_profit_indicator
521,tt0120645,Meet the Deedles,Two surfers end up as Yellowstone park rangers...,movie,PG,1998,27 Mar 1998,"November 12, 2019",93 min,"Comedy, Family",...,1 win & 2 nominations.,,4.1,2567.0,24000000.0,4356126.0,4356126.0,Walt Disney,PG,1
313,tt0131409,Geri's Game,Geri sets up a chess game to play his greatest...,movie,,1997,25 Nov 1997,"November 12, 2019",4 min,"Animation, Short, Family",...,Won 1 Oscar. Another 8 wins & 1 nomination.,,7.9,21679.0,,,,,,0
524,,,,,,,,"November 12, 2019",,,...,,,,,,,,,,0
82,,,,,,,,"May 1, 2020",,,...,,,,,,,,,,0
322,tt0103428,Goof Troop,The classic Disney character Goofy is a single...,series,TV-Y,1992–1993,05 Sep 1992,"November 12, 2019",30 min,"Animation, Adventure, Comedy, Family, Music",...,1 win & 2 nominations.,,6.8,7293.0,,,,,,0


### match within the review data

In [7]:
# The review CSVs live in the 'review_data' sub-folder of the data directory.
# Deriving the path from data_path keeps a single place to configure the
# location instead of repeating the absolute path.
review_path = os.path.join(data_path, 'review_data')

# Later cells open the other review files by bare name, so keep the working
# directory pointed at the review folder.
os.chdir(review_path)

audience = pd.read_csv('audience_reviews.csv')
print(audience.columns)

# .unique() keeps a missing show name as its own entry, so it is counted too.
n_reviews = len(audience)
n_shows = len(audience['Show'].unique())
print(f'total length of audience review data: {n_reviews}; unique show: {n_shows}')

# random preview of five rows
audience.sample(n=5)

Index(['Show', 'Rating', 'Review'], dtype='object')
total length of audience review data: 65522; unique show: 563


Unnamed: 0,Show,Rating,Review
45435,Sex/Life,4.0,Ok some of the dialogue is super cheesy but I ...
44774,Squid Game,5.0,I had to binge the whole series after watching...
33587,Manifest,5.0,I love the show so many twists!
24124,Moon Knight,4.5,Currently the best MCU show!
11400,Game of Thrones,1.5,"A notable drop off in terms of storytelling, s..."


In [8]:
# Load the critic reviews; the file is resolved against the current working
# directory.
critic = pd.read_csv('critic_reviews.csv')
print(critic.columns)

# .unique() keeps a missing show name as its own entry, so it is counted too.
n_critic_rows = len(critic)
n_critic_shows = len(critic['Show'].unique())
print(f'total length of critic data: {n_critic_rows}; unique show: {n_critic_shows}')

# random preview of five rows
critic.sample(n=5)

Index(['Show', 'Sentiment', 'Review'], dtype='object')
total length of critic data: 14791; unique show: 495


Unnamed: 0,Show,Sentiment,Review
7844,Dark Winds,1,McClarnon is perfect in the role of a man forc...
13791,Parks and Recreation,1,"Against all odds, though, A Parks and Recreati..."
213,Severance,1,The new Apple TV+ sci-fi series is brilliantly...
6219,Moon Knight,1,Oscar Isaac proves he is a powerhouse as he li...
3102,The Staircase,1,The cast here isn’t just uniformly excellent; ...


In [9]:
# Load the per-show table; the file is resolved against the current working
# directory.
tv_show = pd.read_csv('tv_show_links.csv')
print(tv_show.columns)

n_tv_rows = len(tv_show)
print(f'total length of tv_show data: {n_tv_rows}')

# random preview of five rows
tv_show.sample(n=5)

Index(['Network', 'Show', 'Critic Score', 'Audience Score', 'tv_link'], dtype='object')
total length of tv_show data: 885


Unnamed: 0,Network,Show,Critic Score,Audience Score,tv_link
241,HBO MAX,The Sex Lives of College Girls,97%,76%,https://www.rottentomatoes.com/tv/the_sex_live...
536,Hulu,The Simpsons,85%,75%,https://www.rottentomatoes.com/tv/the_simpsons
379,Disney+,Marvel's 616,100%,69%,https://www.rottentomatoes.com/tv/marvels_616
739,Peacock,Everybody Hates Chris,95%,98%,https://www.rottentomatoes.com/tv/everybody_ha...
762,Peacock,Unsolved Mysteries,77%,69%,https://www.rottentomatoes.com/tv/unsolved_mys...


#### merge based on tv_show dataset

In [10]:
# grouping the review data first: collapse the audience reviews to one row
# per show so they can be joined onto the per-show table.

def _distinct_values(column):
    """Return the distinct entries of one group's column as a list."""
    return list(pd.unique(column))

## audience data
# dropna=False keeps rows with a missing show name as a group of their own.
grouped_audience = audience.groupby('Show', dropna=False)

# number of review rows per show
agg_audience_nob = grouped_audience.size().to_frame(name='nob_audience').reset_index()

# distinct rating values and distinct review texts per show
agg_audience = grouped_audience.agg({
    'Rating': _distinct_values,
    'Review': _distinct_values,
})

# combine the counts with the collected lists (one row per show)
audience_data = agg_audience_nob.merge(agg_audience, on='Show')
audience_data.sample(n=5)

Unnamed: 0,Show,nob_audience,Rating,Review
516,Two and a Half Men,17,"[5.0, 2.0, 2.5, 1.0, 0.5]","[Aunque no es la mejor temporada, se mantiene ..."
203,Little America,36,"[5.0, 2.0, 3.5, 4.0, 1.0, 0.5, 4.5]",[Excellent viewing. Cannot recommend this enou...
493,The White Lotus,206,"[5.0, 3.5, 2.0, 4.5, 3.0, 0.5, 4.0, 1.5, 1.0, ...",[Brilliant idea to weave these characters into...
355,Star Trek: Discovery,471,"[0.5, 1.5, 2.0, 1.0, 3.0, 2.5, 5.0, 3.5, 4.0, ...","[Wow, it just keeps getting worse and worse. ..."
392,The Affair,35,"[4.5, 1.0, 0.5, 2.0, 4.0, 5.0, 3.5]",[I thought this was a nice ending to the show....


In [11]:
## critics data
# One row per show; dropna=False keeps rows with a missing show name as a
# group of their own.
grouped_critics = critic.groupby('Show', dropna=False)

# number of critic review rows per show
agg_critics_nob = grouped_critics.size().rename('nob_critics').reset_index()

# distinct sentiment labels and distinct review texts per show, as lists
critic_aggregators = {
    column: (lambda values: list(pd.unique(values)))
    for column in ('Sentiment', 'Review')
}
agg_critics = grouped_critics.agg(critic_aggregators)

# combine the counts with the collected lists (one row per show)
critics_data = agg_critics_nob.merge(agg_critics, on='Show')
critics_data.sample(n=5)

Unnamed: 0,Show,nob_critics,Sentiment,Review
43,Better Things,16,[1],"[Over five seasons, it never failed to make me..."
442,Titans,20,[1],"[there are enough surprises, fatalities, and c..."
365,The First Lady,43,"[0, 1]",[The plaiting of the three narratives occurs a...
7,8 Simple Rules,1,[1],[It's also a heart-warming and optimistic remi...
40,Behind Her Eyes,37,"[1, 0]","[A well-structured, complex, attractive story...."


In [12]:
# Both aggregates carry a 'Review' column; give each a distinct name before
# joining them onto the per-show table.
audience_data = audience_data.rename(columns={'Review': 'Review_audience'})
tv_show = tv_show.merge(audience_data, how='left', on='Show')

critics_data = critics_data.rename(columns={'Review': 'Review_critics'})
tv_show = tv_show.merge(critics_data, how='left', on='Show')

# After a left merge, a non-null review column means the show was found in
# the corresponding review table.
has_audience = tv_show['Review_audience'].notna()
has_critics = tv_show['Review_critics'].notna()
tv_show['matched_audience_indicator'] = has_audience.astype(int)
tv_show['matched_critics_indicator'] = has_critics.astype(int)

print(f'the matched pairs from audience review: {tv_show.matched_audience_indicator.sum()}')
print(f'the matched pairs from critics: {tv_show.matched_critics_indicator.sum()}')

# 1 only when the show has both audience and critic reviews
tv_show['matched_all_review_indicator'] = (has_audience & has_critics).astype(int)
print(f'the matched results from all review data: {tv_show.matched_all_review_indicator.sum()}')

the matched pairs from audience review: 746
the matched pairs from critics: 673
the matched results from all review data: 657


#### finalize the review dataset

##### attach review information on profit information

In [13]:
# match with profit & production data
# The review table names its key 'Show'; rename it so it can be joined onto
# the catalogue by 'title'.
tv_show = tv_show.rename(columns={'Show': 'title'})

# Left join: every catalogue row is kept, review columns are added where the
# title is found in the review table.
disney_plus = disney_plus.merge(tv_show, how='left', on='title')


In [14]:
# Keep only rows that matched the profit table AND have both audience and
# critic reviews.
has_profit = disney_plus['matched_profit_indicator'].eq(1)
has_all_reviews = disney_plus['matched_all_review_indicator'].eq(1)
valid_data = disney_plus.loc[has_profit & has_all_reviews]
## question: only 7 entries...

##### attach profit information on review information

In [15]:
# match with profit & production data
# Make sure the join key is called 'title' (a no-op when it already is).
tv_show = tv_show.rename(columns={'Show': 'title'})

# By this point disney_plus may already contain the review columns from the
# earlier merge with tv_show. Merging the full frame back would suffix every
# shared column with _x/_y, and the 'matched_all_review_indicator' lookup
# below would raise KeyError. Bring over only the columns tv_show lacks.
catalogue_cols = ['title'] + [
    col for col in disney_plus.columns if col not in tv_show.columns
]

# The earlier merge repeats a catalogue row once per matching review row;
# restricted to the catalogue-only columns those repeats are exact duplicates,
# so drop them to avoid multiplying rows in the join.
catalogue = disney_plus[catalogue_cols].drop_duplicates()

tv_show = pd.merge(tv_show, catalogue, on='title', how='left')

# Keep only rows that matched the profit table AND have both audience and
# critic reviews (unmatched rows hold NaN in the indicator and compare False).
has_profit = tv_show['matched_profit_indicator'] == 1
has_all_reviews = tv_show['matched_all_review_indicator'] == 1
valid_data = tv_show.loc[has_profit & has_all_reviews]
print('total length of valid data: ', len(valid_data))

total length of valid data:  7
