In [86]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import itertools

## Загрузка DF и общая информация о нем

In [3]:
data = pd.read_csv('movie_bd_v5.csv', encoding = "utf-8")
data.head(10)
# trying to find right encoding
data.director = data.director.str.encode('latin1').str.decode('utf8') # worked for director column
data.original_title = data.original_title.str.encode('utf8').str.decode('utf8')

In [4]:
data.head(5)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year
0,tt0369610,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,The park is open.,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,6.5,2015
1,tt1392190,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,What a Lovely Day.,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,7.1,2015
2,tt2908446,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,One Choice Can Destroy You,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/2015,6.3,2015
3,tt2488496,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,Every generation has a story.,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/2015,7.5,2015
4,tt2820852,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,Vengeance Hits Home,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/2015,7.3,2015


In [5]:
data.describe()

Unnamed: 0,budget,revenue,runtime,vote_average,release_year
count,1889.0,1889.0,1889.0,1889.0,1889.0
mean,54310830.0,155365300.0,109.658549,6.140762,2007.860773
std,48587210.0,214669800.0,18.017041,0.764763,4.468841
min,5000000.0,2033165.0,63.0,3.3,2000.0
25%,20000000.0,34560580.0,97.0,5.6,2004.0
50%,38000000.0,83615410.0,107.0,6.1,2008.0
75%,72000000.0,178262600.0,120.0,6.6,2012.0
max,380000000.0,2781506000.0,214.0,8.1,2015.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1889 entries, 0 to 1888
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   imdb_id               1889 non-null   object 
 1   budget                1889 non-null   int64  
 2   revenue               1889 non-null   int64  
 3   original_title        1889 non-null   object 
 4   cast                  1889 non-null   object 
 5   director              1889 non-null   object 
 6   tagline               1889 non-null   object 
 7   overview              1889 non-null   object 
 8   runtime               1889 non-null   int64  
 9   genres                1889 non-null   object 
 10  production_companies  1889 non-null   object 
 11  release_date          1889 non-null   object 
 12  vote_average          1889 non-null   float64
 13  release_year          1889 non-null   int64  
dtypes: float64(1), int64(4), object(9)
memory usage: 206.7+ KB


In [7]:
data.columns

Index(['imdb_id', 'budget', 'revenue', 'original_title', 'cast', 'director',
       'tagline', 'overview', 'runtime', 'genres', 'production_companies',
       'release_date', 'vote_average', 'release_year'],
      dtype='object')

# Предобработка

In [8]:
answers = {} # создадим словарь для ответов

# тут другие ваши предобработки колонок например:

#the time given in the dataset is in string format.
#So we need to change this in datetime format
# ...

### Переводим дату в datetime формат

In [9]:
data["release_date"] = pd.to_datetime(data["release_date"])

### Добавляем колонке month

In [10]:
data["month"] = data["release_date"].dt.month

### Добавляем колонку profit = revenue - budget

In [11]:
data["profit"] = data["revenue"] - data["budget"]

### Уберем ненужные колонки

In [12]:
data_new = data.drop(columns=["imdb_id", "tagline"])

### Извлечем информацию о жанрах для каждой картины

In [13]:
# сперва строки в колонке genres преобразуем в списки
data_new.genres = data_new.genres.apply(lambda x: x.split("|"))
# альтернативный метод без использования функции: метод assign
# data_new = data_new.assign(genres=data_new.genres.str.split('|'))

# применим метод explode, чтобы каждой строке соответствовал только один жанр
data_gen = data_new.explode("genres", ignore_index=True)

In [14]:
# смотрим, что получилось
data_gen.head(6)

Unnamed: 0,budget,revenue,original_title,cast,director,overview,runtime,genres,production_companies,release_date,vote_average,release_year,month,profit
0,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,Twenty-two years after the events of Jurassic ...,124,Action,Universal Studios|Amblin Entertainment|Legenda...,2015-06-09,6.5,2015,6,1363528810
1,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,Twenty-two years after the events of Jurassic ...,124,Adventure,Universal Studios|Amblin Entertainment|Legenda...,2015-06-09,6.5,2015,6,1363528810
2,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,Twenty-two years after the events of Jurassic ...,124,Science Fiction,Universal Studios|Amblin Entertainment|Legenda...,2015-06-09,6.5,2015,6,1363528810
3,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,Twenty-two years after the events of Jurassic ...,124,Thriller,Universal Studios|Amblin Entertainment|Legenda...,2015-06-09,6.5,2015,6,1363528810
4,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,An apocalyptic story set in the furthest reach...,120,Action,Village Roadshow Pictures|Kennedy Miller Produ...,2015-05-13,7.1,2015,5,228436354
5,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,An apocalyptic story set in the furthest reach...,120,Adventure,Village Roadshow Pictures|Kennedy Miller Produ...,2015-05-13,7.1,2015,5,228436354


### Извлечем информацию об режиссерах для каждой картины

In [15]:
data_new = data_new.assign(director=data_new.director.str.split('|'))

In [16]:
data_dir = data_new.explode("director", ignore_index=True)

### ... о студиях

In [17]:
data_new = data_new.assign(production_companies=data_new.production_companies.str.split('|'))
data_comp = data_new.explode("production_companies", ignore_index=True)

### ... и об актерах

In [18]:
data_new = data_new.assign(cast=data_new.cast.str.split('|'))
data_cast = data_new.explode("cast", ignore_index=True)

### Для вопроса 14: Оставим только фильмы с жанром Action

In [19]:
data_dir.genres = data_dir.genres.apply(lambda x: "|".join(x))

In [20]:
data_dir_action = data_dir[data_dir.genres.str.contains("Action")]

### Для вопроса 17: Оставим только фильмы, где снимался N.Cage

In [21]:
data_gen_cage = data_gen[data_gen.cast.str.contains("Nicolas Cage")]

In [22]:
data_gen_cage.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93 entries, 455 to 4973
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   budget                93 non-null     int64         
 1   revenue               93 non-null     int64         
 2   original_title        93 non-null     object        
 3   cast                  93 non-null     object        
 4   director              93 non-null     object        
 5   overview              93 non-null     object        
 6   runtime               93 non-null     int64         
 7   genres                93 non-null     object        
 8   production_companies  93 non-null     object        
 9   release_date          93 non-null     datetime64[ns]
 10  vote_average          93 non-null     float64       
 11  release_year          93 non-null     int64         
 12  month                 93 non-null     int64         
 13  profit            

### Для вопроса 24: Добавим столбец для количества символов в названии фильма

In [23]:
data_comp["title_len"] = data_comp.original_title.apply(lambda x: len(x))

### Для вопроса 25: Добавим столбец для количества слов в описании фильма

In [24]:
data_comp["overview_len"] = data_comp.overview.apply(lambda x: len(x.split()))

In [25]:
data_comp.head()

Unnamed: 0,budget,revenue,original_title,cast,director,overview,runtime,genres,production_companies,release_date,vote_average,release_year,month,profit,title_len,overview_len
0,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,[Colin Trevorrow],Twenty-two years after the events of Jurassic ...,124,"[Action, Adventure, Science Fiction, Thriller]",Universal Studios,2015-06-09,6.5,2015,6,1363528810,14,26
1,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,[Colin Trevorrow],Twenty-two years after the events of Jurassic ...,124,"[Action, Adventure, Science Fiction, Thriller]",Amblin Entertainment,2015-06-09,6.5,2015,6,1363528810,14,26
2,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,[Colin Trevorrow],Twenty-two years after the events of Jurassic ...,124,"[Action, Adventure, Science Fiction, Thriller]",Legendary Pictures,2015-06-09,6.5,2015,6,1363528810,14,26
3,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,[Colin Trevorrow],Twenty-two years after the events of Jurassic ...,124,"[Action, Adventure, Science Fiction, Thriller]",Fuji Television Network,2015-06-09,6.5,2015,6,1363528810,14,26
4,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,[Colin Trevorrow],Twenty-two years after the events of Jurassic ...,124,"[Action, Adventure, Science Fiction, Thriller]",Dentsu,2015-06-09,6.5,2015,6,1363528810,14,26


### Для вопроса 27

In [88]:
# Будем работать с серией data["cast"]
data_cast_ser = data.cast

In [91]:
# Преобразуем строки в множества
data_cast_ser = data_cast_ser.apply(lambda x: set(x.split("|")))

In [93]:
# С помощью функции combinations из библиотеки itertools сравним множества и найдем их пересечения и поместим их в список
lst = []
for set1, set2 in itertools.combinations(data_cast_ser, 2):
    lst.append(set1.intersection(set2))
    
    

In [95]:
# Создадим новую серию из полученного списка
data_cast_ser_int = pd.Series(lst)

# Почистим новую серию: уберем пустые множества, а также множества с одним элементом
data_cast_ser_int = data_cast_ser_int.apply(lambda x: x if len(x) > 1 else np.NaN).dropna()



In [97]:
# В новой серии остались множества с двумя и более актерами, снимавшихся в двух и более фильмах вместе
data_cast_ser_int.head(10)

3903     {Kate Winslet, Shailene Woodley, Miles Teller,...
3949                      {Ansel Elgort, Shailene Woodley}
7879         {Michelle Rodriguez, Vin Diesel, Paul Walker}
8117         {Michelle Rodriguez, Vin Diesel, Paul Walker}
8392             {Vin Diesel, Paul Walker, Dwayne Johnson}
8838             {Vin Diesel, Paul Walker, Dwayne Johnson}
9794                        {Leonardo DiCaprio, Tom Hardy}
19798                        {Ralph Fiennes, Daniel Craig}
24521                          {Adam Sandler, Kevin James}
25475                          {Adam Sandler, Kevin James}
dtype: object

# 1. У какого фильма из списка самый большой бюджет?

Использовать варианты ответов в коде решения запрещено.    
Вы думаете и в жизни у вас будут варианты ответов?)

In [26]:
# в словарь вставляем номер вопроса и ваш ответ на него
# Пример: 
# answers['1'] = '2. Spider-Man 3 (tt0413300)'
# запишите свой вариант ответа
answers['1'] = 'Pirates of the Caribbean: On Stranger Tides'
# если ответили верно, можете добавить комментарий со значком "+"

In [27]:
data_new[data_new.budget == data_new.budget.max()]

Unnamed: 0,budget,revenue,original_title,cast,director,overview,runtime,genres,production_companies,release_date,vote_average,release_year,month,profit
723,380000000,1021683000,Pirates of the Caribbean: On Stranger Tides,"[Johnny Depp, PenÃ©lope Cruz, Geoffrey Rush, I...",[Rob Marshall],Captain Jack Sparrow crosses paths with a woma...,136,"[Adventure, Action, Fantasy]","[Walt Disney Pictures, Jerry Bruckheimer Films...",2011-05-11,6.3,2011,5,641683000


##### ВАРИАНТ 2

In [28]:
data_new.sort_values("budget", ascending=False).head(3)

Unnamed: 0,budget,revenue,original_title,cast,director,overview,runtime,genres,production_companies,release_date,vote_average,release_year,month,profit
723,380000000,1021683000,Pirates of the Caribbean: On Stranger Tides,"[Johnny Depp, PenÃ©lope Cruz, Geoffrey Rush, I...",[Rob Marshall],Captain Jack Sparrow crosses paths with a woma...,136,"[Adventure, Action, Fantasy]","[Walt Disney Pictures, Jerry Bruckheimer Films...",2011-05-11,6.3,2011,5,641683000
1669,300000000,961000000,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski],"Captain Barbossa, long believed to be dead, ha...",169,"[Adventure, Fantasy, Action]","[Walt Disney Pictures, Jerry Bruckheimer Films...",2007-05-19,6.8,2007,5,661000000
14,280000000,1405035767,Avengers: Age of Ultron,"[Robert Downey Jr., Chris Hemsworth, Mark Ruff...",[Joss Whedon],When Tony Stark tries to jumpstart a dormant p...,141,"[Action, Adventure, Science Fiction]","[Marvel Studios, Prime Focus, Revolution Sun S...",2015-04-22,7.4,2015,4,1125035767


# 2. Какой из фильмов самый длительный (в минутах)?

In [29]:
# думаю логику работы с этим словарем вы уже поняли, 
# по этому не буду больше его дублировать
answers['2'] = 'Gods and Generals'

In [30]:
data_new.sort_values("runtime", ascending=False).head(3)

Unnamed: 0,budget,revenue,original_title,cast,director,overview,runtime,genres,production_companies,release_date,vote_average,release_year,month,profit
1157,56000000,12923936,Gods and Generals,"[Stephen Lang, Jeff Daniels, Robert Duvall, Ke...",[Ronald F. Maxwell],The film centers mostly around the personal an...,214,"[Drama, History, War]","[Turner Pictures, Antietam Filmworks]",2003-02-21,5.8,2003,2,-43076064
1081,94000000,1118888979,The Lord of the Rings: The Return of the King,"[Elijah Wood, Ian McKellen, Viggo Mortensen, L...",[Peter Jackson],Aragorn is revealed as the heir to the ancient...,201,"[Adventure, Fantasy, Action]","[WingNut Films, New Line Cinema]",2003-12-01,7.9,2003,12,1024888979
1736,67000000,25037897,Grindhouse,"[Kurt Russell, ZoÃ« Bell, Rosario Dawson, Vane...","[Robert Rodriguez, Eli Roth, Quentin Tarantino...",Two full length feature horror movies written ...,191,"[Thriller, Action, Horror]","[Big Talk Productions, Yer Dead Productions, W...",2007-04-06,6.5,2007,4,-41962103


# 3. Какой из фильмов самый короткий (в минутах)?





In [31]:
answers['3'] = 'Winnie the Pooh'

In [32]:
data_new[data_new.runtime == data_new.runtime.min()]

Unnamed: 0,budget,revenue,original_title,cast,director,overview,runtime,genres,production_companies,release_date,vote_average,release_year,month,profit
768,30000000,14460000,Winnie the Pooh,"[Jim Cummings, Travis Oates, Jim Cummings, Bud...","[Stephen Anderson, Don Hall]","During an ordinary day in Hundred Acre Wood, W...",63,"[Animation, Family]","[Walt Disney Pictures, Walt Disney Animation S...",2011-04-13,6.8,2011,4,-15540000


# 4. Какова средняя длительность фильмов?


In [33]:
answers['4'] = '110'

In [34]:
round(data_new.runtime.mean())

110

# 5. Каково медианное значение длительности фильмов? 

In [35]:
answers['5'] = '107'

In [36]:
data_new.runtime.median()

107.0

# 6. Какой самый прибыльный фильм?
#### Внимание! Здесь и далее под «прибылью» или «убытками» понимается разность между сборами и бюджетом фильма. (прибыль = сборы - бюджет) в нашем датасете это будет (profit = revenue - budget) 

In [37]:
answers['6'] = 'Avatar'

In [38]:
data_new.sort_values("profit", ascending = False).head(3)

Unnamed: 0,budget,revenue,original_title,cast,director,overview,runtime,genres,production_companies,release_date,vote_average,release_year,month,profit
239,237000000,2781505847,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron],"In the 22nd century, a paraplegic Marine is di...",162,"[Action, Adventure, Fantasy, Science Fiction]","[Ingenious Film Partners, Twentieth Century Fo...",2009-12-10,7.1,2009,12,2544505847
3,200000000,2068178225,Star Wars: The Force Awakens,"[Harrison Ford, Mark Hamill, Carrie Fisher, Ad...",[J.J. Abrams],Thirty years after defeating the Galactic Empi...,136,"[Action, Adventure, Science Fiction, Fantasy]","[Lucasfilm, Truenorth Productions, Bad Robot]",2015-12-15,7.5,2015,12,1868178225
0,150000000,1513528810,Jurassic World,"[Chris Pratt, Bryce Dallas Howard, Irrfan Khan...",[Colin Trevorrow],Twenty-two years after the events of Jurassic ...,124,"[Action, Adventure, Science Fiction, Thriller]","[Universal Studios, Amblin Entertainment, Lege...",2015-06-09,6.5,2015,6,1363528810


# 7. Какой фильм самый убыточный? 

In [39]:
answers['7'] = 'The Lone Ranger'

In [40]:
data_new[data_new.profit == data_new.profit.min()]

Unnamed: 0,budget,revenue,original_title,cast,director,overview,runtime,genres,production_companies,release_date,vote_average,release_year,month,profit
1245,255000000,89289910,The Lone Ranger,"[Johnny Depp, Armie Hammer, William Fichtner, ...",[Gore Verbinski],The Texas Rangers chase down a gang of outlaws...,149,"[Action, Adventure, Western]","[Walt Disney Pictures, Jerry Bruckheimer Films...",2013-07-03,6.0,2013,7,-165710090


# 8. У скольких фильмов из датасета объем сборов оказался выше бюджета?

In [41]:
answers['8'] = '1478'

In [42]:
data_new[data_new.revenue > data_new.budget].original_title.count()

1478

##### ВАРИАНТ 2

In [43]:
data_new.query("revenue > budget").original_title.count()

1478

# 9. Какой фильм оказался самым кассовым в 2008 году?

In [44]:
answers['9'] = 'The Dark Knight'

In [45]:
data_new[data_new.release_year == 2008].sort_values("revenue", ascending=False).head(3)

Unnamed: 0,budget,revenue,original_title,cast,director,overview,runtime,genres,production_companies,release_date,vote_average,release_year,month,profit
599,185000000,1001921825,The Dark Knight,"[Christian Bale, Michael Caine, Heath Ledger, ...",[Christopher Nolan],Batman raises the stakes in his war on crime. ...,152,"[Drama, Action, Crime, Thriller]","[DC Comics, Legendary Pictures, Warner Bros., ...",2008-07-16,8.1,2008,7,816921825
603,185000000,786636033,Indiana Jones and the Kingdom of the Crystal S...,"[Harrison Ford, Cate Blanchett, Shia LaBeouf, ...",[Steven Spielberg],"Set during the Cold War, the Soviets â€“ led b...",122,"[Adventure, Action]","[Lucasfilm, Paramount Pictures]",2008-05-21,5.6,2008,5,601636033
606,130000000,631744560,Kung Fu Panda,"[Jack Black, Dustin Hoffman, Angelina Jolie, J...","[Mark Osborne, John Stevenson]","When the Valley of Peace is threatened, lazy P...",90,"[Animation, Family]",[DreamWorks Animation],2008-06-04,6.8,2008,6,501744560


# 10. Самый убыточный фильм за период с 2012 по 2014 г. (включительно)?


In [46]:
answers['10'] = 'The Lone Ranger'

In [47]:
data_new.query("2012 <= release_year <= 2014").sort_values("profit").head(3)

Unnamed: 0,budget,revenue,original_title,cast,director,overview,runtime,genres,production_companies,release_date,vote_average,release_year,month,profit
1245,255000000,89289910,The Lone Ranger,"[Johnny Depp, Armie Hammer, William Fichtner, ...",[Gore Verbinski],The Texas Rangers chase down a gang of outlaws...,149,"[Action, Adventure, Western]","[Walt Disney Pictures, Jerry Bruckheimer Films...",2013-07-03,6.0,2013,7,-165710090
1214,130000000,61648500,R.I.P.D.,"[Jeff Bridges, Ryan Reynolds, Kevin Bacon, Ste...",[Robert Schwentke],A recently slain cop joins a team of undead po...,96,"[Fantasy, Action, Comedy, Crime]","[Universal Pictures, Original Film, Dark Horse...",2013-07-18,5.3,2013,7,-68351500
1007,60000000,8106475,Upside Down,"[Kirsten Dunst, Jim Sturgess, Timothy Spall, J...",[Juan Diego Solanas],In an alternate universe where twinned worlds ...,104,"[Romance, Science Fiction, Drama, Fantasy]","[Onyx Films, Studio 37, Jouror Productions, Fr...",2012-09-27,6.1,2012,9,-51893525


In [48]:
# data_new.query("2012 <= release_year <= 2014").sort_values("profit").head(3)

# 11. Какого жанра фильмов больше всего?

In [49]:
answers['11'] = 'Drama'

In [50]:
data_gen.genres.value_counts()

Drama              782
Comedy             683
Thriller           596
Action             582
Adventure          415
Crime              315
Romance            308
Family             260
Science Fiction    248
Fantasy            222
Horror             176
Mystery            168
Animation          139
Music               64
History             62
War                 58
Western             19
Documentary          8
Foreign              2
Name: genres, dtype: int64

ВАРИАНТ 2

# 12. Фильмы какого жанра чаще всего становятся прибыльными? 

In [51]:
answers['12'] = 'Drama'

In [52]:
data_gen[data_gen.profit > 0].genres.value_counts()

Drama              560
Comedy             551
Thriller           446
Action             444
Adventure          337
Romance            242
Crime              231
Family             226
Science Fiction    195
Fantasy            188
Horror             150
Animation          120
Mystery            119
Music               47
History             46
War                 41
Western             12
Documentary          7
Name: genres, dtype: int64

# 13. У какого режиссера самые большие суммарные кассовые сбооры?

In [53]:
answers['13'] = 'Peter Jackson'

In [54]:
data_dir.groupby("director")["revenue"].sum().sort_values(ascending=False).head(3)

director
Peter Jackson        6490593685
Christopher Nolan    4167548502
David Yates          4154295625
Name: revenue, dtype: int64

# 14. Какой режисер снял больше всего фильмов в стиле Action?

In [55]:
answers['14'] = 'Robert Rodriguez'

In [56]:
data_dir_action.director.value_counts().head()

Robert Rodriguez      9
Paul W.S. Anderson    7
Michael Bay           7
Ridley Scott          6
Antoine Fuqua         6
Name: director, dtype: int64

# 15. Фильмы с каким актером принесли самые высокие кассовые сборы в 2012 году? 

In [57]:
answers['15'] = 'Chris Hemsworth'

In [58]:
data_cast[data_cast.release_year == 2012].groupby("cast")["revenue"].sum().sort_values(ascending=False).head()

cast
Chris Hemsworth      2027450773
Denis Leary          1629460639
Anne Hathaway        1522851057
Chris Evans          1519557910
Robert Downey Jr.    1519557910
Name: revenue, dtype: int64

# 16. Какой актер снялся в большем количестве высокобюджетных фильмов?

In [59]:
answers['16'] = '_____'

In [83]:
# высокобюджетный: revenue > revenue.mean()
data_cast[data_cast.budget > data_cast.budget.mean()].groupby("cast")["original_title"].count().sort_values(ascending=False).head()

cast
Matt Damon           18
Adam Sandler         17
Angelina Jolie       16
Eddie Murphy         15
Samuel L. Jackson    15
Name: original_title, dtype: int64

In [61]:
data_cast[(data_cast.revenue > data_cast.revenue.mean()) & (data_cast.cast.str"Tom Cruise")].original_title.count()

14

# 17. В фильмах какого жанра больше всего снимался Nicolas Cage? 

In [299]:
answers['17'] = 'Action'

In [300]:
data_gen_cage.genres.value_counts()

Action             17
Thriller           15
Drama              12
Crime              10
Fantasy             8
Adventure           7
Comedy              6
Science Fiction     4
Mystery             3
Animation           3
Family              3
History             2
Horror              1
Romance             1
War                 1
Name: genres, dtype: int64

# 18. Самый убыточный фильм от Paramount Pictures

In [301]:
answers['18'] = 'K-19: The Widowmaker'

In [302]:
data_comp[data_comp.production_companies == "Paramount Pictures"].groupby("original_title")["profit"].sum().sort_values()

original_title
K-19: The Widowmaker                                  -64831034
Timeline                                              -60519261
Next                                                  -51788987
Alfie                                                 -46604061
Twisted                                               -24805000
                                                        ...    
Mission: Impossible - Ghost Protocol                  549713380
Transformers                                          559709780
Indiana Jones and the Kingdom of the Crystal Skull    601636033
Transformers: Revenge of the Fallen                   686297228
Transformers: Dark of the Moon                        928746996
Name: profit, Length: 122, dtype: int64

# 19. Какой год стал самым успешным по суммарным кассовым сборам?

In [303]:
answers['19'] = '2015'

In [304]:
data_new.groupby("release_year")["revenue"].sum().sort_values(ascending=False).head()

release_year
2015    25449202382
2014    23405862953
2013    23213799791
2012    23079001687
2011    22676791872
Name: revenue, dtype: int64

# 20. Какой самый прибыльный год для студии Warner Bros?

In [305]:
answers['20'] = '2014'

In [82]:
data_comp[data_comp.production_companies.str.contains("Warner Bros.")].groupby("release_year")["profit"].sum().sort_values(ascending=False).head()

release_year
2014    2292949646
2007    2201675217
2008    2134595031
2010    1974712985
2011    1871393682
Name: profit, dtype: int64

# 21. В каком месяце за все годы суммарно вышло больше всего фильмов?

In [307]:
answers['21'] = 'Сентябрь'

In [308]:
data_new.month.value_counts()

9     227
12    190
10    186
8     161
3     156
4     149
6     147
11    146
7     142
5     140
2     135
1     110
Name: month, dtype: int64

##### ВАРИАНТ 2

In [309]:
data_new.groupby("month")["original_title"].count().sort_values(ascending=False)

month
9     227
12    190
10    186
8     161
3     156
4     149
6     147
11    146
7     142
5     140
2     135
1     110
Name: original_title, dtype: int64

# 22. Сколько суммарно вышло фильмов летом? (за июнь, июль, август)

In [310]:
answers['22'] = '450'

In [311]:
data_new[data_new.month.isin([6, 7, 8])].original_title.count()

450

# 23. Для какого режиссера зима – самое продуктивное время года? 

In [313]:
answers['23'] = 'Aaron Seltzer'

In [85]:
data_dir[data_dir.month.isin([12, 1, 2])].groupby("director")["original_title"].count().sort_values(ascending=False)[:4]

director
Peter Jackson        7
Steven Soderbergh    6
Clint Eastwood       6
Martin Scorsese      4
Name: original_title, dtype: int64

# 24. Какая студия дает самые длинные названия своим фильмам по количеству символов?

In [0]:
answers['24'] = 'Four By Two Productions'

In [361]:
data_comp.groupby("production_companies")["title_len"].mean().sort_values(ascending=False)

production_companies
Four By Two Productions       83.0
Jim Henson Company, The       59.0
Dos Corazones                 47.0
Museum Canada Productions     46.0
Polsky Films                  46.0
                              ... 
Everest Entertainment          3.0
Berlanti Productions           3.0
XM2 Productions                2.0
Ixtlan Productions             2.0
Global Entertainment Group     2.0
Name: title_len, Length: 1771, dtype: float64

# 25. Описание фильмов какой студии в среднем самые длинные по количеству слов?

In [0]:
answers['25'] = 'Midnight Picture Show'

In [371]:
data_comp.groupby("production_companies")["overview_len"].mean().sort_values(ascending=False)

production_companies
Midnight Picture Show                    175.0
Room 9 Entertainment                     161.0
98 MPH Productions                       159.0
Heineken Branded Entertainment           159.0
Brookwell-McNamara Entertainment         156.0
                                         ...  
London Boulevard                          13.0
Phantom Four                              13.0
Henceforth                                13.0
Empire Pictures                           11.0
Motion Picture Corporation of America     11.0
Name: overview_len, Length: 1771, dtype: float64

# 26. Какие фильмы входят в 1 процент лучших по рейтингу? 
по vote_average

In [None]:
answers['26'] = 'Inside Out, The Dark Knight, 12 Years a Slave'

In [328]:
list(data_new.sort_values("vote_average", ascending=False).head(int(0.01*len(data_new)))["original_title"])

['The Dark Knight',
 'Interstellar',
 'The Imitation Game',
 'Inside Out',
 'Room',
 'The Wolf of Wall Street',
 'Gone Girl',
 '12 Years a Slave',
 'Guardians of the Galaxy',
 'The Lord of the Rings: The Return of the King',
 'Memento',
 'Inception',
 'The Pianist',
 'The Grand Budapest Hotel',
 'Her',
 'Spotlight',
 'Big Hero 6',
 'The Fault in Our Stars']

# 27. Какие актеры чаще всего снимаются в одном фильме вместе?


In [105]:
answers['27'] = 'Daniel Radcliffe & Rupert Grint'

In [98]:
data_cast_ser_int.value_counts(ascending=False)

TypeError: unhashable type: 'set'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'


{Rupert Grint, Emma Watson, Daniel Radcliffe}                                             25
{Helena Bonham Carter, Johnny Depp}                                                       15
{Ben Stiller, Owen Wilson}                                                                13
{Adam Sandler, Kevin James}                                                                8
{Julia Roberts, Brad Pitt}                                                                 5
{Robert Pattinson, Kristen Stewart, Peter Facinelli, Taylor Lautner}                       5
{Chris Evans, Scarlett Johansson}                                                          5
{Matt Damon, George Clooney}                                                               5
{Paul Rudd, Steve Carell}                                                                  5
{Ian McKellen, Hugh Jackman}                                                               4
{Brad Pitt, George Clooney}                                           

In [99]:
# Чтобы jupyter не ругался, можно перевести множества в строки
data_cast_ser_int_str = data_cast_ser_int.apply(lambda x: "|".join(list(x))).value_counts(ascending=False)

In [100]:
data_cast_ser_int_str.head()

Rupert Grint|Emma Watson|Daniel Radcliffe    25
Helena Bonham Carter|Johnny Depp             15
Ben Stiller|Owen Wilson                      13
Adam Sandler|Kevin James                      8
Paul Rudd|Steve Carell                        5
dtype: int64

#### Примечание к вопросу 27

In [104]:
# Проверим, в скольких фильмах снимались Daniel Radcliffe и Rupert Grint вместе
data_radcliffe_grint = data[(data.cast.str.contains("Daniel Radcliffe")) & (data.cast.str.contains("Rupert Grint"))]
data_radcliffe_grint

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,month,profit
242,tt0417741,250000000,933959197,Harry Potter and the Half-Blood Prince,Daniel Radcliffe|Rupert Grint|Emma Watson|Tom ...,David Yates,Dark Secrets Revealed,"As Harry begins his sixth year at Hogwarts, he...",153,Adventure|Fantasy|Family,Warner Bros.|Heyday Films,2009-07-07,7.3,2009,7,683959197
374,tt0926084,250000000,954305868,Harry Potter and the Deathly Hallows: Part 1,Daniel Radcliffe|Emma Watson|Rupert Grint|Ralp...,David Yates,One Wayâ€¦ One Fateâ€¦ One Hero.,"Harry, Ron and Hermione walk away from their l...",146,Adventure|Fantasy|Family,Warner Bros.|Heyday Films,2010-10-17,7.4,2010,10,704305868
497,tt0241527,125000000,976475550,Harry Potter and the Philosopher's Stone,Daniel Radcliffe|Rupert Grint|Emma Watson|John...,Chris Columbus,Let the Magic Begin.,Harry Potter has lived under the stairs at his...,152,Adventure|Fantasy|Family,1492 Pictures|Warner Bros.|Heyday Films,2001-11-16,7.2,2001,11,851475550
722,tt1201607,125000000,1327817822,Harry Potter and the Deathly Hallows: Part 2,Daniel Radcliffe|Rupert Grint|Emma Watson|Alan...,David Yates,It all ends here.,"Harry, Ron and Hermione continue their quest t...",130,Adventure|Family|Fantasy,Warner Bros.|Heyday Films|Moving Picture Compa...,2011-07-07,7.7,2011,7,1202817822
864,tt0295297,100000000,876688482,Harry Potter and the Chamber of Secrets,Daniel Radcliffe|Rupert Grint|Emma Watson|Kenn...,Chris Columbus,Hogwarts is back in session.,"Ignoring threats to his life, Harry returns to...",161,Adventure|Fantasy|Family,1492 Pictures|Warner Bros.|Heyday Films|MIRACL...,2002-11-13,7.2,2002,11,776688482
1312,tt0330373,150000000,895921036,Harry Potter and the Goblet of Fire,Daniel Radcliffe|Rupert Grint|Emma Watson|Ralp...,Mike Newell,Dark And Difficult Times Lie Ahead.,"Harry starts his fourth year at Hogwarts, comp...",157,Adventure|Fantasy|Family,Patalex IV Productions Limited|Warner Bros.|He...,2005-11-05,7.3,2005,11,745921036
1560,tt0304141,130000000,789804554,Harry Potter and the Prisoner of Azkaban,Daniel Radcliffe|Rupert Grint|Emma Watson|Gary...,Alfonso Cuarón,Something wicked this way comes.,"Harry, Ron and Hermione return to Hogwarts for...",141,Adventure|Fantasy|Family,1492 Pictures|Warner Bros.|Heyday Films|P of A...,2004-05-31,7.4,2004,5,659804554
1670,tt0373889,150000000,938212738,Harry Potter and the Order of the Phoenix,Daniel Radcliffe|Rupert Grint|Emma Watson|Mich...,David Yates,Evil Must Be Confronted.,Returning for his fifth year of study at Hogwa...,138,Adventure|Fantasy|Family|Mystery,Cool Music|Warner Bros.|Heyday Films|Harry Pot...,2007-06-28,7.2,2007,6,788212738


In [None]:
# Всего в восьми экранизациях романов Дж.Роулинг о Гарри Поттере.
# Тогда откуда число 25 в ниших результатах? 
# Функция "combinations('ABCD', 2)" возвращает все парные, неповторяющиеся комбинации для заданных элеметов
# В случае с n=8 элементов, должно быть 8! / 2! / (8-2)! = 28 комбинаций.
# Мне не удалось выяснить, куда исчезли три комбинации из моих результатов...
# К счастью, это не повлияло на правильность ответа.

ВАРИАНТ 2

# Submission

In [363]:
# в конце можно посмотреть свои ответы к каждому вопросу
answers

{}

In [0]:
# и убедиться что ни чего не пропустил)
len(answers)