# TP1 Recommender Systems

## Part 1 : Manipulating Data with Pandas Library

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np

In [2]:
small_df  = pd.read_csv('movies_metadata.csv', usecols= ["title","release_date","budget","revenue","runtime","genres"])

In [3]:
small_df.head()

Unnamed: 0,budget,genres,release_date,revenue,runtime,title
0,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,373554033.0,81.0,Toy Story
1,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,262797249.0,104.0,Jumanji
2,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,0.0,101.0,Grumpier Old Men
3,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,81452156.0,127.0,Waiting to Exhale
4,0,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,76578911.0,106.0,Father of the Bride Part II


In [4]:
type(small_df['budget'])

pandas.core.series.Series

In [5]:
small_df['budget'] = pd.to_numeric(small_df['budget'], errors='coerce')

In [6]:
small_df.head()

Unnamed: 0,budget,genres,release_date,revenue,runtime,title
0,30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,373554033.0,81.0,Toy Story
1,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,262797249.0,104.0,Jumanji
2,0.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,0.0,101.0,Grumpier Old Men
3,16000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,81452156.0,127.0,Waiting to Exhale
4,0.0,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,76578911.0,106.0,Father of the Bride Part II


In [7]:
type(small_df['release_date'])

pandas.core.series.Series

In [8]:
small_df['release_date'] = pd.to_datetime(small_df['release_date'], errors='coerce')

In [9]:
small_df['release_date']

0       1995-10-30
1       1995-12-15
2       1995-12-22
3       1995-12-22
4       1995-02-10
           ...    
45461          NaT
45462   2011-11-17
45463   2003-08-01
45464   1917-10-21
45465   2017-06-09
Name: release_date, Length: 45466, dtype: datetime64[ns]

In [10]:
small_df['release_date'].dt.year

0        1995.0
1        1995.0
2        1995.0
3        1995.0
4        1995.0
          ...  
45461       NaN
45462    2011.0
45463    2003.0
45464    1917.0
45465    2017.0
Name: release_date, Length: 45466, dtype: float64

In [11]:
small_df['year'] = small_df['release_date'].dt.year

In [12]:
small_df.head()

Unnamed: 0,budget,genres,release_date,revenue,runtime,title,year
0,30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,373554033.0,81.0,Toy Story,1995.0
1,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,262797249.0,104.0,Jumanji,1995.0
2,0.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,0.0,101.0,Grumpier Old Men,1995.0
3,16000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,81452156.0,127.0,Waiting to Exhale,1995.0
4,0.0,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,76578911.0,106.0,Father of the Bride Part II,1995.0


In [13]:
small_df['year'].min  

<bound method Series.min of 0        1995.0
1        1995.0
2        1995.0
3        1995.0
4        1995.0
          ...  
45461       NaN
45462    2011.0
45463    2003.0
45464    1917.0
45465    2017.0
Name: year, Length: 45466, dtype: float64>

In [14]:
small_df.iloc[0, 4]

81.0

In [15]:
small_df.groupby(["title"])["year"].max().sort_values(ascending=True)[:10]

title
Passage of Venus                 1874.0
Sallie Gardner at a Gallop       1878.0
Buffalo Running                  1883.0
Man Walking Around a Corner      1887.0
Accordion Player                 1888.0
Traffic Crossing Leeds Bridge    1888.0
London's Trafalgar Square        1890.0
Monkeyshines, No. 3              1890.0
Monkeyshines, No. 2              1890.0
Mosquinha                        1890.0
Name: year, dtype: float64

Les films les plus vieux sont Passage of Venus, Sallie Gardner at a Gallop et Man Walking Around a Corner.

In [16]:
small_df.groupby(["title"])["revenue"].max().sort_values(ascending=False)[:10]

title
Avatar                                          2.787965e+09
Star Wars: The Force Awakens                    2.068224e+09
Titanic                                         1.845034e+09
The Avengers                                    1.519558e+09
Jurassic World                                  1.513529e+09
Furious 7                                       1.506249e+09
Avengers: Age of Ultron                         1.405404e+09
Harry Potter and the Deathly Hallows: Part 2    1.342000e+09
Frozen                                          1.274219e+09
Beauty and the Beast                            1.262886e+09
Name: revenue, dtype: float64

Les films qui ont reçu le plus de succès sont Avatar, Star Wars et Titanic.

In [17]:
#On créer un dataframe avec les films présentant un revenu supérieur à 1 milliard
new_df = small_df['revenue'] >= 1000000000
filtered_df = small_df[new_df]
filtered_df

Unnamed: 0,budget,genres,release_date,revenue,runtime,title,year
1639,200000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",1997-11-18,1845034000.0,194.0,Titanic,1997.0
7000,94000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",2003-12-01,1118889000.0,201.0,The Lord of the Rings: The Return of the King,2003.0
11008,200000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",2006-06-20,1065660000.0,151.0,Pirates of the Caribbean: Dead Man's Chest,2006.0
12481,185000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",2008-07-16,1004558000.0,152.0,The Dark Knight,2008.0
14551,237000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",2009-12-10,2787965000.0,162.0,Avatar,2009.0
14892,200000000.0,"[{'id': 10751, 'name': 'Family'}, {'id': 14, '...",2010-03-03,1025491000.0,108.0,Alice in Wonderland,2010.0
15348,200000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",2010-06-16,1066970000.0,103.0,Toy Story 3,2010.0
17124,380000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",2011-05-14,1045714000.0,136.0,Pirates of the Caribbean: On Stranger Tides,2011.0
17293,195000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 878, 'na...",2011-06-28,1123747000.0,154.0,Transformers: Dark of the Moon,2011.0
17437,125000000.0,"[{'id': 10751, 'name': 'Family'}, {'id': 14, '...",2011-07-07,1342000000.0,130.0,Harry Potter and the Deathly Hallows: Part 2,2011.0


In [20]:
#On créer un dataframe avec les films présentant un revenu supérieur à 1 milliard et un budget de moins de 150 million
new_df2 = filtered_df['budget'] <= 150000000
filtered_df2 = filtered_df[new_df2]
filtered_df2

Unnamed: 0,budget,genres,release_date,revenue,runtime,title,year
7000,94000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",2003-12-01,1118889000.0,201.0,The Lord of the Rings: The Return of the King,2003.0
17437,125000000.0,"[{'id': 10751, 'name': 'Family'}, {'id': 14, '...",2011-07-07,1342000000.0,130.0,Harry Potter and the Deathly Hallows: Part 2,2011.0
22110,150000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",2013-11-27,1274219000.0,102.0,Frozen,2013.0
25084,150000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",2015-06-09,1513529000.0,124.0,Jurassic World,2015.0
30700,74000000.0,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",2015-06-17,1156731000.0,91.0,Minions,2015.0
36253,150000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",2016-02-11,1023784000.0,108.0,Zootopia,2016.0
44009,80000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...",2017-06-15,1020063000.0,96.0,Despicable Me 3,2017.0
