# Projet

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pickle as pkl
import re

In [2]:
# Importation de la librairie iads
import iads as iads

# importation de LabeledSet
from iads import LabeledSet as ls

# importation de Classifiers
from iads import Classifiers as cl

# importation de utils
from iads import utils as ut

## Import des bases de données CSV

In [3]:
path = "data/"

movies_pd = pd.read_csv(path + "movies.csv", index_col = 0)
movies_pd.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [4]:
genres = ["Action", "Adventure", "Animation", "Children", "Comedy", \
          "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", \
          "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", \
          "Western", "(no genres listed)"]

for g in genres:
    movies_pd[g.lower()] = movies_pd.apply(lambda r: 2*int(g in r["genres"]) - 1, axis=1)

In [5]:
movies_pd.head()

Unnamed: 0_level_0,title,genres,action,adventure,animation,children,comedy,crime,documentary,drama,...,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,-1,1,1,1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,Jumanji (1995),Adventure|Children|Fantasy,-1,1,-1,1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,Grumpier Old Men (1995),Comedy|Romance,-1,-1,-1,-1,1,-1,-1,-1,...,-1,-1,-1,-1,1,-1,-1,-1,-1,-1
4,Waiting to Exhale (1995),Comedy|Drama|Romance,-1,-1,-1,-1,1,-1,-1,1,...,-1,-1,-1,-1,1,-1,-1,-1,-1,-1
5,Father of the Bride Part II (1995),Comedy,-1,-1,-1,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [8]:
def get_year(r):
    match = re.match("(.*)\s*\((\d+)\)", r["title"])
    if match:
        return match.group(2)
    
def get_title_without_year(r):
    match = re.match("(.*)\s*\((\d+)\)", r["title"])
    if match:
        return match.group(1)
    else:
        return r["title"]

movies_pd["Year"] = movies_pd.apply(get_year, axis = 1)
movies_pd["title"] = movies_pd.apply(get_title_without_year, axis = 1)

In [9]:
movies_pd.head()

Unnamed: 0_level_0,title,genres,action,adventure,animation,children,comedy,crime,documentary,drama,...,horror,musical,mystery,romance,sci-fi,thriller,war,western,(no genres listed),Year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,-1,1,1,1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,
2,Jumanji,Adventure|Children|Fantasy,-1,1,-1,1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,
3,Grumpier Old Men,Comedy|Romance,-1,-1,-1,-1,1,-1,-1,-1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,
4,Waiting to Exhale,Comedy|Drama|Romance,-1,-1,-1,-1,1,-1,-1,1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,
5,Father of the Bride Part II,Comedy,-1,-1,-1,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,


In [10]:
ratings_pd = pd.read_csv(path + "ratings.csv")
ratings_pd.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [11]:
movies_pd["Average rating"] = ratings_pd.groupby("movieId")["rating"].mean()
movies_pd["Number of ratings"] = ratings_pd.groupby("movieId")["rating"].count()

movies_pd["Average rating"].fillna(0, inplace = True)
movies_pd["Number of ratings"].fillna(0, inplace = True)

In [12]:
movies_pd.head()

Unnamed: 0_level_0,title,genres,action,adventure,animation,children,comedy,crime,documentary,drama,...,mystery,romance,sci-fi,thriller,war,western,(no genres listed),Year,Average rating,Number of ratings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,-1,1,1,1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,,3.92124,49695.0
2,Jumanji,Adventure|Children|Fantasy,-1,1,-1,1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,,3.211977,22243.0
3,Grumpier Old Men,Comedy|Romance,-1,-1,-1,-1,1,-1,-1,-1,...,-1,1,-1,-1,-1,-1,-1,,3.15104,12735.0
4,Waiting to Exhale,Comedy|Drama|Romance,-1,-1,-1,-1,1,-1,-1,1,...,-1,1,-1,-1,-1,-1,-1,,2.861393,2756.0
5,Father of the Bride Part II,Comedy,-1,-1,-1,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,,3.064592,12161.0


In [13]:
tags_pd = pd.read_csv(path + "tags.csv", encoding="utf-8")

In [14]:
print(tags_pd.shape)
tags_pd.head()

(465564, 4)


Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


In [15]:
links_pd = pd.read_csv(path + "links.csv", encoding="utf-8", keep_default_na = False)
links_pd = links_pd.loc[links_pd["tmdbId"]!=""]
links_pd["tmdbId"] = pd.to_numeric(links_pd["tmdbId"])

In [16]:
links_pd.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


## Import des bases de données PKL

In [17]:
with open(path + "act_v2.pkl", "rb") as file_act :
    acteurs_pkl = pkl.load(file_act)
#acteurs_pkl contient une list * list * dict
#acteurs_pkl[i][j] est un dictionnaire avec des informations sur le j-ème acteur du i-ème film de la base act_v2.pkl

In [18]:
print(type(acteurs_pkl))
print(type(acteurs_pkl[0]))
print(type(acteurs_pkl[0][0]))
print(len(acteurs_pkl[0]))
print(acteurs_pkl[0][0])

<class 'list'>
<class 'list'>
<class 'dict'>
39
{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/xxPMucou2wRDxLrud8i2D4dsywh.jpg'}


In [19]:
with open(path + "film_v2.pkl", "rb") as file_film :
    films_pkl = pkl.load(file_film)
#films_pkl contient une list * dict
#films_pkl[i] est un dictionnaire avec des informations sur le i-ème film de la base act_v2.pkl

In [20]:
print(type(films_pkl))
print(type(films_pkl[0]))
print(len(films_pkl))
print(films_pkl[0])

<class 'list'>
<class 'dict'>
26908
{'adult': False, 'backdrop_path': '/dji4Fm0gCDVb9DQQMRvAI8YNnTz.jpg', 'genre_ids': [16, 35, 10751], 'id': 862, 'original_language': 'en', 'original_title': 'Toy Story', 'overview': "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.", 'poster_path': '/rhIRbceoE9lR4veEXuwCC2wARtG.jpg', 'release_date': '1995-10-30', 'title': 'Toy Story', 'video': False, 'vote_average': 7.9, 'vote_count': 9550, 'popularity': 22.773}


In [21]:
with open(path + "crew_v2.pkl", "rb") as file_crew :
    crew_pkl = pkl.load(file_crew)

In [22]:
print(type(crew_pkl))
print(type(crew_pkl[0]))
print(type(crew_pkl[0][0]))
print(len(crew_pkl))
print(crew_pkl[0][0])

<class 'list'>
<class 'list'>
<class 'dict'>
26908
{'credit_id': '52fe4284c3a36847f8024f49', 'department': 'Directing', 'gender': 2, 'id': 7879, 'job': 'Director', 'name': 'John Lasseter', 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}


## Transformation des bases PKL en CSV

In [21]:
with open(path + "generated/actors.csv", "w", encoding="utf-8") as f_actors:
    with open(path + "generated/actorsFilms.csv", "w", encoding="utf-8") as f_actorFilms:
        f_actors.write("idActor;name;gender\n")
        f_actorFilms.write("idActor;idFilmPkl;order;character\n")
        actorsAdded = set()
        for i in range(len(acteurs_pkl)):
            for actor in acteurs_pkl[i]:
                if actor["id"] not in actorsAdded:
                    actorsAdded.add(actor["id"])
                    f_actors.write("{:d};{:s};{:d}\n".format(actor["id"], actor["name"], actor["gender"]))
                f_actorFilms.write("{:d};{:d};{:d};{:s}\n".format(actor["id"], i, actor["order"], actor["character"].replace(";",",")))

In [22]:
with open(path + "generated/films.csv", "w", encoding="utf-8") as f_films:
    f_films.write("idFilmPkl;tmdbId;adult;original_language;original_title;release_date;title;vote_average;vote_count;popularity\n")
    for i in range(len(films_pkl)):
        f = films_pkl[i]
        f_films.write("{:d};{:d};{:d};{:s};{:s};{:s};{:s};{:f};{:d};{:f}\n"\
                      .format(i, f["id"], f["adult"], f["original_language"],\
                              f["original_title"] if ";" not in f["original_title"] else  "\"" + f["original_title"] + "\"",\
                              "" if "release_date" not in f else f["release_date"],\
                              f["title"] if ";" not in f["title"] else  "\"" + f["title"] + "\"",\
                              f["vote_average"],f["vote_count"],\
                              0 if "popularity" not in f else f["popularity"]))

In [23]:
with open(path + "generated/crew.csv", "w", encoding="utf-8") as f_crew:
    with open(path + "generated/crewFilms.csv", "w", encoding="utf-8") as f_crewFilms:
        f_crew.write("idCrew;name;gender\n")
        f_crewFilms.write("idCrew;idFilmPkl;department;job\n")
        crewAdded = set()
        for i in range(len(crew_pkl)):
            for cr in crew_pkl[i]:
                if cr["id"] not in crewAdded:
                    crewAdded.add(cr["id"])
                    f_crew.write("{:d};{:s};{:d}\n".format(cr["id"], cr["name"], cr["gender"]))
                f_crewFilms.write("{:d};{:d};{:s};{:s}\n".format(cr["id"], i, cr["department"], cr["job"]))

## Import des bases CSV obtenus à partir des PKL

In [23]:
actors_pd = pd.read_csv(path + "generated/actors.csv", sep=";", index_col="idActor")
actorsFilms_pd = pd.read_csv(path + "generated/actorsFilms.csv", sep=";")

In [24]:
actors_pd.head()

Unnamed: 0_level_0,name,gender
idActor,Unnamed: 1_level_1,Unnamed: 2_level_1
31,Tom Hanks,2
12898,Tim Allen,2
7167,Don Rickles,2
12899,Jim Varney,2
12900,Wallace Shawn,2


In [25]:
actorsFilms_pd.head()

Unnamed: 0,idActor,idFilmPkl,order,character
0,31,0,0,Woody (voice)
1,12898,0,1,Buzz Lightyear (voice)
2,7167,0,2,Mr. Potato Head (voice)
3,12899,0,3,Slinky Dog (voice)
4,12900,0,4,Rex (voice)


In [26]:
actors_pd["nbRoles"] = actorsFilms_pd.groupby('idActor').count()["idFilmPkl"]

In [27]:
actors_pd.sort_values("nbRoles", axis=0, ascending=False).head(10)

Unnamed: 0_level_0,name,gender,nbRoles
idActor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
121323,Bess Flowers,1,215
15831,Frank Welker,2,125
4165,John Wayne,2,114
2231,Samuel L. Jackson,2,104
113,Christopher Lee,2,104
884,Steve Buscemi,2,93
380,Robert De Niro,2,92
3895,Michael Caine,2,90
55636,Donald Sutherland,2,87
8516,John Carradine,2,86


In [28]:
films_pd = pd.read_csv(path + "generated/films.csv", sep=";", index_col="idFilmPkl")
films_pd["idFilmPkl"] = films_pd.index

In [29]:
films_pd.head()

Unnamed: 0_level_0,tmdbId,adult,original_language,original_title,release_date,title,vote_average,vote_count,popularity,idFilmPkl
idFilmPkl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,862,0,en,Toy Story,1995-10-30,Toy Story,7.9,9550,22.773,0
1,8844,0,en,Jumanji,1995-12-15,Jumanji,7.1,5594,2.947,1
2,15602,0,en,Grumpier Old Men,1995-12-22,Grumpier Old Men,6.5,140,6.076,2
3,31357,0,en,Waiting to Exhale,1995-12-22,Waiting to Exhale,6.1,55,2.917,3
4,11862,0,en,Father of the Bride Part II,1995-02-10,Father of the Bride Part II,6.1,288,6.817,4


In [30]:
films_pd.groupby("original_language").count().head()

Unnamed: 0_level_0,tmdbId,adult,original_title,release_date,title,vote_average,vote_count,popularity,idFilmPkl
original_language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
aa,1,1,1,1,1,1,1,1,1
ab,4,4,4,4,4,4,4,4,4
af,1,1,1,1,1,1,1,1,1
am,1,1,1,1,1,1,1,1,1
ar,21,21,21,21,21,21,21,21,21


In [31]:
crew_pd = pd.read_csv(path + "generated/crew.csv", sep=";", index_col="idCrew")
crewFilms_pd = pd.read_csv(path + "generated/crewFilms.csv", sep=";")

In [32]:
crew_pd.head()

Unnamed: 0_level_0,name,gender
idCrew,Unnamed: 1_level_1,Unnamed: 2_level_1
7879,John Lasseter,2
12891,Joss Whedon,2
7,Andrew Stanton,2
12892,Joel Cohen,2
12893,Alec Sokolow,0


In [33]:
crewFilms_pd.head()

Unnamed: 0,idCrew,idFilmPkl,department,job
0,7879,0,Directing,Director
1,12891,0,Writing,Screenplay
2,7,0,Writing,Screenplay
3,12892,0,Writing,Screenplay
4,12893,0,Writing,Screenplay


In [34]:
crew_pd["nbFilms"] = crewFilms_pd.groupby('idCrew').count()["idFilmPkl"]

In [35]:
crew_pd.sort_values("nbFilms", axis=0, ascending=False).head(10)

Unnamed: 0_level_0,name,gender,nbFilms
idCrew,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9062,Cedric Gibbons,2,282
59839,Harvey Weinstein,2,234
1307,Bob Weinstein,2,226
4350,Edith Head,1,174
2057972,Dan Perri,0,172
2952,Avy Kaufman,1,151
9063,Edwin B. Willis,2,142
1760,Jerry Goldsmith,2,140
5328,Kerry Barden,2,134
3192,Billy Hopkins,2,128


In [36]:
new_links_pd = links_pd.merge(films_pd[["idFilmPkl", "tmdbId"]], how="inner", on="tmdbId")
new_links_pd.head()

Unnamed: 0,movieId,imdbId,tmdbId,idFilmPkl
0,1,114709,862,0
1,2,113497,8844,1
2,3,113228,15602,2
3,4,114885,31357,3
4,5,113041,11862,4


In [37]:
filmes_pd = new_links_pd.merge(films_pd, how="inner", on="tmdbId")
filmes_pd = filmes_pd.merge(movies_pd, how="inner", on="movieId")
filmes_pd.drop(["imdbId", "idFilmPkl_y", "genres"], axis=1, inplace=True)
filmes_pd.rename({"idFilmPkl_x": "idFilmPkl",\
                  "original_language": "originalLanguage",\
                  "original_title": "originalTitle",\
                  "release_date": "releaseDate",\
                  "title_x": "titleTmdb",\
                  "vote_average": "ratingTmdb",\
                  "vote_count": "nbRatingTmdb",\
                  "title_y": "titleMovieLens",\
                  "Year": "year",\
                  "Average rating": "ratingMovieLens",\
                  "Number of ratings": "nbRatingMovieLens"}, axis=1, inplace=True)
print(filmes_pd.columns)
filmes_pd.head()

Index(['movieId', 'tmdbId', 'idFilmPkl', 'adult', 'originalLanguage',
       'originalTitle', 'releaseDate', 'titleTmdb', 'ratingTmdb',
       'nbRatingTmdb', 'popularity', 'titleMovieLens', 'action', 'adventure',
       'animation', 'children', 'comedy', 'crime', 'documentary', 'drama',
       'fantasy', 'film-noir', 'horror', 'musical', 'mystery', 'romance',
       'sci-fi', 'thriller', 'war', 'western', '(no genres listed)', 'year',
       'ratingMovieLens', 'nbRatingMovieLens'],
      dtype='object')


Unnamed: 0,movieId,tmdbId,idFilmPkl,adult,originalLanguage,originalTitle,releaseDate,titleTmdb,ratingTmdb,nbRatingTmdb,...,mystery,romance,sci-fi,thriller,war,western,(no genres listed),year,ratingMovieLens,nbRatingMovieLens
0,1,862,0,0,en,Toy Story,1995-10-30,Toy Story,7.9,9550,...,-1,-1,-1,-1,-1,-1,-1,,3.92124,49695.0
1,2,8844,1,0,en,Jumanji,1995-12-15,Jumanji,7.1,5594,...,-1,-1,-1,-1,-1,-1,-1,,3.211977,22243.0
2,3,15602,2,0,en,Grumpier Old Men,1995-12-22,Grumpier Old Men,6.5,140,...,-1,1,-1,-1,-1,-1,-1,,3.15104,12735.0
3,4,31357,3,0,en,Waiting to Exhale,1995-12-22,Waiting to Exhale,6.1,55,...,-1,1,-1,-1,-1,-1,-1,,2.861393,2756.0
4,5,11862,4,0,en,Father of the Bride Part II,1995-02-10,Father of the Bride Part II,6.1,288,...,-1,-1,-1,-1,-1,-1,-1,,3.064592,12161.0


In [38]:
nbActors = actorsFilms_pd[["idFilmPkl", "order"]].groupby("idFilmPkl").count().rename({"order": "nbActors"}, axis=1)
filmes_pd = filmes_pd.merge(nbActors, how="inner", on="idFilmPkl")
nbCrew = crewFilms_pd[["idFilmPkl", "job"]].groupby("idFilmPkl").count().rename({"job": "nbCrew"}, axis=1)
filmes_pd = filmes_pd.merge(nbCrew, how="inner", on="idFilmPkl")

In [39]:
filmes_pd

Unnamed: 0,movieId,tmdbId,idFilmPkl,adult,originalLanguage,originalTitle,releaseDate,titleTmdb,ratingTmdb,nbRatingTmdb,...,sci-fi,thriller,war,western,(no genres listed),year,ratingMovieLens,nbRatingMovieLens,nbActors,nbCrew
0,1,862,0,0,en,Toy Story,1995-10-30,Toy Story,7.9,9550,...,-1,-1,-1,-1,-1,,3.921240,49695.0,39,195
1,2,8844,1,0,en,Jumanji,1995-12-15,Jumanji,7.1,5594,...,-1,-1,-1,-1,-1,,3.211977,22243.0,32,94
2,3,15602,2,0,en,Grumpier Old Men,1995-12-22,Grumpier Old Men,6.5,140,...,-1,-1,-1,-1,-1,,3.151040,12735.0,7,50
3,4,31357,3,0,en,Waiting to Exhale,1995-12-22,Waiting to Exhale,6.1,55,...,-1,-1,-1,-1,-1,,2.861393,2756.0,14,14
4,5,11862,4,0,en,Father of the Bride Part II,1995-02-10,Father of the Bride Part II,6.1,288,...,-1,-1,-1,-1,-1,,3.064592,12161.0,12,7
5,6,949,5,0,en,Heat,1995-12-15,Heat,7.8,3002,...,-1,1,-1,-1,-1,,3.834930,23899.0,65,73
6,7,11860,6,0,en,Sabrina,1995-12-15,Sabrina,6.1,260,...,-1,-1,-1,-1,-1,,3.366484,12961.0,57,53
7,8,45325,7,0,en,Tom and Huck,1995-12-22,Tom and Huck,5.3,73,...,-1,-1,-1,-1,-1,,3.142049,1415.0,7,4
8,9,9091,8,0,en,Sudden Death,1995-10-27,Sudden Death,5.7,279,...,-1,-1,-1,-1,-1,,3.004924,3960.0,6,90
9,10,710,9,0,en,GoldenEye,1995-11-16,GoldenEye,6.8,1853,...,-1,1,-1,-1,-1,,3.430029,29005.0,21,46


In [40]:
actorsFilms_pd.head()

Unnamed: 0,idActor,idFilmPkl,order,character
0,31,0,0,Woody (voice)
1,12898,0,1,Buzz Lightyear (voice)
2,7167,0,2,Mr. Potato Head (voice)
3,12899,0,3,Slinky Dog (voice)
4,12900,0,4,Rex (voice)


In [41]:
filmes_pd["nbRatingMovieLens"].dtype

dtype('float64')

In [42]:
actors_pd.sort_values("nbRoles", ascending=False, inplace = True)
nbActors = actors_pd.shape[0]
rate = 0.05
nbFamousActors = int(rate*nbActors)
famous_actors_pd = actors_pd.iloc[:nbFamousActors]
idFamousActors = set(famous_actors_pd.index)

In [43]:
nbFamousActorsInFilm = np.zeros(len(acteurs_pkl), dtype=int)

for i in range(nbFamousActorsInFilm.size):
    film = acteurs_pkl[i]
    for actor in film:
        if actor["id"] in idFamousActors:
            nbFamousActorsInFilm[i] += 1
            
famousActorsInFilm_pd = pd.DataFrame(data={"idFilmPkl": np.arange(nbFamousActorsInFilm.size), "nbFamousActors": nbFamousActorsInFilm})

In [44]:
filmes_pd = filmes_pd.merge(famousActorsInFilm_pd, how="inner", on="idFilmPkl")

In [45]:
filmes_pd.head()

Unnamed: 0,movieId,tmdbId,idFilmPkl,adult,originalLanguage,originalTitle,releaseDate,titleTmdb,ratingTmdb,nbRatingTmdb,...,thriller,war,western,(no genres listed),year,ratingMovieLens,nbRatingMovieLens,nbActors,nbCrew,nbFamousActors
0,1,862,0,0,en,Toy Story,1995-10-30,Toy Story,7.9,9550,...,-1,-1,-1,-1,,3.92124,49695.0,39,195,21
1,2,8844,1,0,en,Jumanji,1995-12-15,Jumanji,7.1,5594,...,-1,-1,-1,-1,,3.211977,22243.0,32,94,10
2,3,15602,2,0,en,Grumpier Old Men,1995-12-22,Grumpier Old Men,6.5,140,...,-1,-1,-1,-1,,3.15104,12735.0,7,50,7
3,4,31357,3,0,en,Waiting to Exhale,1995-12-22,Waiting to Exhale,6.1,55,...,-1,-1,-1,-1,,2.861393,2756.0,14,14,11
4,5,11862,4,0,en,Father of the Bride Part II,1995-02-10,Father of the Bride Part II,6.1,288,...,-1,-1,-1,-1,,3.064592,12161.0,12,7,12


In [46]:
crew_pd.sort_values("nbFilms", ascending=False, inplace = True)
nbCrew = crew_pd.shape[0]
rate = 0.05
nbFamousCrew = int(rate*nbCrew)
famous_crew_pd = crew_pd.iloc[:nbFamousCrew]
idFamousCrew = set(famous_crew_pd.index)

In [47]:
nbFamousCrewInFilm = np.zeros(len(crew_pkl), dtype=int)

for i in range(nbFamousCrewInFilm.size):
    film = crew_pkl[i]
    for actor in film:
        if actor["id"] in idFamousCrew:
            nbFamousCrewInFilm[i] += 1
            
famousCrewInFilm_pd = pd.DataFrame(data={"idFilmPkl": np.arange(nbFamousCrewInFilm.size), "nbFamousCrew": nbFamousCrewInFilm})

In [48]:
filmes_pd = filmes_pd.merge(famousCrewInFilm_pd, how="inner", on="idFilmPkl")

In [49]:
filmes_pd.head()

Unnamed: 0,movieId,tmdbId,idFilmPkl,adult,originalLanguage,originalTitle,releaseDate,titleTmdb,ratingTmdb,nbRatingTmdb,...,war,western,(no genres listed),year,ratingMovieLens,nbRatingMovieLens,nbActors,nbCrew,nbFamousActors,nbFamousCrew
0,1,862,0,0,en,Toy Story,1995-10-30,Toy Story,7.9,9550,...,-1,-1,-1,,3.92124,49695.0,39,195,21,40
1,2,8844,1,0,en,Jumanji,1995-12-15,Jumanji,7.1,5594,...,-1,-1,-1,,3.211977,22243.0,32,94,10,26
2,3,15602,2,0,en,Grumpier Old Men,1995-12-22,Grumpier Old Men,6.5,140,...,-1,-1,-1,,3.15104,12735.0,7,50,7,29
3,4,31357,3,0,en,Waiting to Exhale,1995-12-22,Waiting to Exhale,6.1,55,...,-1,-1,-1,,2.861393,2756.0,14,14,11,8
4,5,11862,4,0,en,Father of the Bride Part II,1995-02-10,Father of the Bride Part II,6.1,288,...,-1,-1,-1,,3.064592,12161.0,12,7,12,6


## tentativa 2

In [50]:
ls_note4 = ls.LabeledSet(4)
ls_note4.x = np.empty((filmes_pd.shape[0], 4))
ls_note4.y = np.empty((filmes_pd.shape[0], 1))
ls_note4.nb_examples = filmes_pd.shape[0]

ls_note4.x[:,0] = filmes_pd["nbFamousActors"]/filmes_pd["nbActors"]
ls_note4.x[:,1] = filmes_pd["nbFamousCrew"]/filmes_pd["nbCrew"]
ls_note4.x[:,2] = filmes_pd["nbRatingMovieLens"]
ls_note4.x[:,3] = filmes_pd["nbRatingTmdb"]

ls_note4.x = (ls_note4.x - ls_note4.x.mean(0)) / ls_note4.x.std(0)

ls_note4.y[:,0] = (filmes_pd["ratingMovieLens"] >= 4) * 2 - 1

In [51]:
ls_note4_learn, ls_note4_test = ut.split(ls_note4, p_train = 0.85)

In [122]:
cl_note_sto = cl.ClassifierGradientSto(4, 1e-3)

In [123]:
N = 400

for i in range(N):
    cl_note_sto.train(ls_note4_learn)

In [124]:
cl_note_sto.accuracy(ls_note4_test)

52.22746331236897

In [125]:
cl_note_sto.accuracy(ls_note4_learn)

51.14138606643277

In [126]:
(ls_note4.y==1).sum()/ls_note4.y.size

0.05791251018903078

In [52]:
kernel = ut.KernelPolyMultiD()
cl_note_stoker = cl.ClassifierGradientStoKernel(15, 1e-5, kernel)

In [54]:
N = 400

for i in range(N):
    cl_note_stoker.train(ls_note4_learn)

In [55]:
cl_note_stoker.accuracy(ls_note4_test)

94.7087255151361

In [56]:
cl_note_stoker.accuracy(ls_note4_learn)

94.17368999633565