In [91]:
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
import regex as re
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from ipywidgets import interact

In [2]:
# Read the movie catalogue from the working directory and report (rows, columns).
Mdata = pd.read_csv(filepath_or_buffer="movies.csv")
Mdata.shape

(9125, 3)

In [3]:
# Preview the first five rows of the movie catalogue.
Mdata.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Read the user ratings from the working directory and report (rows, columns).
rData = pd.read_csv(filepath_or_buffer="ratings.csv")
rData.shape

(100836, 4)

In [5]:
# Preview the first five rating rows.
rData.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Attach every rating to its movie; the inner join keeps only movieIds
# that appear in both tables.
data = Mdata.merge(rData, how='inner', on='movieId')
data.shape

(96811, 6)

In [7]:
# Preview the first five rows of the merged movie/rating table.
data.head(n=5)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [8]:
# Keep only the columns used downstream (title, genres, rating).
# Selecting the wanted columns, instead of dropping the unwanted ones, leaves
# the same three columns in the same order but lets this cell be re-run:
# a second `drop` would raise KeyError because the ids are already gone.
data = data[['title', 'genres', 'rating']].copy()
data.head()

Unnamed: 0,title,genres,rating
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0
2,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.5
3,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.5
4,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.5


In [10]:
# Collapse to one row per (title, genres) pair holding the mean of the
# remaining numeric column(s), then turn the two index levels back into
# ordinary columns.
data = (
    pd.pivot_table(data, index=['title', 'genres'], aggfunc='mean')
    .reset_index()
)
data.head(10)

Unnamed: 0,title,genres,rating
0,'Hellboy': The Seeds of Creation (2004),Action|Adventure|Comedy|Documentary|Fantasy,4.0
1,'Round Midnight (1986),Drama|Musical,3.5
2,'Salem's Lot (2004),Drama|Horror|Mystery|Thriller,5.0
3,'Til There Was You (1997),Drama|Romance,4.0
4,"'burbs, The (1989)",Comedy,3.176471
5,'night Mother (1986),Drama,3.0
6,(500) Days of Summer (2009),Comedy|Drama|Romance,3.666667
7,*batteries not included (1987),Children|Comedy|Fantasy|Sci-Fi,3.285714
8,...And Justice for All (1979),Drama|Thriller,3.166667
9,1-900 (06) (1994),Drama|Romance,4.0


In [49]:
def get_year(text:str) -> int:
    """Extract the release year from a title ending in ``(YYYY)``.

    The year must be a four-digit number in parentheses at the very end of the
    title (trailing whitespace is tolerated). A range such as ``(2006-2007)``
    or ``(2006–2007)`` yields its first year.

    Parameters
    ----------
    text : str
        Movie title, e.g. ``"Toy Story (1995)"``.

    Returns
    -------
    int
        The parsed year, or ``0`` when ``text`` is not a string or carries no
        trailing parenthesised year.
    """
    # Non-string input (e.g. NaN from a missing title) has no year to parse.
    if not isinstance(text, str):
        return 0
    # Anchor on the parenthesised year rather than stripping non-digits from
    # the last space-separated token: that approach returned 0 for titles with
    # trailing whitespace and a bogus "year" for titles ending in a bare number.
    match = re.search(r"\((\d{4})(?:\s*[-–]\s*\d{4})?\)\s*$", text)
    if match is None:
        return 0
    return int(match.group(1))

In [50]:
# Derive a numeric release-year column by parsing each title with get_year.
data['year'] = data['title'].map(get_year)


In [51]:
# Preview the table with the new year column.
data.head(n=5)

Unnamed: 0,title,genres,rating,year
0,'Hellboy': The Seeds of Creation (2004),Action|Adventure|Comedy|Documentary|Fantasy,4.0,2004
1,'Round Midnight (1986),Drama|Musical,3.5,1986
2,'Salem's Lot (2004),Drama|Horror|Mystery|Thriller,5.0,2004
3,'Til There Was You (1997),Drama|Romance,4.0,1997
4,"'burbs, The (1989)",Comedy,3.176471,1989


In [55]:
# List the distinct parsed years; a 0 marks titles where get_year found no year.
pd.unique(data['year'])

array([2004, 1986, 1997, 1989, 2009, 1987, 1979, 1994, 2016, 2006, 1999,
       2011, 2008, 2000, 1996, 1961, 2002, 2003, 1957, 2013, 2010, 1960,
       2005, 2007, 2001, 1988, 1984, 2012, 1998, 1916, 1954, 1968, 1977,
       2014, 1985, 1992, 1995, 1991, 1935, 1959, 1933, 1982, 1953, 2015,
       1958, 1963,    0, 1948, 1971, 1981, 1951, 1949, 1993, 1990, 1938,
       1936, 1965, 1972, 1980, 1970, 1974, 1966, 1950, 1930, 1976, 1983,
       1973, 1964, 1956, 1945, 1929, 1969, 1978, 1939, 1975, 1944, 1934,
       1937, 1947, 1955, 1952, 1942, 1940, 1967, 1925, 1946, 1962, 1915,
       1920, 1931, 1928, 1941, 1927, 1943, 1932, 1926, 1924, 1917, 1921,
       1922, 1923, 1902], dtype=int64)

In [59]:
# One-hot encode the pipe-separated genre strings: one boolean column per
# distinct genre label, one row per movie (same row order as `data`).
te = TransactionEncoder()
genre_lists = data['genres'].str.split("|")
genres = pd.DataFrame(te.fit_transform(genre_lists), columns=te.columns_)

In [60]:
# Display the boolean genre-indicator table (one row per movie, one column per genre).
genres

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,False,True,True,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False
3,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False
4,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7067,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False
7068,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False
7069,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False
7070,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True


In [61]:
# Convert the boolean indicators to 0/1 integers and put the movie title in
# front as the first column; `data` and `genres` share the same row index.
genres = pd.concat([data['title'], genres.astype("int")], axis=1)
genres.head(2)

Unnamed: 0,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,'Hellboy': The Seeds of Creation (2004),0,1,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,'Round Midnight (1986),0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0


In [64]:
# Re-orient the table so each movie title is a column and each genre is a row:
# move the titles into the index, flip the frame, and keep integer 0/1 values.
genres = genres.set_index('title').T.astype(int)
# The transposed columns inherit the index name 'title'; clear it so the
# column axis is unnamed.
genres.columns.name = None

In [67]:
# Preview the first two genre rows of the title-by-column table.
genres.head(n=2)

Unnamed: 0,'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),1-900 (06) (1994),...,Zoom (2006),Zootopia (2016),Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
(no genres listed),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Action,1,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,1,1,1,0,0


In [96]:
@interact
def recomendations(movie = list(genres.columns)):
    """Return the ten movies whose genre profile correlates best with ``movie``.

    ``genres`` holds one column per movie title and one 0/1 row per genre, so
    the correlation between two columns measures how similar their genre sets
    are. The list default makes ``interact`` render a dropdown of all titles.

    Parameters
    ----------
    movie : str
        Title selected in the dropdown; must be a column label of ``genres``.

    Returns
    -------
    pandas.Series
        Up to ten titles (index) with their correlation to ``movie`` (values),
        highest first. The selected movie itself is never included.
    """
    g = genres[movie]
    # A title that occurs more than once as a column label is returned as a
    # DataFrame; fall back to its first column so corrwith gets a Series.
    if isinstance(g, pd.DataFrame):
        g = g.iloc[:, 0]
    similar_movies = genres.corrwith(g)
    # Exclude the selected movie by label. Skipping the first sorted row is
    # unreliable: other movies with an identical genre set also score 1.0, so
    # the selected title is not guaranteed to sort first.
    similar_movies = similar_movies.drop(labels=movie, errors="ignore")
    similar_movies = similar_movies.sort_values(ascending=False)
    return similar_movies.head(10)
    

interactive(children=(Dropdown(description='movie', options=("'Hellboy': The Seeds of Creation (2004)", "'Roun…