In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

In [2]:
movies = pd.read_csv("../Dataset/movies/movies_metadata.csv", low_memory=False)
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
movies['genres'] = movies['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])


In [4]:
keywords = pd.read_csv('../Dataset/movies/keywords.csv')
keywords.head()


Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


Removing movies that have a malformed id

In [5]:
def clean_ids(x):
    """Convert a raw ``id`` cell to ``int``, or ``np.nan`` when it is malformed.

    Parameters
    ----------
    x : object
        Raw value taken from the ``id`` column.

    Returns
    -------
    int or float
        ``int(x)`` when the conversion succeeds, otherwise ``np.nan`` so the
        row can be dropped afterwards with ``notnull()``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "malformed id". A bare ``except``
        # would also hide KeyboardInterrupt / SystemExit.
        return np.nan
    
print(len(movies['id']))
movies['id'] = movies['id'].apply(clean_ids)
# Rows whose id could not be parsed are NaN at this point; drop them.
# .copy() detaches the filtered frame so that later column assignments on
# `movies` do not trigger pandas' SettingWithCopyWarning.
movies = movies[movies['id'].notnull()].copy()
print(len(movies['id']))

45466
45463


Typecasting movies['id'] col and keywords['id'] col, and merging

In [6]:
# Give the join key the same integer dtype on both frames.
movies = movies.astype({'id': 'int'})
keywords = keywords.astype({'id': 'int'})

# Inner join: only movies that have a keywords row survive.
# NOTE(review): the frame has 45463 rows before this merge and 46482 after it,
# presumably because some ids repeat in one of the frames - confirm, and
# consider dropping duplicate ids first.
movies = pd.merge(movies, keywords, on='id')


In [7]:
movies["keywords"][0]

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

In [8]:
movies["keywords"] = movies["keywords"].apply(literal_eval)

In [9]:
def generate_list(x):
    """Return at most the first ten ``'name'`` values from a list of dicts.

    Any input that is not a list yields an empty list.
    """
    if not isinstance(x, list):
        return []
    # Collect every name first, then truncate to the leading ten.
    return [entry['name'] for entry in x][:10]

movies['keywords'] = movies['keywords'].apply(generate_list)
movies['genres'] = movies['genres'].apply(lambda x: x[:10])

movies[['title', 'keywords', 'genres', 'id']].head()

Unnamed: 0,title,keywords,genres,id
0,Toy Story,"[jealousy, toy, boy, friendship, friends, riva...","[Animation, Comedy, Family]",862
1,Jumanji,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]",8844
2,Grumpier Old Men,"[fishing, best friend, duringcreditsstinger, o...","[Romance, Comedy]",15602
3,Waiting to Exhale,"[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]",31357
4,Father of the Bride Part II,"[baby, midlife crisis, confidence, aging, daug...",[Comedy],11862


In [10]:
def sanitize(x):
    """Strip spaces and lower-case a string, or every string in a list.

    Returns a list for list input, a string for string input, and ``''`` for
    anything else.
    """
    def _squash(text):
        # "Science Fiction" -> "sciencefiction", so a multi-word term stays
        # one token.
        return str.lower(text.replace(' ', ''))

    if isinstance(x, list):
        return list(map(_squash, x))
    if isinstance(x, str):
        return _squash(x)
    return ''

for feature in ['genres', 'keywords']:
    movies[feature] = movies[feature].apply(sanitize)

Using title, genres, overview and keywords as features

In [11]:
def movie_soup(x):
    """Build one text blob for a movie row.

    The blob is title, genres, overview and keywords, in that order, separated
    by single spaces. ``genres`` and ``keywords`` are iterables of strings.
    """
    parts = (
        x["title"],
        " ".join(x['genres']),
        x['overview'],
        " ".join(x['keywords']),
    )
    return " ".join(parts)

movies['overview'] = movies['overview'].fillna('')
movies['title'] = movies['title'].fillna('')
movies['soup'] = movies.apply(movie_soup, axis=1)

In [12]:
# Look up the id of "The Matrix" and show the soup built for it.
# Uses a dedicated name instead of shadowing the builtin `id`, and takes the
# first matching element explicitly rather than calling int() on a Series.
matrix_id = int(movies.loc[movies['title'] == "The Matrix", 'id'].iloc[0])
movies.loc[movies['id'] == matrix_id, 'soup'].values

array(['The Matrix action sciencefiction Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth. savingtheworld artificialintelligence manvsmachine philosophy prophecy martialarts selfsacrifice fight insurgence virtualreality'],
      dtype=object)

In [13]:
books = pd.read_csv("../Dataset/top2k_book_descriptions/top2k_book_descriptions.csv", index_col=0)
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,goodreads_book_id,tag_name,description
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,[],WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,3,"['books', 'bbc', 'books', 'every', 'man', 'sho...",Harry Potter's life is miserable. His parents ...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,41865,[],About three things I was absolutely positive.F...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,2657,"['mercenaries', 'mercenary', 'mercer', 'mayer'...",The unforgettable novel of a childhood in a sl...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,4671,[],Alternate Cover Edition ISBN: 0743273567 (ISBN...


In [14]:
books['tag_name'][1]

"['books', 'bbc', 'books', 'every', 'man', 'should', 'read', 'books', 'own', 'for', 'books', 'that', 'define', 'the', 'books', 'to', 'read', 'books', 'to', 'read', 'before', 'you', 'die', 'books', 'to', 'read', 'in', 'lifetime', 'books', 'to', 'read', 'my', 'version', 'bullets', 'children', 'books', 'essential', 'novels', 'essential', 'novels', 'poster', 'fiction', 'greatest', 'greatest', 'books', 'ever', 'written', 'greatest', 'novels', 'in', 'list', 'modern', 'library', 'must', 'read', 'books', 'must', 'read', 'books', 'the', 'essential', 'novels', 'everyone', 'should', 'read', 'recommended', 'year', 'war', 'years', 'war', 'book', 'before', 'kindergarten', 'book', 'goal', 'books', 'books', 'before', 'kindergarten', 'books', 'to', 'read', 'before', 'you', 'die', 'by', 'good', 'books', 'harry', 'novels', 'plus', 'pages', 'before', 'you', 'die', 'book', 'book', 'list', 'books', 'books', 'books', 'before', 'die', 'books', 'before', 'you', 'die', 'books', 'challenge', 'books', 'have', 're

In [15]:
def _parse_tags(raw):
    """Parse a stringified tag list once; empty results become ``np.nan``.

    Values that are not strings (for example a list from an earlier run of
    this cell) are passed through instead of being parsed again.
    """
    if isinstance(raw, str):
        raw = literal_eval(raw) if raw.strip() else []
    return raw if raw else np.nan


books['tag_name'] = books['tag_name'].apply(_parse_tags)
# Keep a book only if it has a description or at least one tag.
books = books[books['description'].notnull() | books['tag_name'].notnull()]
# Remaining gaps become empty strings so the soup concatenation never sees NaN.
books = books.fillna('')

In [16]:
def book_soup(x):
    """Build one text blob for a book row.

    The blob is original title, description, tags and authors, in that order,
    separated by single spaces. ``tag_name`` is an iterable of strings.
    """
    pieces = [
        x["original_title"],
        x["description"],
        " ".join(x['tag_name']),
        x["authors"],
    ]
    return " ".join(pieces)

In [17]:
books["soup"] = books.apply(book_soup, axis=1)
books["soup"][0]

"The Hunger Games WINNING MEANS FAME AND FORTUNE.LOSING MEANS CERTAIN DEATH.THE HUNGER GAMES HAVE BEGUN. . . .In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and once girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV.Sixteen-year-old Katniss Everdeen regards it as a death sentence when she steps forward to take her sister's place in the Games. But Katniss has been close to dead before—and survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weight survival against humanity and life against love.  Suzanne Collins"

In [18]:
print(movies['soup'])

0        Toy Story animation comedy family Led by Woody...
1        Jumanji adventure fantasy family When siblings...
2        Grumpier Old Men romance comedy A family weddi...
3        Waiting to Exhale comedy drama romance Cheated...
4        Father of the Bride Part II comedy Just when G...
                               ...                        
46477    Subdue drama family Rising and falling between...
46478    Century of Birthing drama An artist struggles ...
46479    Betrayal action drama thriller When one of her...
46480    Satan Triumphant  In a small town live two bro...
46481    Queerama  50 years after decriminalisation of ...
Name: soup, Length: 46482, dtype: object


In [19]:
print(books['soup'])

0       The Hunger Games WINNING MEANS FAME AND FORTUN...
1       Harry Potter and the Philosopher's Stone Harry...
2       Twilight About three things I was absolutely p...
3       To Kill a Mockingbird The unforgettable novel ...
4       The Great Gatsby Alternate Cover Edition ISBN:...
                              ...                        
1995    Simon vs. the Homo Sapiens Agenda Sixteen-year...
1996    Perfection Zoe is used to taking care of herse...
1997    Come Away with Me An alternate cover edition c...
1998    Ms. Marvel, Vol. 1: No Normal Marvel Comics pr...
1999    A Court of Wings and Ruin Looming war threaten...
Name: soup, Length: 1987, dtype: object


In [20]:

soups = pd.concat([movies['soup'],books['soup']],ignore_index=True)
print(soups)


0        Toy Story animation comedy family Led by Woody...
1        Jumanji adventure fantasy family When siblings...
2        Grumpier Old Men romance comedy A family weddi...
3        Waiting to Exhale comedy drama romance Cheated...
4        Father of the Bride Part II comedy Just when G...
                               ...                        
48464    Simon vs. the Homo Sapiens Agenda Sixteen-year...
48465    Perfection Zoe is used to taking care of herse...
48466    Come Away with Me An alternate cover edition c...
48467    Ms. Marvel, Vol. 1: No Normal Marvel Comics pr...
48468    A Court of Wings and Ruin Looming war threaten...
Name: soup, Length: 48469, dtype: object


Vectorizing the input (bag-of-words count vectors)

In [21]:
count = CountVectorizer(stop_words = "english")
# Fit on movies and books together so both matrices share one vocabulary.
count.fit(soups)
# The fitted term -> column mapping is `vocabulary_`; `vocabulary` is only the
# constructor argument, which is unset here and prints None. Report its size
# instead of dumping every term.
print(len(count.vocabulary_))

movies_matrix = count.transform(movies['soup'])
books_matrix = count.transform(books['soup'])

books_matrix.shape, movies_matrix.shape

98296


((1987, 98296), (46482, 98296))

In [22]:
# %pip install tensorflow


In [23]:
cosine_sim = cosine_similarity(movies_matrix, books_matrix)
cosine_sim.shape

(46482, 1987)

In [24]:
# reset_index() moves the current index into a new 'index' column and gives
# the frame a fresh 0..n-1 index.
movies = movies.reset_index()
# Lookup table: lower-cased title -> row position in `movies`.
# NOTE(review): Series.drop_duplicates() compares the *values* (the row
# positions, all distinct), not the title labels, so repeated titles appear to
# remain in the index - the displayed length is still 46482. Confirm; to keep
# one entry per title, something like
# `indices[~indices.index.duplicated(keep='first')]` would be needed.
indices = pd.Series(movies.index, index=movies['title'].apply(lambda x: x.lower() if x is not np.nan else "")).drop_duplicates()
indices

title
toy story                          0
jumanji                            1
grumpier old men                   2
waiting to exhale                  3
father of the bride part ii        4
                               ...  
subdue                         46477
century of birthing            46478
betrayal                       46479
satan triumphant               46480
queerama                       46481
Length: 46482, dtype: int64

Recommending books from a single movie

In [25]:
def content_recommender(title, lim = 10):
    """Return the ``lim`` books most similar to the movie called ``title``.

    Parameters
    ----------
    title : str
        Movie title; matched case-insensitively against ``indices``.
    lim : int, default 10
        Number of books to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``books`` ordered by descending cosine similarity. Ties keep
        their original book order.

    Raises
    ------
    KeyError
        If ``indices`` has no entry for the lower-cased title.
    """
    idx = indices[title.lower()]
    # A title shared by several movies makes the lookup return a Series of row
    # positions. Use the first match so that cosine_sim[idx] is one row of
    # scores rather than a 2-D slice that cannot be sorted.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    scores = np.asarray(cosine_sim[idx])
    # A stable argsort on the negated scores matches sorted(..., reverse=True):
    # highest score first, equal scores in original order.
    book_indices = np.argsort(-scores, kind="stable")[:lim]

    return books.iloc[book_indices]

In [26]:
# Compute the recommendations once, then print each title. Calling the
# recommender inside the loop repeated the full similarity ranking every time.
matrix_recommendations = content_recommender('The Matrix')
for book_title in matrix_recommendations["original_title"]:
    print(book_title)

A Short History of Nearly Everything
Walk Two Moons
The Pillars of the Earth
Записки из подполья
Old Man's War
Native Son 
Confessions of an Ugly Stepsister
The Gift of the Magi
The War of the Worlds
Mountains Beyond Mountains: The Quest of Dr. Paul Farmer, a Man Who Would Cure the World


In [27]:
userDf = pd.read_csv('../Dataset/movies/ratings_small.csv')

def assignVal(x):
    """Map a star rating to a signed preference weight.

    Ratings in [0, 1] give -2, up to 2 give -1, up to 3 give 1, up to 4 give 2
    and up to 5 give 3. Values that match no bucket (above 5, or NaN) return
    ``None``.
    """
    if 0 <= x <= 1:
        return -2
    # Upper bounds are checked in ascending order; the first one that holds wins.
    for upper_bound, weight in ((2, -1), (3, 1), (4, 2), (5, 3)):
        if x <= upper_bound:
            return weight
    return None


userDf["rating"] = userDf["rating"].apply(assignVal)

Recommending books from a user's movie ratings

In [28]:
def getUserMovies(userID) :
    """Return ``{movie title: rating weight}`` for one user's rated movies.

    Parameters
    ----------
    userID : int
        Value matched against ``userDf['userId']``.

    Returns
    -------
    dict
        Maps each rated movie's title to the user's value in
        ``userDf['rating']``. Ratings whose ``movieId`` has no row in
        ``movies`` are skipped. If two rated movies share a title, the later
        rating wins.
    """
    # NOTE(review): userDf['movieId'] is joined directly against movies['id'];
    # confirm that both columns use the same id scheme.
    user_ratings = userDf.loc[userDf['userId'] == userID, ['movieId', 'rating']]
    rated_ids = user_ratings['movieId'].astype(int)

    # One pass over `movies` for all rated ids, instead of re-filtering the
    # whole frame several times per rating. When an id appears on several
    # rows, the first title is kept.
    matched = movies.loc[movies['id'].isin(rated_ids), ['id', 'title']]
    id_to_title = matched.drop_duplicates(subset='id').set_index('id')['title'].to_dict()

    title_rating = dict()
    for movie_id, rating in zip(rated_ids, user_ratings['rating']):
        if movie_id in id_to_title:
            title_rating[id_to_title[movie_id]] = rating

    return title_rating


getUserMovies(1)

{'Rocky III': 1,
 'Greed': -2,
 'American Pie': 2,
 'My Tutor': -1,
 'Jay and Silent Bob Strike Back': -1,
 'Confidentially Yours': 1}

In [29]:

def userRecommender(user, lim = 10, pool = 50):
    """Recommend books for a user from the movies they rated.

    Each rated movie contributes its ``pool`` most similar books. Every such
    book receives that movie's rating weight, and weights are summed over all
    rated movies.

    Parameters
    ----------
    user : int
        User id passed to ``getUserMovies``.
    lim : int, default 10
        Number of book titles to return.
    pool : int, default 50
        Number of candidate books requested per rated movie.

    Returns
    -------
    list of str
        Book titles ordered by descending accumulated weight. Ties keep the
        order in which the titles were first seen.
    """
    movieScore = getUserMovies(user)

    AllRecomm = dict()

    for movie, score in movieScore.items():
        curRecomm = content_recommender(movie, pool)

        # Iterate over the rows actually returned rather than a fixed
        # range(50), which raises IndexError when fewer books come back.
        for title in curRecomm["original_title"]:
            AllRecomm[title] = AllRecomm.get(title, 0) + score

    ranked = sorted(AllRecomm.items(), key=lambda item: item[1], reverse=True)

    return [title for title, _ in ranked[:lim]]
    
userRecommender(1)

['American Psycho',
 'Heidi',
 'A Separate Peace',
 'Ramona Quimby, Age 8',
 'Losing It',
 'Hogfather',
 'Rework',
 'The Pact',
 'The Magicians',
 "It's Kind of a Funny Story"]

In [30]:
import ipywidgets
from IPython.display import HTML


def show_books(movie_name='Star Wars'):
    """Print the books recommended for ``movie_name``, one title per line.

    Prints a short message instead of raising when the title is not known to
    ``content_recommender``, so the text widget stays usable.
    """
    try:
        recommendations = content_recommender(movie_name)
    except KeyError:
        print("No movie titled", repr(movie_name), "found")
        return
    # Iterate over what was returned instead of assuming exactly ten rows.
    for title in recommendations["original_title"]:
        print(title)
display(ipywidgets.interact(show_books))

interactive(children=(Text(value='Star Wars', description='movie_name'), Output()), _dom_classes=('widget-inte…

<function __main__.show_books(movie_name='Star Wars')>

In [31]:
import ipywidgets
from IPython.display import HTML


def show_books(userID='1'):
    """Print the books recommended for the user id typed into the widget.

    ``userID`` arrives as text. Input that is not an integer prints a short
    message instead of raising.
    """
    # NOTE(review): this reuses the name `show_books` from the movie-based
    # cell and therefore replaces that function once this cell runs.
    try:
        userID = int(userID)
    except ValueError:
        print("Invalid user id:", repr(userID))
        return
    for recommendation in userRecommender(userID):
        print(recommendation)
display(ipywidgets.interact(show_books))

interactive(children=(Text(value='1', description='userID'), Output()), _dom_classes=('widget-interact',))

<function __main__.show_books(userID='1')>