In [2]:
!pip install dynetx

Collecting dynetx
  Downloading https://files.pythonhosted.org/packages/b1/bb/aee9c3e845ad0eb1209a3495f942e5ad3cd8ffb56326253d35da891f24d3/dynetx-0.2.2-py3-none-any.whl
Installing collected packages: dynetx
Successfully installed dynetx-0.2.2


In [0]:
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt
import seaborn as sns
import numpy as np
import mpld3
import warnings
import re
from PIL import Image
from wordcloud import WordCloud
import umap
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import networkx as nx
import dynetx as dn

In [4]:
# Colab only: mount Google Drive at /content/drive, the root of the CSV paths read below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Silence every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Display options for pandas DataFrames.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
# None means "never truncate cell text"; the legacy -1 sentinel is rejected by current pandas.
pd.set_option('display.max_colwidth', None)
# pd.reset_option('display.max_colwidth')

In [0]:
# Folder on the mounted Drive that holds the ml-latest CSV files; change it here only.
DATA_DIR = '/content/drive/My Drive/TSEC/ml-latest'

movies = pd.read_csv(DATA_DIR + '/movies.csv')
ratings = pd.read_csv(DATA_DIR + '/ratings.csv')
tags = pd.read_csv(DATA_DIR + '/tags.csv')
links = pd.read_csv(DATA_DIR + '/links.csv')
gnomeT = pd.read_csv(DATA_DIR + '/genome-tags.csv')
gnomeS = pd.read_csv(DATA_DIR + '/genome-scores.csv')

In [0]:
def prepare_movies(movies):
  """Split the release year out of each title and one-hot encode the genres.

  Args:
    movies: DataFrame with a ``title`` column shaped like ``"Name (YYYY)"`` and a
      ``genres`` column of ``|``-separated genre names. It is not modified.

  Returns:
    A tuple ``(df, moviesWithGenres_df)``.
    ``df`` is a copy of ``movies`` with the "(YYYY)" marker removed from ``title``,
    a string ``year`` column (NaN when the title has no four-digit year in
    parentheses) and ``genres`` converted to a list of names.
    ``moviesWithGenres_df`` is ``df`` plus one float column per genre (in order of
    first appearance) holding 1.0 when the movie has the genre and 0.0 otherwise.
    Every remaining NaN in that frame (e.g. a missing year) is replaced by 0.
  """
  df = movies.copy()
  # Raw strings keep the regex escapes intact instead of relying on invalid string escapes.
  df['year'] = df['title'].str.extract(r'\((\d\d\d\d)\)', expand=False)
  # regex=True is required: recent pandas treats the pattern as a literal by default,
  # which would silently leave the "(YYYY)" suffix in the title.
  df['title'] = df['title'].str.replace(r'\(\d\d\d\d\)', '', regex=True).str.strip()
  # Every genre is separated by a | so we simply split on it.
  df['genres'] = df['genres'].str.split('|')

  # One row per (movie, genre) pair; the index still identifies the movie.
  exploded = df['genres'].explode()
  # Remember first-appearance order so the genre columns come out in a stable order.
  genre_order = exploded.dropna().drop_duplicates().tolist()
  genre_flags = (
      pd.get_dummies(exploded, dtype=float)
      .groupby(level=0).max()
      .reindex(index=df.index, columns=genre_order)
  )
  # fillna(0) marks "movie does not have this genre" (and zero-fills any other NaN).
  moviesWithGenres_df = pd.concat([df, genre_flags], axis=1).fillna(0)
  return df, moviesWithGenres_df

In [0]:
def preprocess_movies(movies):
  """Return a copy of ``movies`` with a clean title, a ``year`` column and no genres.

  Args:
    movies: DataFrame with a ``title`` column shaped like ``"Name (YYYY)"`` and a
      ``genres`` column. It is not modified.

  Returns:
    DataFrame in which ``title`` has the "(YYYY)" marker removed and is stripped of
    surrounding whitespace, ``year`` holds the four digits as a string (NaN when the
    title has no such marker), and the ``genres`` column is dropped.
  """
  df = movies.copy()
  # Raw strings keep the regex escapes intact instead of relying on invalid string escapes.
  df['year'] = df['title'].str.extract(r'\((\d\d\d\d)\)', expand=False)
  # regex=True is required: recent pandas treats the pattern as a literal by default.
  df['title'] = df['title'].str.replace(r'\(\d\d\d\d\)', '', regex=True).str.strip()
  # Keyword form: the positional axis argument of drop() is no longer accepted.
  return df.drop(columns='genres')

In [0]:
def prepare_ratings(ratings):
  """Return an independent copy of ``ratings`` without its ``timestamp`` column.

  The explicit copy matters: callers may write into the result, and that must never
  reach the caller's original frame.
  """
  # Keyword form: the positional axis argument of drop() is no longer accepted.
  return ratings.drop(columns='timestamp').copy()

In [0]:
def getMovieRecommendation(x, movies, ratings, links, n=20):
  """Content-based recommendations: movies whose genres match what user ``x`` rated.

  Args:
    x: userId to build the profile for.
    movies: raw movies table (passed to ``prepare_movies``).
    ratings: raw ratings table (passed to ``prepare_ratings``).
    links: table merged onto the result (joined on the columns it shares with it).
    n: number of top-scoring movies to keep.

  Returns:
    DataFrame with the ``n`` best-scoring movies merged with ``links``.
    NOTE: rows follow the order of the movies table, not the score ranking, and
    movies the user has already rated are not filtered out.
  """
  mov, moviesWithGenres = prepare_movies(movies)
  ratings = prepare_ratings(ratings)
  user_ratings = ratings.groupby('userId').get_group(x)
  rated = mov[mov['movieId'].isin(user_ratings['movieId'].tolist())]
  inputMovies = pd.merge(rated, user_ratings).drop(columns=['genres', 'userId'])

  # Everything except these columns is a one-hot genre flag.
  non_genre_columns = ['movieId', 'title', 'genres', 'year']

  userMovies = moviesWithGenres[moviesWithGenres['movieId'].isin(inputMovies['movieId'].tolist())]
  userMovies = userMovies.reset_index(drop=True)
  userGenreTable = userMovies.drop(columns=non_genre_columns)
  # Rating-weighted genre totals: genre flags (genres x movies) dotted with the user's ratings.
  userProfile = userGenreTable.transpose().dot(inputMovies['rating'])

  # set_index with a Series keeps the movieId column, hence it is dropped with the others.
  genreTable = moviesWithGenres.set_index(moviesWithGenres['movieId'])
  genreTable = genreTable.drop(columns=non_genre_columns)

  # Weighted average of the user profile over each movie's genres.
  recommendationTable_df = (genreTable * userProfile).sum(axis=1) / userProfile.sum()
  recommendationTable_df = recommendationTable_df.sort_values(ascending=False)

  output = mov.loc[mov['movieId'].isin(recommendationTable_df.head(n).keys())]
  return pd.merge(output, links)

In [0]:
def getFavGenre(x, movies, ratings):
  """Return each genre's percentage share of user ``x``'s rating-weighted profile.

  Args:
    x: userId to build the profile for.
    movies: raw movies table (passed to ``prepare_movies``).
    ratings: raw ratings table (passed to ``prepare_ratings``).

  Returns:
    Series indexed by genre name, values in percent, sorted from highest to lowest.
  """
  mov, moviesWithGenres = prepare_movies(movies)
  ratings = prepare_ratings(ratings)
  user_ratings = ratings.groupby('userId').get_group(x)
  rated = mov[mov['movieId'].isin(user_ratings['movieId'].tolist())]
  inputMovies = pd.merge(rated, user_ratings).drop(columns=['genres', 'year', 'userId'])

  userMovies = moviesWithGenres[moviesWithGenres['movieId'].isin(inputMovies['movieId'].tolist())]
  userMovies = userMovies.reset_index(drop=True)
  userGenreTable = userMovies.drop(columns=['movieId', 'title', 'genres', 'year'])
  # Rating-weighted genre totals: genre flags (genres x movies) dotted with the user's ratings.
  userProfile = userGenreTable.transpose().dot(inputMovies['rating'])

  # Vectorised share computation: the total is computed once, not once per genre.
  share = userProfile * 100 / userProfile.sum()
  return share.sort_values(ascending=False)

In [12]:
# Quick sanity check: nunique() has one entry per column, so this is the column count of the raw movies table.
len(movies.nunique())

3

In [0]:
def getNeighbours(x, movies, ratings, n, max_candidates=100):
  """Find the users whose ratings correlate best with those of user ``x``.

  Args:
    x: userId to find neighbours for.
    movies: raw movies table (passed to ``preprocess_movies``).
    ratings: raw ratings table (passed to ``prepare_ratings``).
    n: number of most similar users to return.
    max_candidates: only the users sharing the most rated movies with ``x`` are
      scored; this caps how many (default 100, the previous hard-coded value).

  Returns:
    A tuple ``(mov, ratings, topUsers)``: the preprocessed movies table, the ratings
    table without timestamps, and a DataFrame with columns ``similarityIndex``
    (Pearson correlation over the co-rated movies) and ``userId``, limited to the
    ``n`` highest similarities. User ``x`` is among the candidates too.
  """
  mov = preprocess_movies(movies)
  ratings = prepare_ratings(ratings)
  user_ratings = ratings.groupby('userId').get_group(x)
  rated = mov[mov['movieId'].isin(user_ratings['movieId'].tolist())]
  inputMovies = pd.merge(rated, user_ratings).drop(columns=['userId', 'year'])
  # Sorted once here (loop-invariant) so both rating lists line up by movieId below.
  inputMovies = inputMovies.sort_values(by='movieId')

  # Ratings, by anyone, of the movies the input user has rated.
  userSubset = ratings[ratings['movieId'].isin(inputMovies['movieId'].tolist())]
  # Group by the bare column name so each key is a scalar userId rather than a 1-tuple.
  candidates = sorted(userSubset.groupby('userId'), key=lambda pair: len(pair[1]), reverse=True)
  candidates = candidates[:max_candidates]

  pearsonCorrelationDict = {}
  for name, group in candidates:
    group = group.sort_values(by='movieId')
    # N in the Pearson formula: number of movies both users rated.
    nRatings = float(len(group))
    shared = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    xs = shared['rating'].tolist()
    ys = group['rating'].tolist()

    Sxx = sum(i**2 for i in xs) - pow(sum(xs), 2)/nRatings
    Syy = sum(i**2 for i in ys) - pow(sum(ys), 2)/nRatings
    Sxy = sum(i*j for i, j in zip(xs, ys)) - sum(xs)*sum(ys)/nRatings

    # Zero variance on either side means the correlation is undefined: score it 0.
    # "> 0" (rather than "!= 0") also keeps a rounding-induced negative out of sqrt().
    if Sxx > 0 and Syy > 0:
      pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
      pearsonCorrelationDict[name] = 0

  pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
  pearsonDF.columns = ['similarityIndex']
  pearsonDF['userId'] = pearsonDF.index
  pearsonDF.index = range(len(pearsonDF))
  topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False).head(n)
  return mov, ratings, topUsers

In [0]:
def getSimilarMovies(x, movies, ratings, links, n=10):
  """Collaborative-filtering recommendations from the ratings of similar users.

  Args:
    x: userId to recommend for.
    movies: raw movies table.
    ratings: raw ratings table.
    links: table merged onto the result (joined on the columns it shares with it).
    n: used both as the number of neighbours consulted and the number of movies kept.

  Returns:
    DataFrame with the ``n`` best-scoring movies merged with ``links``.
    NOTE: rows follow the order of the movies table, not the score ranking.
  """
  mov, rates, topUsers = getNeighbours(x, movies, ratings, n)
  topUsersRating = topUsers.merge(rates, on='userId', how='inner')
  topUsersRating['weightedRating'] = topUsersRating['similarityIndex'] * topUsersRating['rating']

  # Select the two needed columns before summing instead of summing every column.
  sums = topUsersRating.groupby('movieId')[['similarityIndex', 'weightedRating']].sum()
  sums.columns = ['sum_similarityIndex', 'sum_weightedRating']

  score = sums['sum_weightedRating'] / sums['sum_similarityIndex']
  # A zero similarity sum gives +/-inf; +inf would sort to the very top of the ranking.
  # Turn those into NaN so sort_values() places them last.
  score = score.replace([np.inf, -np.inf], np.nan)

  recommendation_df = pd.DataFrame()
  recommendation_df['weighted average recommendation score'] = score
  recommendation_df['movieId'] = sums.index
  recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)

  output = mov.loc[mov['movieId'].isin(recommendation_df.head(n)['movieId'].tolist())]
  return pd.merge(output, links)

In [0]:
def top_rated(movies, rating, n, min_rating=4):
  """Return the ``n`` movies that received the most ratings of at least ``min_rating``.

  Args:
    movies: raw movies table (passed to ``preprocess_movies``).
    rating: raw ratings table (passed to ``prepare_ratings``).
    n: number of movies to return.
    min_rating: lowest rating that counts as a "high" rating (default 4, the
      previous hard-coded value).

  Returns:
    DataFrame of the selected movies merged with their title and year. The ``rating``
    and ``userId`` columns hold counts of qualifying ratings, not rating values.
  """
  mov = preprocess_movies(movies)
  ratings = prepare_ratings(rating)
  # Plain boolean filter instead of a chained-assignment write, which is not
  # guaranteed to modify the frame. dropna() still discards incomplete rows.
  liked = ratings[ratings['rating'] >= min_rating].dropna()
  val = liked.groupby("movieId").count().reset_index()
  val = val.sort_values('rating', ascending=False).head(n)
  return pd.merge(val, mov)

In [0]:
# Example user for whom every view below is computed.
uid = 8
# Content-based recommendations (genre profile of the user's own ratings).
rcom_mov = getMovieRecommendation(uid, movies, ratings, links)
# Percentage share of each genre in the user's rating-weighted profile.
fav = getFavGenre(uid, movies, ratings)
# getNeighbours returns (movies, ratings, topUsers); only the 20 most similar users are kept here.
neigh = getNeighbours(uid, movies, ratings, 20)[2]
# Collaborative-filtering recommendations; this calls getNeighbours again internally.
neigh_mov = getSimilarMovies(uid, movies, ratings, links, n=20)

In [0]:
# The 5 movies with the most ratings of 4 or higher, across all users.
top = top_rated(movies, ratings, 5)

In [20]:
# Raw ratings of the example user, joined with the external ids from `links`.
# Uses `uid` rather than repeating the literal so the view follows the selected user.
user_ratings_raw = ratings[ratings['userId'] == uid]
pd.merge(user_ratings_raw, links)

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,8,3,3.0,858269342,113228,15602.0
1,8,5,3.0,858269342,113041,11862.0
2,8,7,4.0,858269342,114319,11860.0
3,8,12,3.0,858269481,112896,12110.0
4,8,14,3.0,858269416,113987,10858.0
5,8,52,4.0,858269372,113819,11448.0
6,8,62,4.0,858269303,113862,2054.0
7,8,100,4.0,858269481,115907,11062.0
8,8,135,3.0,858269416,116130,9101.0
9,8,140,3.0,858269446,118055,9302.0


In [18]:
# Content-based recommendations: movies whose genres best match the genres of the movies this user rated.
rcom_mov

Unnamed: 0,movieId,title,genres,year,imdbId,tmdbId
0,1907,Mulan,"[Adventure, Animation, Children, Comedy, Drama, Musical, Romance]",1998,120762,10674.0
1,4719,Osmosis Jones,"[Action, Animation, Comedy, Crime, Drama, Romance, Thriller]",2001,181739,12610.0
2,5018,Motorama,"[Adventure, Comedy, Crime, Drama, Fantasy, Mystery, Sci-Fi, Thriller]",1991,104922,49410.0
3,6902,Interstate 60,"[Adventure, Comedy, Drama, Fantasy, Mystery, Sci-Fi, Thriller]",2002,165832,20312.0
4,26093,"Wonderful World of the Brothers Grimm, The","[Adventure, Animation, Children, Comedy, Drama, Fantasy, Musical, Romance]",1962,56700,28367.0
5,27344,Revolutionary Girl Utena: Adolescence of Utena (a.k.a. Revolutionary Girl Utena the Movie) (Shoujo kakumei Utena: Adolescence mokushiroku),"[Action, Adventure, Animation, Comedy, Drama, Fantasy, Romance]",1999,243558,19738.0
6,75408,Lupin III: Sweet Lost Night (Rupan Sansei: Sweet Lost Night),"[Action, Animation, Comedy, Crime, Drama, Mystery, Romance, Thriller]",2008,1259781,59719.0
7,76153,Lupin III: First Contact (Rupan Sansei: Faasuto Kontakuto),"[Action, Animation, Comedy, Crime, Drama, Mystery, Romance, Thriller]",2002,827737,109572.0
8,81132,Rubber,"[Action, Adventure, Comedy, Crime, Drama, Film-Noir, Horror, Mystery, Thriller, Western]",2010,1612774,45649.0
9,83266,Kaho Naa... Pyaar Hai,"[Action, Adventure, Comedy, Drama, Mystery, Romance, Thriller]",2000,234000,16987.0


In [0]:
# Favourite genres of the user: percentage share of each genre in their rating-weighted profile, highest first.
fav

In [0]:
# Collaborative-filtering recommendations: movies scored from the ratings of the most similar users.
neigh_mov

In [0]:
# The users most similar to the example user, with their Pearson similarity over co-rated movies.
neigh

In [0]:
# Write the top-rated movies table to top.json (relative path, so it lands in the current working directory).
top.to_json('top.json')

In [0]:
# Build an undirected user-similarity graph: link a seed user to their nearest
# neighbours, then link each of those neighbours to their own nearest neighbours.
SEED_USER = 1
N_NEIGHBOURS = 15


def _link_neighbours(graph, user_id):
  """Add ``user_id``'s nearest neighbours to ``graph`` and return their user ids.

  Each edge runs from ``user_id`` to one neighbour and carries a ``distance``
  attribute equal to 1 - similarityIndex. Self-loops are skipped, because the
  user can appear in their own neighbour list.
  """
  nearest = getNeighbours(user_id, movies, ratings, N_NEIGHBOURS)[2]
  ids = nearest['userId'].to_list()
  distances = (1 - nearest['similarityIndex']).to_list()
  graph.add_nodes_from(ids)
  graph.add_edges_from(
      (user_id, other, {'distance': dist})
      for other, dist in zip(ids, distances)
      if other != user_id
  )
  return ids


G = nx.Graph()
# Expand every neighbour of the seed except the seed itself.
neigh = [other for other in _link_neighbours(G, SEED_USER) if other != SEED_USER]
for neighbour_id in neigh:
  _link_neighbours(G, neighbour_id)

In [0]:
plt.figure(figsize = (30,15))

# Fixed seed so the layout, and therefore the figure, is reproducible between runs.
pos = nx.spring_layout(G, k=0.15, iterations=20, seed=42)
nx.draw_networkx_nodes(G, pos, nodelist=G.nodes(), node_size=500, alpha=0.8)

# Edges are drawn twice on purpose: a thin line on top of a wide translucent band.
nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5)
nx.draw_networkx_edges(G, pos, edgelist=G.edges(), width=8, alpha=0.5)

# Labels must reuse `pos`; computing a second layout would place them away from their nodes.
labels = nx.draw_networkx_labels(G, pos=pos)
plt.show()

In [0]:
# Create a dynetx dynamic graph.
# NOTE(review): edge_removal=True presumably allows interactions to be removed later — confirm against the dynetx docs.
g = dn.DynGraph(edge_removal=True)
# Register every edge of the static graph G as an interaction stamped t=2.
# NOTE(review): the choice of t=2 is not explained anywhere in this notebook — confirm.
g.add_interactions_from(G.edges(), t=2)