# Importing Modules

In [1]:
import numpy as np
import pandas as pd
import mlxtend
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from copy import deepcopy

# Importing Datasets

In [2]:
df_links = pd.read_csv('links.csv')
df_movies = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('ratings.csv')
df_tags = pd.read_csv('tags.csv')

# Filling Missing Values

In [3]:
df_links.fillna(0,inplace=True)

# Convert to lower case

In [4]:
for i in range(len(df_movies['title'])):
  df_movies['title'][i] = df_movies['title'][i].lower()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [5]:
temp = deepcopy(df_tags)
df_genre = df_movies.merge(temp, on='movieId', how = 'inner')
df_genre = df_genre.groupby(by = ['userId'])["genres"].apply(list).reset_index()
df_genre.head()

Unnamed: 0,userId,genres
0,2,"[Comedy, Comedy, Comedy, Drama, Drama, Drama, ..."
1,7,[Crime|Drama|Thriller]
2,18,"[Crime|Drama, Crime|Drama, Crime|Drama, Crime|..."
3,21,"[Comedy|Romance, Comedy|Romance, Drama, Action..."
4,49,"[Sci-Fi|IMAX, Sci-Fi|IMAX, Sci-Fi|IMAX]"


In [6]:
df_genre_list = []
for i in range(len(df_genre['genres'])):
  temp = []
  for j in df_genre['genres'][i]:
    new = list(map(str,j.split('|')))
    for newj in new: temp.append(newj)
  # df_genre['genres'][i] = temp
  temp = list(set(temp))
  df_genre_list.append(temp)
  df_genre['genres'][i] = temp

df_genre_list[0:3]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


[['Comedy', 'Drama', 'Crime'],
 ['Thriller', 'Crime', 'Drama'],
 ['Thriller', 'Drama', 'Mystery', 'Documentary', 'Crime', 'War']]

In [7]:
df_genre.head()

Unnamed: 0,userId,genres
0,2,"[Comedy, Drama, Crime]"
1,7,"[Thriller, Crime, Drama]"
2,18,"[Thriller, Drama, Mystery, Documentary, Crime,..."
3,21,"[Action, Comedy, Romance, Drama]"
4,49,"[Sci-Fi, IMAX]"


In [8]:
# df_tagss = df_movies.merge(deepcopy(df_tags), on='movieId', how = 'inner')
# df_tagss = df_tags.groupby(by = ['userId'])["tag"].apply(list).reset_index()
# df_tagss = df_tags.groupby(by = ['userId'])["tag"]
# df_tagss.head()

user_to_movies = {}
for i in range(len(df_ratings)):
  if(df_ratings['userId'][i] in user_to_movies): user_to_movies[df_ratings['userId'][i]].append(df_ratings['movieId'][i])
  else: user_to_movies[df_ratings['userId'][i]] = [df_ratings['movieId'][i]]

movies_to_tags = {}
for i in range(len(df_tags)):
  if(df_tags['movieId'][i] in movies_to_tags): movies_to_tags[df_tags['movieId'][i]].append(df_tags['tag'][i])
  else: movies_to_tags[df_tags['movieId'][i]] = [df_tags['tag'][i]]

user_to_tag = {}

for i in user_to_movies:
  temp = []
  for j in user_to_movies[i]:
    if(j not in movies_to_tags): continue
    temp += movies_to_tags[j]
  if(temp == []): continue
  temp = list(set(temp))
  user_to_tag[i] = temp

title_to_id = {}
for i in range(len(df_movies)):
  title_to_id[df_movies['title'][i]] = df_movies['movieId'][i]

for i in movies_to_tags:
  for j in range(len(movies_to_tags[i])): movies_to_tags[i][j] = movies_to_tags[i][j].lower()

In [9]:
df_tags_list = []
for i in user_to_tag:
  df_tags_list.append(user_to_tag[i])
# for i in range(len(df_tags_list)):
#   df_tags_list[i] = list(set(df_tags_list[i]))
df_tags_list[0:3]
print(len(df_tags_list))

288


# Making Rules For Genre and Tags

In [10]:
te = TransactionEncoder()
te_ary = te.fit(df_genre_list).transform(df_genre_list)
df = pd.DataFrame(te_ary, columns=te.columns_)

In [11]:
apriori_frequent_itemsets = apriori(df, min_support=0.2,use_colnames=True,max_len=10)

In [12]:
rules_genre = association_rules(apriori_frequent_itemsets,metric="lift",min_threshold=1)
rules_genre = rules_genre.sort_values(by=['lift'], ascending=False)

In [13]:
rules_genre

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2623,"(Adventure, Mystery)","(Fantasy, Action, Drama)",0.241379,0.206897,0.206897,0.857143,4.142857,0.156956,5.551724
4898,"(Adventure, Mystery)","(Thriller, Action, Drama, Fantasy)",0.241379,0.206897,0.206897,0.857143,4.142857,0.156956,5.551724
5867,"(Adventure, Drama, Mystery)","(Thriller, Comedy, Action, Romance)",0.241379,0.206897,0.206897,0.857143,4.142857,0.156956,5.551724
4861,"(Adventure, Action, Drama, Mystery)","(Thriller, Fantasy)",0.241379,0.206897,0.206897,0.857143,4.142857,0.156956,5.551724
3812,"(Adventure, Drama, Mystery)","(Thriller, Fantasy)",0.241379,0.206897,0.206897,0.857143,4.142857,0.156956,5.551724
...,...,...,...,...,...,...,...,...,...
57,(Comedy),(Sci-Fi),0.603448,0.465517,0.293103,0.485714,1.043386,0.012188,1.039272
884,(Drama),"(Adventure, Action, Sci-Fi)",0.672414,0.379310,0.258621,0.384615,1.013986,0.003567,1.008621
873,"(Adventure, Action, Sci-Fi)",(Drama),0.379310,0.672414,0.258621,0.681818,1.013986,0.003567,1.029557
125,(Drama),"(Adventure, Action)",0.672414,0.431034,0.293103,0.435897,1.011282,0.003270,1.008621


In [14]:
te = TransactionEncoder()
te_ary = te.fit(df_tags_list).transform(df_tags_list)
df = pd.DataFrame(te_ary, columns=te.columns_)

In [15]:
apriori_frequent_itemsets = apriori(df, min_support=0.62, use_colnames=True, max_len = 8)
apriori_frequent_itemsets.head()
print(apriori_frequent_itemsets.size)

6650


In [16]:
rules_tags = association_rules(apriori_frequent_itemsets,metric="lift",min_threshold=0.8)
rules_tags = rules_tags.sort_values(by=['lift'], ascending=False)

In [17]:
rules_tags

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
97900,"(psychological, atmospheric, classic, stylized)","(quirky, thought-provoking, imdb top 250, action)",0.625000,0.631944,0.621528,0.994444,1.573626,0.226562,66.250000
97837,"(quirky, thought-provoking, imdb top 250, action)","(psychological, atmospheric, classic, stylized)",0.631944,0.625000,0.621528,0.983516,1.573626,0.226562,22.750000
99558,"(quirky, twist ending, thought-provoking, imdb...","(psychological, atmospheric, stylized)",0.625000,0.631944,0.621528,0.994444,1.573626,0.226562,66.250000
99735,"(psychological, atmospheric, stylized)","(quirky, twist ending, thought-provoking, imdb...",0.631944,0.625000,0.621528,0.983516,1.573626,0.226562,22.750000
82949,"(atmospheric, dark, stylized)","(quirky, psychological, imdb top 250, action)",0.628472,0.628472,0.621528,0.988950,1.573578,0.226550,33.623264
...,...,...,...,...,...,...,...,...,...
396,(suspense),(time travel),0.798611,0.732639,0.638889,0.800000,1.091943,0.053795,1.336806
401,(time travel),(thought-provoking),0.732639,0.795139,0.635417,0.867299,1.090751,0.052867,1.543775
400,(thought-provoking),(time travel),0.795139,0.732639,0.635417,0.799127,1.090751,0.052867,1.330993
379,(twist ending),(sci-fi),0.791667,0.750000,0.638889,0.807018,1.076023,0.045139,1.295455


# Prediction

In [18]:
from difflib import SequenceMatcher

def similar(a, b):
    a = a.replace(',', '')
    b = b.replace(',', '')
    f = list(map(str,a.split()))
    s = list(map(str,b.split()))
    ff = []
    ss = []
    for i in f:
      if('(' in i): continue
      else: ff.append(i)
    
    for i in s:
      if('(' in i): continue
      else: ss.append(i)

    ff = set(ff); ss = set(ss)
    return len(ff&ss)/len(ff|ss)



In [19]:
df_temp = df_movies.merge(df_tags, on='movieId', how = 'inner')
for i in range(len(df_temp['genres'])):
  temp = list(set(list(map(str,df_temp['genres'][i].split('|')))))
  df_temp['genres'][i] = temp


movies_to_genre = df_temp.groupby(by = 'title')['genres'].apply(list).reset_index()
movies_to_tag = df_temp.groupby(by = 'title')['tag'].apply(list).reset_index()
df_temp.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,toy story (1995),"[Fantasy, Animation, Adventure, Comedy, Children]",336,pixar,1139045764
1,1,toy story (1995),"[Fantasy, Animation, Adventure, Comedy, Children]",474,pixar,1137206825
2,1,toy story (1995),"[Fantasy, Animation, Adventure, Comedy, Children]",567,fun,1525286013
3,2,jumanji (1995),"[Fantasy, Adventure, Children]",62,fantasy,1528843929
4,2,jumanji (1995),"[Fantasy, Adventure, Children]",62,magic board game,1528843932


In [20]:
for i in range(len(movies_to_genre['genres'])):
  movies_to_genre['genres'][i] = movies_to_genre['genres'][i][0]

movies_to_genre.head()

Unnamed: 0,title,genres
0,(500) days of summer (2009),"[Comedy, Romance, Drama]"
1,...and justice for all (1979),"[Thriller, Drama]"
2,10 cloverfield lane (2016),[Thriller]
3,10 things i hate about you (1999),"[Comedy, Romance]"
4,101 dalmatians (1996),"[Adventure, Comedy, Children]"


In [21]:
for i in range(len(movies_to_tag['tag'])):
  movies_to_tag['tag'][i] = movies_to_tag['tag'][i][0]

movies_to_tag.head()

Unnamed: 0,title,tag
0,(500) days of summer (2009),artistic
1,...and justice for all (1979),lawyers
2,10 cloverfield lane (2016),creepy
3,10 things i hate about you (1999),Shakespeare sort of
4,101 dalmatians (1996),dogs


In [22]:
for i in range(len(movies_to_genre['title'])):
  movies_to_genre['title'][i] = movies_to_genre['title'][i].lower()
movies_to_genre.head()

Unnamed: 0,title,genres
0,(500) days of summer (2009),"[Comedy, Romance, Drama]"
1,...and justice for all (1979),"[Thriller, Drama]"
2,10 cloverfield lane (2016),[Thriller]
3,10 things i hate about you (1999),"[Comedy, Romance]"
4,101 dalmatians (1996),"[Adventure, Comedy, Children]"


In [23]:
for i in range(len(movies_to_tag['title'])):
  movies_to_tag['title'][i] = movies_to_tag['title'][i].lower()
movies_to_tag.head()

Unnamed: 0,title,tag
0,(500) days of summer (2009),artistic
1,...and justice for all (1979),lawyers
2,10 cloverfield lane (2016),creepy
3,10 things i hate about you (1999),Shakespeare sort of
4,101 dalmatians (1996),dogs


In [24]:
def find_genre(movies):
  curr_genre = []
  for i in movies:
    done = False
    for j in range(len(movies_to_genre['title'])):
      if(i in movies_to_genre['title'][j]):
        done = True
        print(movies_to_genre['title'][j])
        for k in movies_to_genre['genres'][j]: curr_genre.append(k.lower())

    if(not done):
      movie_index = -1
      curr_similarity = -1
      for j in range(len(movies_to_genre['title'])):
        if(similar(movies_to_genre['title'][j],i) > curr_similarity):
          movie_index = j
          curr_similarity = similar(movies_to_genre['title'][j],i)

      print(movies_to_genre['title'][movie_index],'===')
      for k in movies_to_genre['genres'][movie_index]: curr_genre.append(k.lower()) 


  temp = []
  for gen in curr_genre:
    temp += list(rules_genre[rules_genre["antecedents"].apply(lambda x: gen in str(x).lower())].sort_values(ascending=False,by='lift')['consequents'])
  
  temp = list(set(temp))
  
  new_genre = []
  for i in range(min(4,len(temp))):
    curr = str(temp[i])
    curr = (curr[11: len(curr)-2])
    curr = list(map(str,curr.split(',')))
    for j in range(len(curr)):
      curr[j] = curr[j].strip()
      curr[j] = curr[j][1:len(curr[j])-1]
    for j in curr: new_genre.append(j.lower())
  
  curr_genre += new_genre
  
  cnt_genre = {}
  for i in curr_genre:
    if(i in cnt_genre): cnt_genre[i]+=1
    else: cnt_genre[i] = 1
  
  curr_genre = list(set(curr_genre))
  return curr_genre, cnt_genre

In [25]:
def find_tags(movies):
  
  curr_tags = []
  for i in movies:
    done = False
    for j in range(len(movies_to_tag['title'])):
      if(i in movies_to_tag['title'][j]):
        done = True
        # print(movies_to_tag['title'][j])
        # curr_tags.append(movies_to_tag['tag'][j].lower())
        tit = movies_to_tag['title'][j]
        id = title_to_id[tit]
        if(id in movies_to_tags): curr_tags += movies_to_tags[id]

    if(not done):
      movie_index = -1
      curr_similarity = -1
      for j in range(len(movies_to_tag['title'])):
        if(similar(movies_to_tag['title'][j],i) > curr_similarity):
          movie_index = j
          curr_similarity = similar(movies_to_tag['title'][j],i)

      # print(movies_to_tag['title'][movie_index],'=====')
      # curr_tags.append(movies_to_tag['tag'][movie_index].lower())
      tit = movies_to_tag['title'][movie_index]
      id = title_to_id[tit]
      if(id in movies_to_tags): curr_tags += movies_to_tags[id]


  input_tags = set(curr_tags)
  maximal_intersection = 0
  best_pred_tags = set()

  # iterating through rules

  for i in rules_tags.values:
      temp_intersection = 0
      for j in i[0]:
          if j in input_tags:
            temp_intersection += 1

      if(temp_intersection > maximal_intersection):
        maximal_intersection = temp_intersection
        best_pred_tags = set()
        for j in i[1]:
            best_pred_tags.add(j) 


  for i in best_pred_tags:
      input_tags.add(i)

  # temp = []
  # for tg in curr_tags:
  #   temp += list(rules_tags[rules_tags["antecedents"].apply(lambda x: tg in str(x).lower())].sort_values(ascending=False,by='lift')['consequents'])
  
  # temp = list(set(temp))

  new_tags = list(input_tags)

  
  # new_tags = []
  # for i in range(4):
  #   curr = str(temp[i])
  #   curr = (curr[12: len(curr)-3])
  #   new_tags.append(curr.lower())
  
  # curr_tags += new_tags
  
  cnt_tags = {}
  for i in curr_tags:
    if(i in cnt_tags): cnt_tags[i]+=1
    else: cnt_tags[i] = 1
  
  for i in new_tags:
    if(i not in cnt_tags): cnt_tags[i] = 1
  
  curr_tags += list(set(new_tags))
  curr_tags = list(set(curr_tags))
  return curr_tags, cnt_tags

In [26]:
print(find_tags(['godfather']))

(['quirky', 'francis ford coppola', 'in netflix queue', 'thought-provoking', 'classic', 'andy garcia', 'imdb top 250', 'action', 'al pacino', 'mafia'], {'mafia': 4, 'al pacino': 2, 'andy garcia': 1, 'classic': 1, 'francis ford coppola': 1, 'in netflix queue': 1, 'quirky': 1, 'thought-provoking': 1, 'imdb top 250': 1, 'action': 1})


In [27]:
def scoring(curr_genre, cnt_genre, curr_tags, cnt_tags, movies):
  #scoring part ->  +15 for same tag and +5 for same genre.
  
  df = movies_to_genre.merge(movies_to_tag, on='title', how = 'inner')
  df.head()
  
  scores = {}
  
  for i in range(len(df['title'])):
    sc = 0
    for j in df['genres'][i]:
      times = 0
      for k in curr_genre: 
        if(j.lower() in k): times+=cnt_genre[k]
      sc+=times * 5
    j = df['tag'][i]
    times = 0
    for k in curr_tags:
      if(j.lower() in k): 
        times += cnt_tags[k]
  
    sc+=times * 15
  
    for j in movies:
      sc += similar(j,df['title'][i]) * 20
    scores[df['title'][i]] = sc
  
  scores = dict(sorted(scores.items(), key=lambda item: -item[1]))
  return scores

In [28]:
def find_recommendation(scores, movies):
  ans = []
  
  for i in scores:
    can_take = 1
    for j in movies:
      if(similar(i,j) >= 0.9): can_take = 0
    if(can_take): ans.append(i)
    if(len(ans) == 4): break

  return ans

In [29]:
#read from csv files all the movies.

df_temp = pd.read_csv('sample_test.tsv',sep='\t')
df_temp.head()

movies = []
for i in range(len(df_temp)):
  temp = list(map(str,df_temp['movies'][i].split('\n')))
  movies.append(temp)

for i in range(len(movies)):
  for j in range(len(movies[i])):
    movies[i][j] = movies[i][j].lower()
 
movies

[['the godfather (1969)'],
 ['the dark knight (2008)', 'the dark knight rises (2012)'],
 ['jfk (1991)',
  'the file on thelma jordon (1950)',
  'a love song for bobby long (2004)'],
 ['bobby (1973)'],
 ['little miss broadway (1938)'],
 ['frankenstein meets the wolf man (1943)'],
 ['american movie (1999)',
  'collapse (2006)',
  'revenge of the green dragons (2014)',
  'pain & gain (2013)'],
 ['the mambo kings (1992)'],
 ['the joneses (2009)']]

In [30]:
recommendations = []
for i in range(len(movies)):
  curr_genre,cnt_genre = find_genre(movies[i])
  # print(curr_genre)
  # print(cnt_genre)
  curr_tags,cnt_tags = find_tags(movies[i])
  # print(curr_tags)
  # print(cnt_tags)
  scores = scoring(curr_genre,cnt_genre,curr_tags,cnt_tags,movies[i])
  recommendation = find_recommendation(scores, movies[i])
  print(movies[i])
  print(recommendation)
  recommendations.append(recommendation)
  # print("Scores =================")
  # print(scores)
  #now put this into a csv file according to output

godfather, the (1972) ===
['the godfather (1969)']
['out of sight (1998)', 'negotiator, the (1998)', 'beat the devil (1953)', 'charade (1963)']
dark knight, the (2008) ===
dark knight rises, the (2012) ===
['the dark knight (2008)', 'the dark knight rises (2012)']
['batman forever (1995)', 'batman (1989)', 'inception (2010)', 'negotiator, the (1998)']
jfk (1991)
on the waterfront (1954) ===
requiem for a dream (2000) ===
['jfk (1991)', 'the file on thelma jordon (1950)', 'a love song for bobby long (2004)']
['inception (2010)', '21 grams (2003)', 'out of sight (1998)', 'negotiator, the (1998)']
searching for bobby fischer (1993) ===
['bobby (1973)']
['out of sight (1998)', 'charade (1963)', 'femme nikita, la (nikita) (1990)', 'inception (2010)']
little voice (1998) ===
['little miss broadway (1938)']
['21 grams (2003)', 'charade (1963)', 'how to steal a million (1966)', 'out of sight (1998)']
elephant man, the (1980) ===
['frankenstein meets the wolf man (1943)']
['out of sight (1998)'

In [31]:
import csv
fieldnames = ['movies',"recommendation"]
rows = []

for i in range(len(movies)):
  r_names = ""
  for j in range(4):
    r_names += recommendations[i][j]
    if(j != 3): r_names += '\n'

  m_names = ""
  for j in range(len(movies[i])):
    m_names += movies[i][j]
    if(j != len(movies[i])-1): m_names += '\n'
  
  # print(m_names)
  # print(r_names)
  # print()
  temp = {fieldnames[0]: m_names, fieldnames[1]: r_names}
  rows.append(temp)

print(rows)

with open('output.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

[{'movies': 'the godfather (1969)', 'recommendation': 'out of sight (1998)\nnegotiator, the (1998)\nbeat the devil (1953)\ncharade (1963)'}, {'movies': 'the dark knight (2008)\nthe dark knight rises (2012)', 'recommendation': 'batman forever (1995)\nbatman (1989)\ninception (2010)\nnegotiator, the (1998)'}, {'movies': 'jfk (1991)\nthe file on thelma jordon (1950)\na love song for bobby long (2004)', 'recommendation': 'inception (2010)\n21 grams (2003)\nout of sight (1998)\nnegotiator, the (1998)'}, {'movies': 'bobby (1973)', 'recommendation': 'out of sight (1998)\ncharade (1963)\nfemme nikita, la (nikita) (1990)\ninception (2010)'}, {'movies': 'little miss broadway (1938)', 'recommendation': '21 grams (2003)\ncharade (1963)\nhow to steal a million (1966)\nout of sight (1998)'}, {'movies': 'frankenstein meets the wolf man (1943)', 'recommendation': 'out of sight (1998)\ncharade (1963)\nfemme nikita, la (nikita) (1990)\ninception (2010)'}, {'movies': 'american movie (1999)\ncollapse (200

Saving Models

In [32]:
import pickle
import os
filename1 = "genre"
filename2 = "tags"

# if os.path.exists(filename1):
#   with open(filename1,'rb') as file:
#     model = pickle.load(file)

with open(filename1,'wb+') as file:
  pickle.dump(rules_genre,file)

# if os.path.exists(filename2):
#   with open(filename2,'rb') as file:
#     model = pickle.load(file)

with open(filename2,'wb+') as file:
  pickle.dump(rules_tags,file)