# LIBRARY

In [11]:
!pip install pyspark



In [12]:
import pyspark
import pandas as pd
import numpy as np
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from google.colab import drive
import time
from pyspark.sql import SQLContext
drive.mount('/content/drive')
import collections
from pyspark.mllib.linalg import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
from pyspark.mllib.linalg import SparseVector
from scipy.spatial import distance
import json

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#LOAD DATASET

In [13]:
spark = SparkSession.builder.appName('MOVIES_RECOMMENDATION_SYSTEM').getOrCreate()
# load data 
smd = spark.read.csv('/content/drive/MyDrive/BIGDATA/DOAN/NEWDATASET/movies_metadata.csv', header = True, inferSchema = True)
smd.printSchema()
begin1 = time.time()

root
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nu

# DATA PREPROCESSING

In [14]:
smd = smd.withColumn("id", smd.id.cast("int"))
smd.printSchema()

root
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (n

In [15]:
# smd = smd.fillna({'tagline':'', 'overview':"", 'keywords':'', 'cast':'', 'director':'', 'genres':''})
smd = smd.fillna({'tagline':'', 'overview':""})
smd.show()

+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+--------------------+--------------------+--------+--------------------+-----------------+
|adult|belongs_to_collection|  budget|              genres|            homepage|   id|  imdb_id|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|        release_date|             revenue|             runtime|    spoken_languages|  status|             tagline|               title|   video|        vote_average|       vote_count|
+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+-----

In [16]:
from pyspark.sql.functions import concat, lit
# col_list = ['tagline','overview','keywords','cast', 'director', 'genres']
col_list = ['tagline','overview']
smd = smd.withColumn('description',concat(*col_list))
smd.show()

+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+--------------------+--------------------+--------+--------------------+-----------------+--------------------+
|adult|belongs_to_collection|  budget|              genres|            homepage|   id|  imdb_id|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|        release_date|             revenue|             runtime|    spoken_languages|  status|             tagline|               title|   video|        vote_average|       vote_count|         description|
+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+----------------

# TF-IDF VECTORIZER

In [17]:
from pyspark.ml.feature import CountVectorizer,RegexTokenizer

regexTokenizer = RegexTokenizer(inputCol="description", outputCol="words", pattern="\\W")
wordsData = regexTokenizer.transform(smd)
#n-gram
# from pyspark.ml.feature import NGram

# ngram = NGram(n=2, inputCol="words", outputCol="ngrams")

# ngramDataFrame = ngram.transform(wordsData)
# ngramDataFrame.select("ngrams").show()

hashingTF = HashingTF(inputCol="words", outputCol="tfidf", numFeatures=1000)
featurizedData = hashingTF.transform(wordsData)

# calculate scores
idf = IDF(inputCol="tfidf", outputCol="tfidf_features")
idf_model = idf.fit(featurizedData)
wordsData = idf_model.transform(featurizedData)
wordsData.show()
end1 = time.time() - begin1

+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+--------------------+--------------------+--------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|adult|belongs_to_collection|  budget|              genres|            homepage|   id|  imdb_id|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|        release_date|             revenue|             runtime|    spoken_languages|  status|             tagline|               title|   video|        vote_average|       vote_count|         description|               words|               tfidf|      tfidf_features|
+---

# COSINE SIMILARITY

In [18]:
def cosin(x, y):
  return x.dot(y)/(x.norm(2)*y.norm(2))

def recommend(y):
  b = wordsData.rdd.map(lambda x: (x["tfidf_features"], x['original_title']))
  result = b.map(lambda x:([cosin(x[0],y)], x[1]))
  return result.takeOrdered(11, key = lambda x: -x[0][0])

In [19]:
def get_recommendations(title):
  movies = wordsData.where(f"UPPER(original_title) like UPPER('%{title}%')").select(["original_title","id","tfidf_features"]).distinct().collect()
  if len(movies) > 0:
    print(f'Có {len(movies)} bộ phim được tìm thấy')
  for i,m in enumerate(movies):
      print(f"{i+1} --- {m.original_title}")

  id_select = int(input("Chọn một bộ phim: "))
  movie = movies[id_select - 1]

  print(f"Phim được chọn: {movie[0]}\n----------------------------\n Phim được đề xuất")
  begin2 = time.time()
  recommend_result = recommend(movie[2])
  end2 = time.time() - begin2
  print("Time:",end1 + end2)

  recommend_result = recommend_result[1:11]

  result_df = pd.DataFrame(recommend_result,columns=['Similarity','Movies'])
  return result_df
  # return recommend_result

# RESULT: GET MOVIES RECOMMENDATION - HARRY POTTER

In [20]:
get_recommendations('Harry Potter')

Có 8 bộ phim được tìm thấy
1 --- Harry Potter and the Goblet of Fire
2 --- Harry Potter and the Deathly Hallows: Part 2
3 --- Harry Potter and the Chamber of Secrets
4 --- Harry Potter and the Philosopher's Stone
5 --- Harry Potter and the Half-Blood Prince
6 --- Harry Potter and the Order of the Phoenix
7 --- Harry Potter and the Prisoner of Azkaban
8 --- Harry Potter and the Deathly Hallows: Part 1
Chọn một bộ phim: 8
Phim được chọn: Harry Potter and the Deathly Hallows: Part 1
----------------------------
 Phim được đề xuất
Time: 23.140868425369263


Unnamed: 0,Similarity,Movies
0,[0.38392630701526054],Harry Potter and the Goblet of Fire
1,[0.3211127291549875],Darling Companion
2,[0.31715947661047517],Harry Potter and the Deathly Hallows: Part 2
3,[0.3087703195646179],Below Sea Level
4,[0.2992543434890379],نان و کوچه‎‎
5,[0.28980883871326335],Harry Potter and the Order of the Phoenix
6,[0.2785950868427183],싸이보그지만 괜찮아
7,[0.2780995639190203],The King and I
8,[0.2663755240802338],Graduation
9,[0.26318088688399727],The Dog Problem
