I use HTMLParser from the Python standard library to scrape IMDB data. I examined the page source to find where the title, storyline and recommendations are located. <br>
Titles come after '<div class="title_wrapper" ...>' <br>
Storylines come after '<div ... id="titleStoryLine">' <br>
Recommendations come after '<div class='rec_item' ..>'

In [1]:
import requests
from html.parser import HTMLParser
import csv
import os
import math

class MyParser(HTMLParser):
    """Pull a title, a storyline and recommended-movie ids out of an HTML page.

    The parser works purely on source line numbers reported by ``getpos()``:

    * the title is the first non-blank text chunk that starts on or after the
      line of a ``<div>`` whose first attribute is ``class="title_wrapper"``;
    * the storyline is every text chunk that starts on the same line as the
      first attribute-less ``<span>`` found on a line after a ``<div>`` whose
      second attribute is ``id="titleStoryLine"``;
    * a recommendation is the value of the fourth attribute of every
      ``<div>`` whose first attribute is ``class="rec_item"``.
    """

    def __init__(self):
        self.title_begin = 0        # line of the title_wrapper div, 0 = not seen yet
        self.story_h_line = 0       # line of the titleStoryLine div, 0 = not seen yet
        self.story_begin_line = 0   # line of the storyline <span>, 0 = not seen yet
        self.rec_movies = []        # recommended ids, in document order
        self.storyline = ""
        self.title = ""
        super().__init__()

    def handle_starttag(self, tag, attrs):
        """Record the line numbers of the marker tags and collect recommendations.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``; the checks below are positional, exactly as the page
        markup is expected to order them.
        """
        line = self.getpos()[0]
        first = attrs[0] if len(attrs) > 0 else None
        if tag == 'div' and first is not None and first[0] == 'class' and first[1] == 'title_wrapper':
            self.title_begin = line
        if tag == 'span' and self.story_begin_line == 0 and self.story_h_line > 0 and line > self.story_h_line and len(attrs) == 0:
            self.story_begin_line = line
        if tag == 'div' and first is not None and first[0] == 'class' and first[1] == 'rec_item':
            # The id is read from the fourth attribute (presumably data-tconst;
            # confirm against the page markup). Skip items that carry fewer
            # attributes instead of aborting the whole parse with IndexError.
            if len(attrs) > 3:
                self.rec_movies.append(attrs[3][1])
        if tag == 'div' and len(attrs) > 1 and attrs[1][0] == 'id' and attrs[1][1] == 'titleStoryLine':
            self.story_h_line = line

    def get_title(self):
        """Return the captured title with surrounding whitespace removed."""
        return self.title.strip()

    def handle_data(self, data):
        """Accumulate storyline text and capture the first non-blank title text."""
        line = self.getpos()[0]
        # story_begin_line stays 0 until the storyline span is seen, and real
        # line numbers start at 1, so nothing is collected before that.
        if line == self.story_begin_line:
            self.storyline = self.storyline + data
        # Whitespace-only chunks strip to "", so the title stays unset and the
        # next chunk is tried.
        if self.title_begin > 0 and line >= self.title_begin and self.title == "":
            self.title = data.strip()

    def get_rec_movies(self):
        """Return the list of recommended movie ids collected so far."""
        return self.rec_movies

    def get_story_line(self):
        """Return the captured storyline with surrounding whitespace removed."""
        return self.storyline.strip()

# Preprocess
I use the punctuation list from the first project to strip punctuation, and I convert all letters to lowercase.

In [2]:
def preprocess(string):
    """Apply the module-level translation table `dic` to `string`, then lowercase it."""
    translated = string.translate(dic)
    return translated.lower()

## IMDB Scraping
Get the HTML text of each movie's page and scrape it with my HTML parser.

In [3]:
def get_movie_contents(imdb_id, timeout=30):
    """
    Gets an imdb id and returns its title, storyline, list of IMDB recommendations respectively.

    The page at https://www.imdb.com/title/<imdb_id> is downloaded and fed to
    MyParser. `timeout` (seconds) is handed to `requests.get`; without one a
    stalled connection would block the caller indefinitely.
    """
    response = requests.get('https://www.imdb.com/title/' + imdb_id, timeout=timeout)
    parser = MyParser()
    parser.feed(response.text)
    parser.close()
    return (parser.get_title(), parser.get_story_line(), parser.get_rec_movies())

## Tf-idf model

## Recommendation
I only use the values of the document vector that are higher than the threshold value. <br>
Returns a sorted list of IMDB ids.

In [4]:
def recommend(imdb_id):
    """
    Gets an imdb id and returns a list of recommended movie ids for that movie.

    Every id in `id_list` is scored by the dot product of its tf-idf vector
    with the query movie's vector, restricted to the query's above-threshold
    components. Ids are returned by descending score with the query removed.
    """
    similarity = []
    for candidate in id_list:
        dot = 0.0
        for dim in tf_idf_above_threshold[imdb_id]:  # only the query's kept components
            dot += tf_idf[candidate][dim] * tf_idf[imdb_id][dim]
        similarity.append(dot)
    # Sort positions rather than ids so equal scores keep their id_list order.
    order = sorted(range(N), key=lambda pos: similarity[pos], reverse=True)
    ranked = [id_list[pos] for pos in order]
    ranked.remove(imdb_id)  # the query movie itself is not a recommendation
    return ranked

## Evaluation

In [5]:
def evaluate_recommendations(rec_movie_ids, relevant_movie_ids, K):
    """
    Gets list of recommended and relevant movie ids and K value.

    Only the first K recommendations are scored. The inputs and the three
    metrics are printed as a side effect.

    Returns precision, recall, F1 values for K respectively. A metric whose
    denominator is zero is undefined and is reported as NaN: precision when
    the top-K slice is empty, recall when there are no relevant ids, and F1
    when precision + recall is zero.
    """
    rec_movie_ids = rec_movie_ids[:K] # Get top K
    print("Recommendation by program: ")
    print(rec_movie_ids)
    print("Recommendation by IMDB")
    print(relevant_movie_ids)
    recommended_hits = sum(1 for x in rec_movie_ids if x in relevant_movie_ids)
    relevant_hits = sum(1 for x in relevant_movie_ids if x in rec_movie_ids)
    # Guard the denominators: an empty slice (e.g. K == 0) or an empty
    # relevant list used to raise an uncaught ZeroDivisionError here.
    if len(rec_movie_ids) > 0:
        precision = recommended_hits / len(rec_movie_ids)
    else:
        precision = math.nan
    if len(relevant_movie_ids) > 0:
        recall = relevant_hits / len(relevant_movie_ids)
    else:
        recall = math.nan
    print("K : " + str(K))
    print("Precision : ", precision)
    print("Recall : ", recall)
    try:
        f1_score = 2.0*precision*recall/(precision+recall)
    except ZeroDivisionError as err: # If both precision and recall are zero then f1_score is undefined.
        print("Error : " ,err)
        f1_score = math.nan
    print("F1 Score : ", f1_score)
    return (precision,recall,f1_score)

# Threshold
I use a threshold. I chose the threshold value so that about 85% of the nonzero scores remain after eliminating the values below the threshold.

In [6]:

if not os.path.isfile('./movies.txt'): # Save IMDB data to use in future
    print("Handling IMDB data please wait...")
    # Scrape into a temporary file and rename it only after every movie was
    # written: a failed run must not leave a truncated movies.txt behind,
    # because later runs trust the cache as soon as the file exists.
    with open('./movie_ids.csv' , newline='') as id_file, \
            open('movies.txt.tmp' , 'w') as file:
        ids = csv.reader(id_file)
        for row in ids:
            (title, storyline, recommended) = get_movie_contents(row[0])
            # One record is exactly three lines (title, storyline, ids).
            # Collapse whitespace so an embedded newline cannot shift the
            # records that follow.
            file.write(' '.join(title.split()))
            file.write('\n')
            file.write(' '.join(storyline.split()))
            file.write('\n')
            for rec_id in recommended:
                file.write(rec_id + ' ')
            file.write('\n')
    # Both files are closed (and flushed) here, before the cache is published.
    os.replace('movies.txt.tmp', './movies.txt')
# Translation table for str.translate: every listed punctuation character is
# replaced by a single space.
dic = str.maketrans(dict.fromkeys('"!^%<+~*;:(?&}]|,\')-#`@/$_{.>[\\=', ' '))
id_list = [] # list of the ids of the movies
with open('./movie_ids.csv' , newline='') as id_file:
    ids = csv.reader(id_file)
    for row in ids:
        id_list.append(row[0])
N = 0 # number of complete movie records read from movies.txt
ground_truth = {} # Stores recommended movies by IMDB
df = {}
tf = {} # movie id -> {term: count}
vocabulary = set()
title_plus_storyline = {}
i = 0 # position inside the current record: 0 = title, 1 = storyline, 2 = ids
s = ''
# movies.txt holds three lines per movie, in the same order as movie_ids.csv.
# The handle is deliberately bound to `file`: rebinding that name releases any
# earlier handle still held under it before the data is read, and the `with`
# block closes this one afterwards.
with open('./movies.txt','r') as file:
    for line in file:
        id_ = id_list[N]
        if i == 0:
            s = line #Keep the title
            i = 1
        elif i == 1:
            title_plus_storyline[id_] = preprocess(s + ' ' + line) # Merge title and storyline and preprocess
            counts = {}
            for x in title_plus_storyline[id_].split(): # create term freqs
                counts[x] = counts.get(x, 0) + 1
            tf[id_] = counts
            vocabulary.update(counts)
            i = 2
        else:
            ground_truth[id_] = line.split() # save ground truths
            N = N + 1
            i = 0
# Document frequency: in how many movies each term occurs at least once.
for term_counts in tf.values():
    for term in term_counts:
        df[term] = df.get(term, 0) + 1
# Freeze the vocabulary into a list so every term gets a fixed vector index.
vocabulary = list(vocabulary)
# Inverse document frequency, base-10 logarithm of N / df.
idf = {word: math.log(float(N/df[word]) , 10) for word in vocabulary}

tf_idf = {} # movie id -> vector aligned with `vocabulary`
tf_idf_len = {} # movie id -> squared length of the vector before thresholding
threshold = 0.6 #Threshold value
tf_idf_above_threshold = {} # movie id -> indices of the components that were kept
for i in range(0,N):
    id_ = id_list[i]
    term_counts = tf[id_]
    vector = []
    squared_length = 0.0
    for word in vocabulary:
        count = term_counts.get(word)
        if count is None: # if a word isn't in the document then its tf_idf score is zero
            vector.append(0)
        else:
            score = (1.0 + math.log(count,10))*idf[word] # calculates scores
            vector.append(score)
            squared_length += score**2 #calculations for normalization
    tf_idf_len[id_] = squared_length
    # The length does not change while the vector is thresholded, so it is
    # computed once per movie instead of once per vocabulary entry.
    length = math.sqrt(squared_length) # length of the vector
    kept = [] # list of words that has higher score than threshold
    for j, value in enumerate(vector):
        if value > threshold:
            kept.append(j) # save the word
            # value > threshold > 0 implies length > 0, so this cannot divide by zero
            vector[j] = float(value/length) # normalized vector
        else:
            vector[j] = 0.0 # ignore values below threshold
    tf_idf[id_] = vector
    tf_idf_above_threshold[id_] = kept

In [7]:
# Rank all other movies against one query movie, then score the top K of that
# ranking against IMDB's own recommendations for several cut-offs.
movie_id = 'tt0068481'
rec = recommend(movie_id)
for K in (1, 2, 3, 10):
    evaluate_recommendations(rec, ground_truth[movie_id], K)


Recommendation by program: 
['tt0075334']
Recommendation by IMDB
['tt0067221', 'tt0068708', 'tt0067721', 'tt0067744', 'tt0067828', 'tt0069461', 'tt0072217', 'tt0073662', 'tt0080079', 'tt0067977', 'tt0070633', 'tt0080853']
K : 1
Precision :  0.0
Recall :  0.0
Error :  float division by zero
F1 Score :  nan
Recommendation by program: 
['tt0075334', 'tt0082406']
Recommendation by IMDB
['tt0067221', 'tt0068708', 'tt0067721', 'tt0067744', 'tt0067828', 'tt0069461', 'tt0072217', 'tt0073662', 'tt0080079', 'tt0067977', 'tt0070633', 'tt0080853']
K : 2
Precision :  0.0
Recall :  0.0
Error :  float division by zero
F1 Score :  nan
Recommendation by program: 
['tt0075334', 'tt0082406', 'tt0070297']
Recommendation by IMDB
['tt0067221', 'tt0068708', 'tt0067721', 'tt0067744', 'tt0067828', 'tt0069461', 'tt0072217', 'tt0073662', 'tt0080079', 'tt0067977', 'tt0070633', 'tt0080853']
K : 3
Precision :  0.0
Recall :  0.0
Error :  float division by zero
F1 Score :  nan
Recommendation by program: 
['tt0075334'