# List of Packages Required for Scraping and Analysis

In [1]:
import numpy as np
import pandas as pd
from scrapy.selector import Selector
from selenium import webdriver 
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import warnings
import re

import os
import requests
from PIL import Image
from io import BytesIO
from urllib.parse import urlparse
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

## Instantiate the data required in lists

In [2]:
# IMDb user IDs to scrape. The list may hold several IDs, but users are
# processed one at a time for better and more efficient results.
users_list = ['ur117926588']
# Other profiles that were used, with their rating counts:
# ur22171966 1910 ratings
# ur46592925 4 ratings
# ur62522856 338 ratings

# One accumulator list per scraped field; every list starts out empty.
user_id, record_list, error_msg = [], [], []
title_list, description_list, year_list = [], [], []
img_list, img_file_list = [], []
director_list, star_list = [], []
duration_list, advisory_list, genre_list = [], [], []
vote_list, movie_rating_list, user_rating_list = [], [], []

# Running identifier given to each scraped record, starting at 1.
record_id = 1

**The code below walks through the IMDb ratings pages of the given users and extracts the information required for our analysis.**

In [3]:
# Scrape every ratings page of each user in `users_list` and append one entry
# per rated title to the accumulator lists defined in the previous cell.
for user in range(len(users_list)):
    print("Scraping for User {}".format(user+1))
    current_user = users_list[user]
    # NOTE(review): the driver path is passed positionally, exactly as before;
    # newer Selenium releases may expect a Service object instead — confirm
    # against the installed Selenium version.
    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        url = 'https://www.imdb.com/user/{}/ratings'.format(current_user)
        time.sleep(1)
        driver.get(url)
        time.sleep(1)

        sel = Selector(text=driver.page_source)
        # First token of the list-length text, with thousands separators removed.
        num_of_ratings = int(
            sel.css(".lister-list-length span::text").extract_first().replace(',', '').split(' ')[0]
        )
        # Upper bound on the pages to visit; the loop below stops earlier as
        # soon as the page offers no "next" link.
        rating_pages = num_of_ratings // 100 + 1

        # The poster URLs are read from a plain HTTP copy of the same page.
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')

        for _ in range(rating_pages):
            sel = Selector(text=driver.page_source)
            reviews = driver.find_elements(By.CSS_SELECTOR, 'div.lister-item.mode-detail')

            # filter out the images that have a title attribute with the value "list image"
            image_tags = soup.find_all(
                'img',
                {'class': 'loadlate', 'title': lambda value: value is None or value != 'list image'},
            )
            # Poster URLs of this page, matched to the reviews by position.
            page_img_urls = [img_tag.get('loadlate', np.nan) for img_tag in image_tags]

            for position, review in enumerate(tqdm(reviews)):

                try:
                    sel2 = Selector(text=review.get_attribute('innerHTML'))

                    ## Get movie title
                    try:
                        title = sel2.css('.lister-item-content a::text').extract_first().strip()
                        # NOTE(review): the second link text is treated as an
                        # episode name; if it is missing the title becomes NaN
                        # (behaviour kept from the original) — confirm intent.
                        episode = sel2.css('.lister-item-content a::text').getall()[1].strip()
                        if episode != "":
                            title += (" - " + episode)
                    except Exception:
                        title = np.nan

                    # Certificate and runtime are needed both to locate the
                    # description paragraph and as fields of their own.
                    advisory = sel2.css('.certificate::text').extract_first()
                    duration = sel2.css('.runtime::text').extract_first()

                    ## Get movie description
                    try:
                        # The index of the description text node shifts by two
                        # for each of certificate/runtime that is present.
                        if advisory is None and duration is None:
                            description = sel2.css('p::text').getall()[3].strip()
                        elif advisory is None or duration is None:
                            description = sel2.css('p::text').getall()[5].strip()
                        else:
                            description = sel2.css('p::text').getall()[7].strip()
                    except Exception:
                        description = np.nan

                    ## Get movie year
                    try:
                        year = sel2.css('.lister-item-year.text-muted.unbold::text').extract_first().strip().replace('(', '').replace(')', '')
                        # Drop letters and whitespace from the year text.
                        year = re.sub(r'[a-zA-Z\s]+', '', year)
                    except Exception:
                        year = np.nan

                    ## Get directors and staff
                    try:
                        staff = sel2.css('.text-muted a::text').getall()
                        text = sel2.css('.text-muted.text-small::text').getall()
                        text2 = [token.strip() for token in text]
                        commas = text2.count(',')
                        stars_index = text2.index("Stars:")
                        # Commas after the "Stars:" label separate the stars,
                        # so there is one more star than there are commas.
                        star_commas = sum(
                            1 for token in text2[stars_index:len(text2) - 1] if token == ','
                        )
                        stars = staff[-(star_commas + 1):]
                        # if directors are recorded
                        if "Director:" in text2 or "Directors:" in text2:
                            directors = staff[:(commas - star_commas) + 1]
                        else:
                            directors = ""
                    except Exception:
                        stars = np.nan
                        directors = np.nan

                    ## Get movie duration
                    duration = duration.strip() if duration is not None else np.nan

                    ## Get viewer advisory
                    advisory = advisory.strip() if advisory is not None else np.nan

                    ## Get Genre
                    try:
                        genre = sel2.css('.genre::text').extract_first().strip()
                    except Exception:
                        genre = np.nan

                    ## Get votes
                    try:
                        votes = sel2.css('.text-muted.text-small span::text').getall()[-1]
                        votes = int(votes.replace(',', '').split(' ')[0])
                    except Exception:
                        votes = np.nan

                    ## Get movie rating
                    try:
                        movie_rating = sel2.css('.ipl-rating-star__rating::text').getall()[0]
                        movie_rating = float(movie_rating.replace(',', '').split(' ')[0])
                    except Exception:
                        movie_rating = np.nan

                    ## Get user rating
                    try:
                        user_rating = sel2.css('.ipl-rating-star__rating::text').getall()[1]
                        user_rating = int(user_rating.replace(',', '').split(' ')[0])
                    except Exception:
                        user_rating = np.nan

                    # File name the poster is stored under: "<user>_<record>.jpg".
                    img_name = f"{current_user}_{record_id}.jpg"

                    # Missing poster URLs become NaN so every list keeps one
                    # entry per record.
                    if position < len(page_img_urls):
                        img_url = page_img_urls[position]
                    else:
                        img_url = np.nan

                    # Append to all lists together so they stay the same length.
                    user_id.append(current_user)
                    title_list.append(title)
                    description_list.append(description)
                    img_list.append(img_url)
                    year_list.append(year)
                    director_list.append(directors)
                    star_list.append(stars)
                    duration_list.append(duration)
                    advisory_list.append(advisory)
                    genre_list.append(genre)
                    vote_list.append(votes)
                    movie_rating_list.append(movie_rating)
                    user_rating_list.append(user_rating)
                    img_file_list.append(img_name)
                    record_list.append(record_id)
                    record_id += 1

                except Exception as e:
                    error_msg.append(e)

            next_page_url = sel.css("a.flat-button.lister-page-next.next-page::attr(href)").extract_first()
            if next_page_url is None:
                # Stop here; staying on this page would scrape it a second time.
                print("No more pages to browse")
                break

            full_next_page_url = "https://www.imdb.com" + next_page_url
            try:
                driver.get(full_next_page_url)
                response = requests.get(full_next_page_url, timeout=30)
                soup = BeautifulSoup(response.content, 'html.parser')
            except Exception as e:
                error_msg.append(e)
                print(f"Stopping pagination, could not load the next page: {e}")
                break
    finally:
        # Always close the browser, even when scraping fails part-way.
        driver.quit()

Scraping for User 1


100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 107.48it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 90.70it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 84.08it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 86.70it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 84.34it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 91.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 98.31it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 93.90it/s]
100%|███████████████████████████████████

No more pages to browse





**Storing the data into a dataframe and exporting it to a csv file.**

In [4]:
# Collect every accumulator list into a single dataframe, one column per
# scraped field and one row per record.
rating_df = pd.DataFrame(dict(
    UserID=user_id,
    record_id=record_list,
    Title=title_list,
    Img_Path=img_list,
    Img_File_Name=img_file_list,
    Year=year_list,
    Description=description_list,
    Directors=director_list,
    Stars=star_list,
    Viewer_Advisory=advisory_list,
    Duration=duration_list,
    Genre=genre_list,
    Votes=vote_list,
    Movie_Rating=movie_rating_list,
    User_Rating=user_rating_list,
))

In [5]:
# Write the ratings table to "<first user id>_ratings.csv", leaving out the
# dataframe index.
rating_df.to_csv(f"{user_id[0]}_ratings.csv", index=False)

## Download the Images to the folder 

In [6]:
##image downloading
def download_image(url, save_path, timeout=30):
    """Download the image at `url` and store it as a JPEG file at `save_path`.

    This is a best-effort helper: any failure (network error, HTTP error
    status, unreadable image data, write error) is printed and swallowed so a
    single bad poster does not stop a batch download.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    save_path : str
        Destination file path; the image is always written in JPEG format.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    None
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        img = Image.open(BytesIO(response.content))
        # JPEG cannot store alpha or palette images (e.g. RGBA / P mode PNGs),
        # so convert those to RGB before saving.
        if img.mode not in ("L", "RGB", "CMYK"):
            img = img.convert("RGB")

        img.save(save_path, format="JPEG")
    except Exception as e:
        print(f"Error downloading image from {url}: {e}")

# Folder for downloading.
# The name is derived from the dataframe rather than from `user_id[0]`: the
# loop below rebinds `user_id` to a plain string, so indexing it on a re-run
# of this cell would yield only the first character of the ID.
img_folder = f"{rating_df['UserID'].iloc[0]}_downloaded_images"
os.makedirs(img_folder, exist_ok=True)

for idx, row in rating_df.iterrows():
    # NOTE: `user_id` is deliberately rebound here; the later cells build their
    # folder and file names from `user_id` as a plain string.
    user_id = row['UserID']
    img_url = row['Img_Path']
    record_id = row["record_id"]

    # Rows without a poster URL have nothing to download.
    if pd.isna(img_url):
        print(f"Skipping record {record_id}: no image URL")
        continue

    # Same "<user>_<record>.jpg" naming as the Img_File_Name column.
    img_filename = f"{user_id}_{record_id}.jpg"
    img_save_path = os.path.join(img_folder, img_filename)

    # Download and save the image
    download_image(img_url, img_save_path)

## Image Analysis using KMeans Clustering (3 main colours and Brightness)

In [7]:
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import pandas as pd
import webcolors

def get_main_colors(image, k=3, random_state=42):
    """Return the `k` main colours of an image as KMeans cluster centres.

    Parameters
    ----------
    image : numpy.ndarray
        Image in BGR channel order (it is converted with COLOR_BGR2RGB).
    k : int, optional
        Number of colour clusters to fit.
    random_state : int or None, optional
        Seed for KMeans so repeated runs give the same colours; pass None for
        an unseeded run.

    Returns
    -------
    numpy.ndarray
        The fitted `cluster_centers_`, one RGB row per cluster. The rows are
        in the order KMeans returns them, not sorted by cluster size.
    """
    # Flatten the image to one RGB row per pixel.
    rgb_pixels = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).reshape(-1, 3)

    # n_init is set explicitly so the result does not depend on the default of
    # the installed scikit-learn version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    kmeans.fit(rgb_pixels)

    return kmeans.cluster_centers_

def get_brightness(image):
    """Return the mean brightness of a BGR image.

    The image is converted to RGB and each pixel is weighted as
    0.299 * R + 0.587 * G + 0.114 * B; the result is the mean over all pixels.
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Slice each channel out while keeping the trailing axis.
    red, green, blue = (rgb[..., channel:channel + 1] for channel in range(3))
    weighted = 0.299 * red + 0.587 * green + 0.114 * blue
    return np.mean(weighted)

def analyze_image(image_path):
    """Read one image and return its three main colours and its brightness.

    Returns a tuple `(color1, color2, color3, brightness)` where the colours
    are hex strings produced by `webcolors.rgb_to_hex`. If the file cannot be
    read, an error is printed and `(None, None, None, None)` is returned.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Unable to read image '{image_path}'")
        return None, None, None, None

    centers = get_main_colors(image)
    # Cluster centres are floats; cast to int before the hex conversion.
    color1, color2, color3 = (
        webcolors.rgb_to_hex(centers[rank].astype(int)) for rank in range(3)
    )
    return color1, color2, color3, get_brightness(image)

# Analyse every file in the folder created by the download cell. Reusing
# `img_folder` avoids rebuilding the name from `user_id`, which only works
# while `user_id` happens to be bound to a plain string.
folder_path = img_folder
# Sorted so the rows of the resulting table come out in a stable order.
image_files = sorted(os.listdir(folder_path))

# One entry per successfully analysed image.
image_names = []
color1_list = []
color2_list = []
color3_list = []
brightness_list = []

for image_file in image_files:
    image_path = os.path.join(folder_path, image_file)
    color1, color2, color3, brightness = analyze_image(image_path)

    # analyze_image returns None values for files it could not read.
    if color1 is None or brightness is None:
        print(f"Skipping '{image_file}'\n")
        continue

    image_names.append(image_file)
    color1_list.append(color1)
    color2_list.append(color2)
    color3_list.append(color3)
    brightness_list.append(brightness)

In [8]:
# Table of the per-image results: file name, three main colours, brightness.
df = pd.DataFrame(dict(
    Img_File_Name=image_names,
    Color1=color1_list,
    Color2=color2_list,
    Color3=color3_list,
    Brightness=brightness_list,
))

# Save it next to the ratings file as "<user id>_image_analysis.csv".
df.to_csv(f"{user_id}_image_analysis.csv", index=False)

### Merge the ratings and image-analysis data into one final CSV file for the user

In [9]:
# Attach the colour and brightness columns to the user's ratings file.

# Columns kept in the final file, in output order.
ratings_columns = [
    "UserID", "Title", "Img_Path", "Img_File_Name", "Year", "Description",
    "Directors", "Stars", "Viewer_Advisory", "Duration", "Genre", "Votes",
    "Movie_Rating", "User_Rating",
]
colors_columns = ['Color1', 'Color2', 'Color3', 'Brightness']

# Load the two CSV files written by the earlier cells.
df_ratings = pd.read_csv(f'{user_id}_ratings.csv')
df_colors = pd.read_csv(f"{user_id}_image_analysis.csv")

# Join on the image file name; only rows present in both files are kept.
merged_df = df_ratings.merge(df_colors, on='Img_File_Name')

# Keep the selected columns and save them to a new file.
final_merged = merged_df[ratings_columns + colors_columns]
final_merged.to_csv(f'{user_id}_img_ratings.csv', index=False, encoding="utf-8-sig")