In [1]:
# Import required libraries
import numpy as np
import pandas as pd

from random import sample 

In [2]:
# Getting rid of scientific-notation
pd.options.display.float_format = '{:.0f}'.format

In [3]:
# Read and convert all csv into dataframes
movies = pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')
ratings = pd.read_csv('ratings.csv')
links = pd.read_csv('links.csv')

# Data Visualisation & Pre-Processing

### movies

In [4]:
movies.sample(5)

Unnamed: 0,movieId,title,genres
3448,4703,Chocolat (1988),Drama
968,1269,Arsenic and Old Lace (1944),Comedy|Mystery|Thriller
9064,142372,Our Brand Is Crisis (2015),Comedy|Drama
7658,88593,"Yellow Sea, The (a.k.a. The Murderer) (Hwangha...",Crime|Drama|Thriller
6737,59141,Son of Rambow (2007),Children|Comedy|Drama


* we see that there are three features or columns in movies 
* movieId, title, genres

* movieID : according to the description mentioned
*** Only movies with at least one rating or tag are included in the dataset.
    These movie ids are consistent with those used on the MovieLens web site
    (e.g., id `1` corresponds to the URL <https://movielens.org/movies/1>).
    Movie ids are consistent between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv`
    (i.e., the same id refers to the same movie across these four data files). ***
* So corresponding to each ID we have a url which directs us to a page where info about that movie is present
* So there are a total of 9742 movies with ID's ranging from (1 - 193609)

* Title : It includes the name of the movie along with the year of airing in ()

* genres : It has multiple genres (pipe-separated list)
* According to documentation, 
*** Action, Adventure, Animation, Children's, Comedy, Crime, Documentary, Drama, Fantasy,
    Film-Noir, Horror, Musical, Mystery, Romance, Sci-Fi, Thriller, War, Western, (no genres listed) ***

#### I would like to split the column title into two columns title and year

In [5]:
# Splitting function
Title = []
Year = []

def Title_Year(title):
    Title.append(title[:-7])
    Year.append(title[-5:-1:1])

In [6]:
# Splitting to seperate lists
movies['title'].apply(Title_Year)

# Checking
print(sample(Title, 3), sample(Year, 3))

['Bubba Ho-tep', 'Captains Courageous', 'Impossible, The (Imposible, Lo)'] ['1945', '1969', '1966']


In [7]:
# Dropping old title column
movies = movies.drop(columns = 'title')

# Adding two new columns
movies['Title'] = Title
movies['Year'] = Year

In [8]:
# Converter for coverting
# Genre from pipe seperated to a list
def convert(line):
    return line.split('|')

In [9]:
# Conversion
movies['genres'] = movies['genres'].apply(convert)

In [10]:
# Checking
movies['genres'].sample(5)

9166      [Crime, Horror]
2148      [Comedy, Drama]
3672        [Documentary]
5709    [Drama, Thriller]
1329           [Thriller]
Name: genres, dtype: object

In [11]:
# Checking
movies.sample(5)

Unnamed: 0,movieId,genres,Title,Year
699,917,"[Children, Drama]","Little Princess, The",1939
6493,53143,"[Action, Thriller]",Fay Grim,2006
5067,7943,"[Crime, Film-Noir]","Killers, The",1946
3525,4816,[Comedy],Zoolander,2001
3268,4424,[Drama],"Garden of the Finzi-Continis, The (Giardino de...",1970


In [12]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   genres   9742 non-null   object
 2   Title    9742 non-null   object
 3   Year     9742 non-null   object
dtypes: int64(1), object(3)
memory usage: 304.6+ KB


In [13]:
movies.describe()

Unnamed: 0,movieId
count,9742
mean,42200
std,52160
min,1
25%,3248
50%,7300
75%,76232
max,193609


In [14]:
# Amount of missing data
print("Percentage of missing values:")
print(((movies.isna().sum()) / movies.shape[0]) * 100)

Percentage of missing values:
movieId   0
genres    0
Title     0
Year      0
dtype: float64


* No missing data 

### tags

In [15]:
tags.sample(5)

Unnamed: 0,userId,movieId,tag,timestamp
3300,567,170945,Suspenseful,1525286501
1816,474,3536,priest,1137181376
2536,477,1089,neo-noir,1242494879
397,62,184471,video game adaptation,1528024898
3327,573,2116,classic,1186588944


In [16]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [17]:
tags.describe()

Unnamed: 0,userId,movieId,timestamp
count,3683,3683,3683
mean,431,27252,1320031967
std,158,43491,172102450
min,2,1,1137179352
25%,424,1262,1137521216
50%,474,4454,1269832564
75%,477,39263,1498456766
max,610,193565,1537098603


* we see that there are four features or columns in tags 
* userId, movieId, tag, timestamp

* userID : according to the description 
*** MovieLens users were selected at random for inclusion. Their ids have been anonymized.
    User ids are consistent between `ratings.csv` and `tags.csv`
    (i.e., the same id refers to the same user across the two files).***
    
* tag : according to the description (Comment)
*** Tags are user-generated metadata about movies.
    Each tag is typically a single word or short phrase.
    The meaning, value, and purpose of a particular tag is determined by each user. ***

* timestamp : according to the description
*** Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970. ***

In [18]:
# The timestamp column seems redundant so we can just drop it
tags = tags.drop(columns = 'timestamp')

In [19]:
# Checking
print(tags.columns)

Index(['userId', 'movieId', 'tag'], dtype='object')


In [20]:
# Amount of missing data
print("Percentage of missing values:")
print(((tags.isna().sum()) / tags.shape[0]) * 100)

Percentage of missing values:
userId    0
movieId   0
tag       0
dtype: float64


* No missing data 

### ratings

In [21]:
ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
76073,479,842,2,1039362294
15133,97,3717,4,1043382867
62528,414,529,4,961517293
11010,68,4681,3,1269123495
96373,603,1201,5,963177205


In [22]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [23]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836,100836,100836,100836
mean,326,19435,4,1205946087
std,183,35531,1,216261036
min,1,1,0,828124615
25%,177,1199,3,1019123866
50%,325,2991,4,1186086662
75%,477,8122,4,1435994144
max,610,193609,5,1537799250


* we see that there are four features or columns in ratings 
* userId, movieId, rating, timestamp

* rating : according to the description
*** Ratings are made on a 5-star scale,
    with half-star increments (0.5 stars - 5.0 stars). ***

In [24]:
# The timestamp column seems redundant so we can just drop it
ratings = ratings.drop(columns = 'timestamp')

In [25]:
# Checking
print(ratings.columns)

Index(['userId', 'movieId', 'rating'], dtype='object')


In [26]:
# Amount of missing data
print("Percentage of missing values:")
print(((ratings.isna().sum()) / ratings.shape[0]) * 100)

Percentage of missing values:
userId    0
movieId   0
rating    0
dtype: float64


* No missing data 

### links

In [27]:
links.sample(5)

Unnamed: 0,movieId,imdbId,tmdbId
9542,172637,219263,74535
8741,127180,2044056,125336
7471,82041,1316536,46420
3390,4613,97637,10345
7494,82848,11541,23282


In [28]:
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [29]:
links.describe()

Unnamed: 0,movieId,imdbId,tmdbId
count,9742,9742,9734
mean,42200,677184,55162
std,52160,1107228,93653
min,1,417,2
25%,3248,95181,9666
50%,7300,167260,16529
75%,76232,805568,44206
max,193609,8391976,525662


* we see that there are four features or columns in links 
* userId, movieId, imdbId, tmdbId

* imdbId : according to the description
*** imdbId is an identifier for movies used by <http://www.imdb.com>.
    E.g., the movie Toy Story has the link <http://www.imdb.com/title/tt0114709/>. ***
    
* tmdbId : according to the description
*** tmdbId is an identifier for movies used by <https://www.themoviedb.org>.
    E.g., the movie Toy Story has the link <https://www.themoviedb.org/movie/862>. ***

In [30]:
# Amount of missing data
print("Percentage of missing values:")
print(((links.isna().sum()) / links.shape[0]) * 100)

Percentage of missing values:
movieId   0
imdbId    0
tmdbId    0
dtype: float64


* No missing data 

#### Web-Crawling
* Based on the imdbId I would like to add few more features which 
  I get by web-crawling (Automation)
* Features I would like to add are:
* top stars which must include hero and heroine
* Director
* An overview or summary which has useful keywords which will help me
  later in content-collaboration

In [32]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

* We will be using selenium the reasons are mentioned in the report

In [33]:
# Define the path to the Chrome WebDriver executable
webdriver_path = r"C:\Users\rahul\Downloads\chromedriver\chromedriver-win64\chromedriver.exe"

# Create a service object for ChromeDriver
service = Service(webdriver_path)

# Start the WebDriver service
service.start()

# Create a Chrome WebDriver instance
driver = webdriver.Chrome(service = service)

for j in range(0, len(links['imdbId'])):
    # website link
    web = 'https://www.imdb.com/title/tt'
    z = 7 - len(str(links['imdbId'][j]))
    web += '0' * z + str(links['imdbId'][j]) + '/'
    
    # Navigate to the IMDb URL
    driver.get(web)
    
    meta_description = driver.find_elements(by = 'xpath', value =
                                    '//meta[@name="description"]')
    
    description_content = meta_description[0].get_attribute("content")
        
# Close the WebDriver session
driver.quit()

# Stop the WebDriver service
service.stop()

* One thing I want to do here is in the overview instead of getting all the words I want to get keywords only
* Meaning I will try to remove as many stopwords as possible
* Further details are mentioned in report

In [173]:
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [186]:
# Objectives
Director = []
Stars = []
Keywords = []

In [187]:
# Filling objectives based on the requirements
for j in content:
    first_dot = j.find(" With")
    second_dot = j.find(".", first_dot)
    while(j[second_dot - 2] + j[second_dot - 1] == 'Jr'):
        second_dot = j.find(".", second_dot + 1)
        
    third_dot = j.find(".", second_dot + 1)

    d = j[:first_dot - 1]
    s = j[first_dot + 1:second_dot]
    o = j[second_dot + 2:-2]

    i_ = d.find('Directed by')

    Director.append(d[i_:][12:].split(', '))
    Stars.append(s[5:].split(', '))

    o = o.lower()
    s = ''

    for i in o:
        if(i == ' ' or ord(i) in range(97,123)):
            s += i
        if(i == '-'):
            s += ' '
        
    total_words = s.split()

    k = []
    for each_word in total_words:
        if each_word not in stop_words:
            k.append(each_word)

    Keywords.append(k)

## Merging
* For content based we will only be requiring movies and links along with the scraped data
* So first add three new columns Director, Stars and Keywords in links dataframe
* Later just concat links and movies but drop one of the moviesId as they are arranged in same order and make a new final df dataframe

In [192]:
# Adding scraped data
links['Director'] = Director
links['Stars'] = Stars
links['Keywords'] = Keywords

In [30]:
# Removing moviesId column
del links[links.columns[0]]

In [193]:
# Checking
links.sample(5)

Unnamed: 0,imdbId,tmdbId,Director,Stars,Keywords
658,117918,10478,[Ron Shelton],"[Kevin Costner, Rene Russo, Don Johnson, Cheec...","[washed, golf, pro, working, driving, range, t..."
5333,356618,10145,[Joseph Ruben],"[Julianne Moore, Christopher Kovaleski, Matthe...","[told, children, never, existed, man, woman, s..."
8272,2278871,152584,[Abdellatif Kechiche],"[Léa Seydoux, Adèle Exarchopoulos, Salim Kechi...","[adles, life, changed, meets, emma, young, wom..."
6214,425055,80919,[Michael Hoffman],"[Michael Keaton, Robert Downey Jr., Ari Grayno...","[combining, real, fictional, events, movie, ce..."
7353,1075747,20533,[Jimmy Hayward],"[Josh Brolin, John Malkovich, Megan Fox, Micha...","[us, military, makes, scarred, bounty, hunter,..."


In [194]:
# Final dataframe via concatination
df = pd.concat([movies, links], axis=1)

In [206]:
# Visualisation
print(df)

      movieId                                             genres  \
0           1  [Adventure, Animation, Children, Comedy, Fantasy]   
1           2                     [Adventure, Children, Fantasy]   
2           3                                  [Comedy, Romance]   
3           4                           [Comedy, Drama, Romance]   
4           5                                           [Comedy]   
...       ...                                                ...   
9737   193581               [Action, Animation, Comedy, Fantasy]   
9738   193583                       [Animation, Comedy, Fantasy]   
9739   193585                                            [Drama]   
9740   193587                                [Action, Animation]   
9741   193609                                           [Comedy]   

                                   Title  Year   imdbId  tmdbId  \
0                              Toy Story  1995   114709     862   
1                                Jumanji  1995   