# BIA-660 Web Mining Project
## Project Title - Movie Recommendation System
### Objective:
We aim to build a movie recommendation system that learns from a user's watch history and recommends movies accordingly. Instead of asking users to state their preferences, we will infer this information from their watch history.
#### Dataset for our project will be collected from website: https://www.allmovie.com
![alt text](https://upload.wikimedia.org/wikipedia/commons/c/c7/Allmovie_Logo.png "Logo Title Text 1")
This website provides comprehensive movie information, including reviews, ratings, biographies, etc. We will look at more granular data while scraping the dataset: apart from genres, we will consider keywords, themes, and reviews, which will be used in multilevel scraping for more accurate prediction.



## Step-1: Scraping genre names and their links
* Find element through css selector using BeautifulSoup
* Fetch the name and its corresponding href link and store it in the dictionary
* Iterate through the dictionary items and store in the .csv file (genres.csv)
* Git Link: [genres.csv](https://github.com/athiban94/movie-recommendation/blob/master/genres.csv)


In [None]:
import requests
from bs4 import BeautifulSoup
import csv

headers = { 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.'
                          '86 Safari/537.36'}

# site url
siteURL = "https://www.allmovie.com"

if __name__ == "__main__":

    # Fetch the genre index page
    page = requests.get("https://www.allmovie.com/genres", headers=headers)

    if page.status_code != 200:
        print("Oops! could not get page, Error: ",page.status_code)
    else:
        soup = BeautifulSoup(page.content, 'html.parser')

        # Map every genre name ("key") to its absolute link ("value");
        # a name that appears twice keeps the link of its last occurrence.
        genreList = soup.select("div.genres div.genre h3 a")
        genres = {anchor.text: siteURL + anchor['href'] for anchor in genreList}

        # Header row first, then one (name, link) row per genre
        with open('genres.csv', 'w', newline='') as f:
            writer = csv.writer(f, delimiter=',')
            writer.writerow(['Genre', 'link'])
            writer.writerows(genres.items())


## Step-2 Scraping all movies and their links from each category URL
* Read genres.csv, fetch category and links column and convert it to dictionary
* Iterate the above dictionary result to fetch all the individual movies within the particular genre.
* Handle the pagination scenario wherein a particular genre may have n pages.
* Store the output in the .csv file with columns - [category], [movie-name], [movie-href-url]
* Generated File name: categoryMovieLinksList.csv
* Git Link: 
[categoryMovieLinksList.csv](https://github.com/athiban94/movie-recommendation/blob/master/categoryMovieLinksList.csv)

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

headers = { 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.'
                          '86 Safari/537.36'}

# site url
siteURL = "https://www.allmovie.com"
movieCorpusList = []
def scrapePageContents(category, categoryLink):
    """Collect every movie listed under one genre, following pagination.

    Appends one ``(category, movie name, absolute movie URL)`` tuple per
    listed movie to the module-level ``movieCorpusList``.

    Pages are walked with a loop rather than one recursive call per page, so
    a genre with many pages cannot hit Python's recursion limit. Only the
    first "next" anchor of a page is followed and every URL is fetched at
    most once per call, so a page that exposes the "next" link more than
    once does not get its successors scraped (and their movies appended)
    several times.

    Args:
        category: Genre name stored as the first element of each tuple.
        categoryLink: URL of the first listing page of that genre.
    """
    visited = set()
    url = categoryLink
    while url and url not in visited:
        visited.add(url)
        print(url)
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            # Report the failure instead of ending the genre silently
            print("Oops! could not get page, Error: ", response.status_code)
            return
        print("succes")
        soup = BeautifulSoup(response.content, 'html.parser')

        moviesList = soup.select("div.movie-highlights div.movie_row div.movie p.title a")
        for movie in moviesList:
            movieCorpusList.append((category, movie.text.strip(), siteURL + movie["href"]))

        # No "next" anchor means this was the last page of the genre
        nextAnchor = soup.select_one('div.pagination span.next a')
        url = siteURL + nextAnchor["href"] if nextAnchor else None

    print("End of category: " + category)

def generateMovieCateoryTuples(categoryList):
    """Scrape every genre in ``categoryList`` and return the gathered tuples.

    ``categoryList`` maps a genre name to the URL of its listing page. Each
    pair is handed to ``scrapePageContents``; afterwards every entry of the
    module-level ``movieCorpusList`` is printed, one per line, and that same
    list object is returned.
    """
    for genreName, genreUrl in categoryList.items():
        scrapePageContents(genreName, genreUrl)

    # Echo everything collected so far (nothing is printed for an empty list)
    if movieCorpusList:
        print("\n".join(str(entry) for entry in movieCorpusList))
    return movieCorpusList

def saveMoviesInCSV(movieCorpusList):
    """Write the scraped rows to ``categoryMovieLinksList.csv``.

    Each element of ``movieCorpusList`` becomes one CSV row; no header row
    is written. The file is created (or overwritten) in the current working
    directory.

    Args:
        movieCorpusList: Iterable of row sequences, e.g.
            ``(category, movie name, movie URL)`` tuples.
    """
    # newline='' lets the csv module control line endings itself, as its
    # documentation requires; otherwise the platform may translate the
    # terminator and newlines embedded in quoted fields.
    with open('categoryMovieLinksList.csv', 'w', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(movieCorpusList)


if __name__ == "__main__":
    # Rebuild the {genre name: genre link} mapping from the "Genre" and
    # "link" columns of genres.csv
    data = pd.read_csv("genres.csv", header=0)
    genreNames = data["Genre"].values.tolist()
    genreLinks = data["link"].values.tolist()
    categoryList = dict(zip(genreNames, genreLinks))

    # Scrape every genre, then persist the collected rows
    entireMovieList = generateMovieCateoryTuples(categoryList)
    saveMoviesInCSV(entireMovieList)

## Step-3 Scrape all movie contents with movie links
* Read categoryMovieLinksList.csv file fetching all movie links from the column
* Iterate over each movie link to get the values category, movie, title, rating, genre, subgenre, theme, keyword, releasedate, country, review
* Store the entire data in the movie10k.csv file
* Check for duplicate rows and remove them
* Link: [movie10k.csv](https://github.com/athiban94/movie-recommendation/blob/master/movie10k.csv)
* Link for sample data set:[movie1k.csv]()

In [None]:

# coding: utf-8

# In[ ]:

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import requests                   
from bs4 import BeautifulSoup 
import csv
import re

def extract_source(url):
    """Download ``url`` with a browser-like User-Agent and return the body text."""
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'},
    )
    return response.text

def _first_link_text(container):
    """Return the text of the first ``<a href>`` inside ``container``, or ""."""
    if container is None:
        return ""
    link = container.find("a", href=True)
    return link.text if link is not None else ""


def extract_data(source,category="",mvname=""):
    """Parse the HTML of one movie page into a flat record.

    Every field is looked up defensively: an element that is missing from
    the page leaves its field as ``""`` instead of raising, so one unusual
    page cannot abort a long crawl.

    Args:
        source: HTML text of the movie page.
        category: Value copied unchanged into the ``"category"`` field.
        mvname: Value copied unchanged into the ``"movie"`` field.

    Returns:
        dict with the keys category, movie, title, rating, genre, subgenre,
        theme, keyword, releasedate, country and review, in that order.
    """
    mvdata = dict.fromkeys(
        ["category", "movie", "title", "rating", "genre", "subgenre",
         "theme", "keyword", "releasedate", "country", "review"],
        "",
    )
    mvdata["category"] = category
    mvdata["movie"] = mvname

    soup = BeautifulSoup(source, 'html.parser')

    # Title: the part of the og:title meta content before the first "-".
    # NOTE(review): a title that itself contains "-" is cut at that hyphen.
    names = soup.find('meta', property="og:title")
    title = names["content"] if names else "No meta title given"
    mvdata["title"] = title.split("-")[0].strip()

    # Rating
    rating_tag = soup.find('div', attrs={'itemprop': 'ratingValue'})
    if rating_tag is not None:
        mvdata["rating"] = rating_tag.text.strip()

    # Genre / sub-genre: text of the first link in the respective header span
    mvdata["genre"] = _first_link_text(soup.find("span", class_="header-movie-genres"))
    mvdata["subgenre"] = _first_link_text(soup.find("span", class_="header-movie-subgenres"))

    # Theme: link texts of the first "charactList" div on the page, joined
    # with commas in reverse document order.
    # NOTE(review): presumably the first charactList is the themes list;
    # confirm for pages that have keywords but no themes.
    theme_box = soup.find("div", class_="charactList")
    if theme_box is not None:
        theme_links = theme_box.find_all("a", href=True)
        mvdata["theme"] = ",".join(link.text for link in reversed(theme_links)).strip(",")

    # Keyword: whole text of the charactList inside the "keywords" div
    keyword_box = soup.find("div", class_="keywords")
    if keyword_box is not None:
        keyword_list = keyword_box.find("div", class_="charactList")
        if keyword_list is not None:
            mvdata["keyword"] = keyword_list.text.strip()

    # Details: spans of the form "<label> - <value>"
    details = soup.find("hgroup", class_="details")
    if details is not None:
        for det in details.find_all("span"):
            parts = det.text.split("-")
            if len(parts) < 2:
                # No "-" separator, so there is no value to read
                continue
            label = parts[0].strip()
            # NOTE(review): only the text up to the next "-" is used, so a
            # value that contains a hyphen is truncated.
            value = parts[1]
            if label == "Release Date":
                # Drop the parenthesised suffix that follows the date
                mvdata["releasedate"] = value.split("(")[0].strip()
            if label == "Countries":
                mvdata["country"] = value.strip().strip("|\xa0")

    # Review
    review_tag = soup.find("div", itemprop="description")
    if review_tag is not None:
        mvdata["review"] = review_tag.text.strip()

    return mvdata

# Two sample movie pages; only the second one is fetched below as a quick
# check of the extraction helpers.
url_str1 = 'https://www.allmovie.com/movie/a-star-is-born-v548181'
url_str2 = 'https://www.allmovie.com/movie/the-revenant-v597171'

sample_source = extract_source(url_str2)
sample_record = extract_data(sample_source)
print(sample_record)



# In[ ]:


if __name__ == "__main__":
    # Both files are opened with newline='' as the csv module asks for.
    with open('movie10k.csv', 'w', newline='') as f, \
            open('categoryMovieLinksList.csv', newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        # One writer for the whole run instead of a new one per row
        w = csv.writer(f)
        seen = set()
        for cnt, row in enumerate(csv_reader):
            # Only the first 11000 input rows are considered; stop reading
            # once the cap is reached instead of scanning the rest of the file.
            if cnt >= 11000:
                break
            print(row)
            # Input rows are (category, movie name, movie link).
            # NOTE(review): duplicates are detected by movie name only, so
            # different movies sharing a title are scraped just once.
            name = row[1].strip()
            if name in seen:
                print("Duplicate:::::: ", name)
                continue
            seen.add(name)
            mvdict = extract_data(extract_source(row[2]), row[0], row[1])
            # The first input row can never be a duplicate, so the header
            # is written exactly once, ahead of the first record.
            if cnt == 0:
                w.writerow(mvdict.keys())
            w.writerow(mvdict.values())



## Step - 4 EDA - Analysis
* Read the movie10k.csv file and fetch the keywords and themes columns
* Performed tokenization of keywords and themes of each category
* Generated tf-idf matrix for the same
* Performed lemmatization and removed stop words

In [2]:

# coding: utf-8

# In[16]:


import nltk
import re
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.corpus import stopwords

import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from scipy.spatial import distance


# In[17]:


def get_wordnet_pos(pos_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The tag is matched on its first letter: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN


# In[18]:


# English stop words plus the domain-specific token 'nan', which is what
# str() produces for a missing keyword/theme cell read by pandas.
stop_words = stopwords.words('english') + ['nan']

# Shared lemmatizer instance used by tokenize()
wordnet_lemmatizer = WordNetLemmatizer()

def tokenize(keywords):
    """Count the lemmatized terms of a comma-separated string.

    The string is split on commas and every piece is stripped. Pieces that
    end up empty (from ",," or a leading/trailing comma) are dropped so they
    are neither passed to the tagger nor counted as a term. The remaining
    pieces are POS-tagged, stop words are removed and each survivor is
    lemmatized with the WordNet part of speech derived from its tag.

    Args:
        keywords: Comma-separated terms.

    Returns:
        ``nltk.FreqDist`` mapping each lemmatized term to its count.
    """
    stripped = (piece.strip() for piece in keywords.split(","))
    tokens = [piece for piece in stripped if piece]

    tag_token = nltk.pos_tag(tokens)
    lem_tok = [
        wordnet_lemmatizer.lemmatize(w, get_wordnet_pos(tag))
        for (w, tag) in tag_token
        if w not in stop_words
    ]

    return nltk.FreqDist(lem_tok)


# In[19]:


def get_top_words(mvdata,n):
    """Return the highest-weighted terms of every label by smoothed TF-IDF.

    Each value of ``mvdata`` is treated as one document: it is turned into
    term counts with ``tokenize``, the counts are normalized by document
    length, multiplied by a smoothed inverse document frequency and
    normalized per row with ``normalize``.

    Args:
        mvdata: dict mapping a label to a comma-separated term string.
        n: Maximum number of terms to report per label.

    Returns:
        dict mapping each label to a list of at most ``n`` terms, highest
        weight first. Only terms with a positive weight for that label are
        listed, so a label can get fewer than ``n`` terms (or none).
    """
    if not mvdata:
        return {}

    labels = list(mvdata.keys())
    doc_dict = {lb: tokenize(mvdata[lb]) for lb in labels}

    # Document-term matrix: one row per label, one column per term
    dtm = pd.DataFrame.from_dict(doc_dict, orient="index").fillna(0)
    if dtm.shape[1] == 0:
        # No term survived tokenization for any label
        return {lb: [] for lb in labels}

    # Term frequency normalized by document length. A document without any
    # term keeps an all-zero row instead of dividing by zero.
    tf = dtm.values
    doc_len = tf.sum(axis=1)
    doc_len = np.where(doc_len > 0, doc_len, 1)
    tf = np.divide(tf.T, doc_len).T

    # Smoothed idf: log((N + 1) / (df + 1)) + 1
    df = np.where(tf > 0, 1, 0)
    smoothed_idf = np.log(np.divide(len(labels) + 1, np.sum(df, axis=0) + 1)) + 1
    smoothed_tf_idf = normalize(tf * smoothed_idf)

    # Column indices of the n largest weights per row, descending
    top = smoothed_tf_idf.argsort()[:, ::-1][:, 0:n]

    topwrds_dict = {}
    for idd, row in enumerate(top):
        # Skip zero-weight columns: those terms do not occur for this label
        topwrds_dict[labels[idd]] = [
            dtm.columns[x] for x in row if smoothed_tf_idf[idd, x] > 0
        ]

    return topwrds_dict


# In[20]:

if __name__ == "__main__":
    mvdf = pd.read_csv("movie10k.csv", header =0,encoding="ISO-8859-1")

    # Collect the keyword and theme strings of every row under its category;
    # str() turns a missing cell into the text "nan".
    keyword_parts = {}
    theme_parts = {}
    for index, row in mvdf.iterrows():
        label = row["category"]
        keyword_parts.setdefault(label, []).append(str(row["keyword"]))
        theme_parts.setdefault(label, []).append(str(row["theme"]))

    # One comma-joined document per category, in order of first appearance
    mvkeydict = {cat: ",".join(parts) for cat, parts in keyword_parts.items()}
    mvthemedict = {cat: ",".join(parts) for cat, parts in theme_parts.items()}

    # Top 3 keywords per category
    print("print top keywords by category:")
    top_keywords = get_top_words(mvkeydict, 3)
    print(top_keywords)

    # Top 3 themes per category
    print("print top themes by category:")
    top_themes = get_top_words(mvthemedict, 3)
    print(top_themes)



print top keywords by category:
{'Action': ['Superhero', 'revenge', 'martial-arts'], 'Adult': ['sex', 'bachelorette', 'eroticism'], 'Adventure': ['villain', 'rescue', 'pirate [seafarer]'], 'Avant-garde / Experimental': ['experimental  [arts]', 'filmmaker', 'homosexual'], "Children's/Family": ['friendship', 'animal', 'dog'], 'Comedy': ['love', 'romance', 'friendship'], 'Comedy Drama': ['love', 'friendship', 'romance'], 'Crime': ['gangster', 'organized-crime', 'mob-boss'], 'Drama': ['love', 'family', 'friendship'], 'Epic': ['battle [war]', 'war', 'emperor'], 'Fantasy': ['afterlife', 'love', 'magic'], 'Historical Film': ['war', 'aristocracy', 'king'], 'Horror': ['vampire', 'murder', 'mad-scientist'], 'Musical': ['love', 'performer', 'songwriter'], 'Mystery': ['murder', 'investigation', 'kill'], 'Romance': ['love', 'romance', 'extramarital-affair'], 'Science Fiction': ['alien [not human]', 'future', 'spacecraft'], 'Spy Film': ['espionage', 'agent [representative]', 'spy'], 'Thriller': ['mu