# Web scraping of the `F.R.I.E.N.D.S.` sitcom

---

For an introduction to Selenium, see [**this guide**](https://www.scrapingbee.com/blog/selenium-python/).  
Data source: [**fangj blog**](https://fangj.github.io/friends/)

## Import modules

`%pip freeze > requirements.txt`

In [1]:
# Web scraping
from selenium import webdriver

# Data manipulation
import pandas as pd

# Regular expression
import re
import string

# File management
from pathlib import Path

# Parse JSON
import json

## Load the Chromedriver

Read how to download the webdriver for Chrome [**here**](https://chromedriver.chromium.org/downloads).

In [2]:
# Open the driver: start a Chrome session through the chromedriver binary at DRIVER_PATH
# NOTE(review): `executable_path` is the Selenium 3 calling convention; newer
# Selenium releases expect a `Service` object instead - confirm the pinned version
DRIVER_PATH = '../bin/chromedriver'
driver = webdriver.Chrome(executable_path = DRIVER_PATH)

## Access the url

In [3]:
# Main url: the index page from which season headings and episode links are read
url = 'https://fangj.github.io/friends/'

In [4]:
# Load the main url in the browser session
driver.get(url)

## Core Procedure

### 1 Get URLs for episodes in seasons

In [5]:
# List of season names: the text of every <h3> element on the page
list_season = [element.text for element in driver.find_elements_by_tag_name('h3')]

In [6]:
# Dictionary keyed by season name; the comprehension gives every season its own empty list
dict_season = {key: [] for key in list_season}

In [7]:
# Inspect the per-season dictionary (every list is still empty at this point)
dict_season

{'SEASON 1': [],
 'SEASON 2': [],
 'SEASON 3': [],
 'SEASON 4': [],
 'SEASON 5': [],
 'SEASON 6': [],
 'SEASON 7': [],
 'SEASON 8': [],
 'SEASON 9': [],
 'SEASON 10': []}

In [8]:
# Collect the title and URL of every episode, season by season.
# The i-th <ul> on the page is used as the episode list of the i-th season.
# The <ul> elements are looked up once instead of once per season.
season_lists = driver.find_elements_by_tag_name('ul')

# Loop each season (iterating the dictionary yields the season names in order)
for season_index, season_name in enumerate(dict_season):
    # Episode links (<a>) inside this season's list
    episodes = season_lists[season_index].find_elements_by_tag_name('a')

    # Loop the episodes in season
    for episode in episodes:
        # Append the episode name and url
        dict_season[season_name].append({
            'title': episode.text,
            'url': episode.get_attribute('href')
        })

In [9]:
# Show the dictionary, now holding a {'title', 'url'} record for each episode
dict_season

{'SEASON 1': [{'title': '101 Monica Gets A Roommate',
   'url': 'https://fangj.github.io/friends/season/0101.html'},
  {'title': '102 The Sonogram At The End',
   'url': 'https://fangj.github.io/friends/season/0102.html'},
  {'title': '103 The Thumb',
   'url': 'https://fangj.github.io/friends/season/0103.html'},
  {'title': '104 George Stephanopoulos',
   'url': 'https://fangj.github.io/friends/season/0104.html'},
  {'title': '105 The East German Laundry Detergent',
   'url': 'https://fangj.github.io/friends/season/0105.html'},
  {'title': '106 The Butt',
   'url': 'https://fangj.github.io/friends/season/0106.html'},
  {'title': '107 The Blackout',
   'url': 'https://fangj.github.io/friends/season/0107.html'},
  {'title': '108 Nana Dies Twice',
   'url': 'https://fangj.github.io/friends/season/0108.html'},
  {'title': '109 Underdog Gets Away',
   'url': 'https://fangj.github.io/friends/season/0109.html'},
  {'title': '110 The Monkey',
   'url': 'https://fangj.github.io/friends/season/

### 2 Access one episode

In [10]:
# Selected season: the first key of the season dictionary
selected_season = list(dict_season)[0]
# Selected episode: the first record stored for that season
selected_episode = dict_season[selected_season][0]
# URL of the selected episode's transcript page
url_episode = selected_episode['url']

In [11]:
# Check the URL of the selected episode
url_episode

'https://fangj.github.io/friends/season/0101.html'

In [12]:
# Open the selected episode's page in the browser session
driver.get(url_episode)

### 3 Get metadata in episode

In [13]:
# Metadata: the text of the first <p> element on the episode page
# (credit lines and a note, one per line, as the output below shows)
raw_metadata = driver.find_element_by_tag_name('p').text
raw_metadata

'Written by: Marta Kauffman & David Crane\nTranscribed by: guineapig\nAdditional transcribing by: Eric Aasen\n(Note: The previously unseen parts of this episode are shown in blue text.)'

In [14]:
# Pull the individual fields out of the raw metadata text.
# Every field is optional: when its label is not found the value is None.
def _first_capture(pattern, text):
    """Return the first capture group of `pattern` found in `text`, or None."""
    match = re.search(pattern, text)
    return match.group(1) if match else None


# Written by
written_by = _first_capture(r'Written by:\s*(.+)', raw_metadata)

# Transcribed by
transcribed_by = _first_capture(r'Transcribed by:\s*(.+)', raw_metadata)

# Additional transcribing by
transcribing_by = _first_capture(r'Additional transcribing by:\s*(.+)', raw_metadata)

# Note
# `$` anchors at the end of the whole text (no MULTILINE flag), so the note is
# only found when it sits on the last line and ends with ')'
note = _first_capture(r'\(?Note:\s*(.+)\)$', raw_metadata)

In [15]:
# Store the metadata of the selected episode in one dictionary
# (fields whose label was not found in the raw text are None)
metadata = {
    'season': selected_season,
    'title': selected_episode['title'],
    'written_by': written_by,
    'transcribed_by': transcribed_by,
    'additional_transcribing_by': transcribing_by,
    'note': note
}

In [16]:
# Show the parsed metadata
metadata

{'season': 'SEASON 1',
 'title': '101 Monica Gets A Roommate',
 'written_by': 'Marta Kauffman & David Crane',
 'transcribed_by': 'guineapig',
 'additional_transcribing_by': 'Eric Aasen',
 'note': 'The previously unseen parts of this episode are shown in blue text.'}

### 4 Get the dialog

In [17]:
# Get the dialog: every <p> element on the episode page
# (the scene-splitting cell below reads the text of these elements by index)
l = driver.find_elements_by_tag_name('p')

In [18]:
# Split the episode transcript into scenes.
# Result: `list_`, one dict per scene with the keys 'scene_no' (running index
# from 0), 'scene_mark' (the scene marker text) and 'dialog' (the lines that
# follow the marker, each terminated by '\n').

# A scene marker opens with "[Scene" and runs to the first ']', '|' or ')'
# NOTE(review): the '|' sits inside the character class, so it is matched
# literally; it may have been meant as an alternation - confirm before changing
scene_pattern = re.compile(r'\[Scene.+?[\]|\)]')

# Create an empty list
list_ = []

if len(l) < 50:
    # Dialog is not in standard format: the lines are packed into a few <p> elements
    long_text = []

    if len(l) > 1:
        for elem in l:
            # Read the text once per element (each read queries the browser)
            elem_text = elem.text
            # Paragraphs of 50 characters or fewer are skipped
            if len(elem_text) > 50:
                # Split into lines and remove the empty ones
                long_text += [line for line in elem_text.split('\n') if line != '']
    else:
        # Zero or one <p>: take the text of the whole page instead
        page_text = driver.find_element_by_tag_name('html').text
        long_text += [line for line in page_text.split('\n') if line != '']

    lines = long_text
    i = 0
else:
    # Standard format: the scan starts at index 1, index 0 is skipped
    # Texts are read once up front instead of on every lookup in the loops below
    lines = [elem.text for elem in l]
    i = 1

j = 0

# Main loop: one pass per scene, `i` is the index of the line that opens it.
# Explicit bounds checks end the loops, so genuine errors are no longer swallowed.
while i < len(lines):
    scene_play = scene_pattern.search(lines[i])

    if scene_play:
        # The line carries a scene marker; the dialog starts on the next line
        scene_text = scene_play.group(0)
        stat_loop = True
        k = i + 1
    else:
        scene_text = '[Scene: Beginning of Episode (Generated)]'
        # A non-marker line at index 1 opens a generated scene whose dialog
        # starts at this very line
        # NOTE(review): the check is pinned to index 1, the start index of the
        # standard format; in the long format (start index 0) a leading
        # non-marker line yields an empty generated scene and is not kept
        stat_loop = i == 1
        k = i if stat_loop else i + 1

    dialog_lines = []

    # Secondary loop: gather lines until the next marker or the end of the text
    while stat_loop and k < len(lines):
        string_loop = lines[k]
        scene_play_loop = scene_pattern.search(string_loop)

        if scene_play_loop is None:
            dialog_lines.append(string_loop)
            k += 1
        else:
            # Scene within dialog: the rest of the line stays with the current
            # scene (`k` is not advanced, so the same line opens the next scene)
            if len(string_loop) != len(scene_play_loop.group(0)):
                dialog_lines.append(scene_pattern.sub('', string_loop))
            stat_loop = False

    # Append the scene in one episode
    list_.append({
        'scene_no': j,
        'scene_mark': scene_text,
        'dialog': ''.join(line + '\n' for line in dialog_lines)
    })

    # Update parameters
    i = k
    j += 1

In [19]:
# Number of scenes found in the episode
len(list_)

15

In [20]:
# Sample scene: the first entry, with its scene number, marker and dialog
list_[0]

{'scene_no': 0,
 'scene_mark': '[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]',
 'dialog': 'Monica: There\'s nothing to tell! He\'s just some guy I work with!\nJoey: C\'mon, you\'re going out with the guy! There\'s gotta be something wrong with him!\nChandler: All right Joey, be nice.  So does he have a hump? A hump and a hairpiece?\nPhoebe: Wait, does he eat chalk?\n(They all stare, bemused.)\nPhoebe: Just, \'cause, I don\'t want her to go through what I went through with Carl- oh!\nMonica: Okay, everybody relax. This is not even a date. It\'s just two people going out to dinner and- not having sex.\nChandler: Sounds like a date to me.\n[Time Lapse]\nChandler: Alright, so I\'m back in high school, I\'m standing in the middle of the cafeteria, and I realize I am totally naked.\nAll: Oh, yeah. Had that dream.\nChandler: Then I look down, and I realize there\'s a phone... there.\nJoey: Instead of...?\nChandler: That\'s right.\nJoey: Never had that dream.\nPhoebe: N

### 5 Merge metadata and dialog

In [21]:
# File for one episode: the metadata plus the list of scenes
episode_ = {
    'metadata': metadata,
    'data': list_
}

In [22]:
# Show the merged episode record
episode_

{'metadata': {'season': 'SEASON 1',
  'title': '101 Monica Gets A Roommate',
  'written_by': 'Marta Kauffman & David Crane',
  'transcribed_by': 'guineapig',
  'additional_transcribing_by': 'Eric Aasen',
  'note': 'The previously unseen parts of this episode are shown in blue text.'},
 'data': [{'scene_no': 0,
   'scene_mark': '[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]',
   'dialog': 'Monica: There\'s nothing to tell! He\'s just some guy I work with!\nJoey: C\'mon, you\'re going out with the guy! There\'s gotta be something wrong with him!\nChandler: All right Joey, be nice.  So does he have a hump? A hump and a hairpiece?\nPhoebe: Wait, does he eat chalk?\n(They all stare, bemused.)\nPhoebe: Just, \'cause, I don\'t want her to go through what I went through with Carl- oh!\nMonica: Okay, everybody relax. This is not even a date. It\'s just two people going out to dinner and- not having sex.\nChandler: Sounds like a date to me.\n[Time Lapse]\nChandler: Alright

---

# Get dialogs in all episodes

In [None]:
# Open the driver
# NOTE(review): this starts a new Chrome session; the visible cells never close
# the session opened earlier in the notebook - confirm whether that is intended
DRIVER_PATH = '../bin/chromedriver'
driver = webdriver.Chrome(executable_path = DRIVER_PATH)

# Main url: the index page listing the seasons and their episodes
url = 'https://fangj.github.io/friends/'

# Access to main url
driver.get(url)

# List of season names: the text of every <h3> element on the page
list_season = [element.text for element in driver.find_elements_by_tag_name('h3')]

# Dictionary with one empty episode list per season
dict_season = {key: [] for key in list_season}

# The i-th <ul> on the page is used as the episode list of the i-th season.
# The <ul> elements are looked up once instead of once per season.
season_lists = driver.find_elements_by_tag_name('ul')

# Loop each season (iterating the dictionary yields the season names in order)
for season_index, season_name in enumerate(dict_season):
    # Episode links (<a>) inside this season's list
    episodes = season_lists[season_index].find_elements_by_tag_name('a')

    # Loop the episodes in season
    for episode in episodes:
        # Append the episode name and url
        dict_season[season_name].append({
            'title': episode.text,
            'url': episode.get_attribute('href')
        })

# Scrape every episode: read its metadata, split the transcript into scenes and
# write one JSON file per episode to ../data/<season>/<season>-<title>.json

# A scene marker opens with "[Scene" and runs to the first ']', '|' or ')'
# NOTE(review): the '|' sits inside the character class, so it is matched
# literally; it may have been meant as an alternation - confirm before changing
SCENE_PATTERN = re.compile(r'\[Scene.+?[\]|\)]')
# Marker used for a scene that does not start with a scene description
GENERATED_SCENE_MARK = '[Scene: Beginning of Episode (Generated)]'
# Translation table that strips punctuation from titles used in file names
PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _first_capture(pattern, text):
    """Return the first capture group of `pattern` found in `text`, or None."""
    match = re.search(pattern, text)
    return match.group(1) if match else None


def _non_empty_lines(text):
    """Split `text` on newlines and drop the empty lines."""
    return [line for line in text.split('\n') if line != '']


def _transcript_lines(browser, paragraphs):
    """Return `(lines, start_index)` for the <p> elements of an episode page.

    With 50 or more paragraphs each paragraph is one line and the scan starts
    at index 1 (index 0 is skipped). Otherwise the page is treated as
    non-standard: paragraphs longer than 50 characters are split into lines,
    or, with fewer than two paragraphs, the text of the whole page is used;
    the scan then starts at index 0.
    """
    if len(paragraphs) >= 50:
        return [elem.text for elem in paragraphs], 1

    if len(paragraphs) > 1:
        lines = []
        for elem in paragraphs:
            # Read the text once per element (each read queries the browser)
            elem_text = elem.text
            if len(elem_text) > 50:
                lines += _non_empty_lines(elem_text)
        return lines, 0

    return _non_empty_lines(browser.find_element_by_tag_name('html').text), 0


def _split_scenes(lines, start_index):
    """Group `lines` into scenes, scanning from `start_index`.

    Returns a list of dicts with the keys 'scene_no' (running index from 0),
    'scene_mark' (the scene marker text) and 'dialog' (the lines that follow
    the marker, each terminated by a newline).
    """
    scenes = []
    i = start_index

    # Main loop: one pass per scene, `i` is the index of the line that opens it.
    # Explicit bounds checks end the loops, so genuine errors are not swallowed.
    while i < len(lines):
        scene_play = SCENE_PATTERN.search(lines[i])

        if scene_play:
            # The line carries a scene marker; the dialog starts on the next line
            scene_text = scene_play.group(0)
            collecting = True
            k = i + 1
        else:
            scene_text = GENERATED_SCENE_MARK
            # A non-marker line at index 1 opens a generated scene whose dialog
            # starts at this very line
            # NOTE(review): the check is pinned to index 1, the start index of
            # the standard format; with start index 0 a leading non-marker line
            # yields an empty generated scene and is not kept
            collecting = i == 1
            k = i if collecting else i + 1

        dialog_lines = []

        # Secondary loop: gather lines until the next marker or the end
        while collecting and k < len(lines):
            line = lines[k]
            next_scene = SCENE_PATTERN.search(line)

            if next_scene is None:
                dialog_lines.append(line)
                k += 1
            else:
                # Scene within dialog: the rest of the line stays with the
                # current scene (`k` is not advanced, so the same line opens
                # the next scene)
                if len(line) != len(next_scene.group(0)):
                    dialog_lines.append(SCENE_PATTERN.sub('', line))
                collecting = False

        scenes.append({
            'scene_no': len(scenes),
            'scene_mark': scene_text,
            'dialog': ''.join(text + '\n' for text in dialog_lines)
        })
        i = k

    return scenes


# Loop episodes
for season, season_episodes in dict_season.items():
    for selected_episode in season_episodes:
        # Access to url
        driver.get(selected_episode['url'])

        # All <p> elements of the page; the first one carries the metadata
        paragraphs = driver.find_elements_by_tag_name('p')

        # Get metadata in episode
        # A page without <p> yields empty metadata instead of reusing the
        # text left over from the previous episode
        raw_metadata = paragraphs[0].text if paragraphs else ''

        # Store the metadata (fields whose label is not found are None)
        metadata = {
            'season': season,
            'title': selected_episode['title'],
            'written_by': _first_capture(r'Written by:\s*(.+)', raw_metadata),
            'transcribed_by': _first_capture(r'Transcribed by:\s*(.+)', raw_metadata),
            'additional_transcribing_by': _first_capture(
                r'Additional transcribing by:\s*(.+)', raw_metadata
            ),
            # `$` anchors at the end of the whole text, so the note is only
            # found when it sits on the last line and ends with ')'
            'note': _first_capture(r'\(?Note:\s*(.+)\)$', raw_metadata)
        }

        # Get the dialog
        lines, start_index = _transcript_lines(driver, paragraphs)

        # File for one episode
        episode_ = {
            'metadata': metadata,
            'data': _split_scenes(lines, start_index)
        }

        # Create a directory
        path = '../data/' + season
        Path(path).mkdir(parents = True, exist_ok = True)

        # Serialize json
        json_object = json.dumps(episode_, indent = 4)

        # Write to JSON; punctuation is removed from the title for the file name
        file_name = selected_episode['title'].translate(PUNCTUATION_TABLE)
        filename = path + '/{season_name}-{episode_name}.json'.format(
            season_name = season,
            episode_name = file_name
        )

        with open(filename, 'w', encoding = 'utf-8') as outfile:
            outfile.write(json_object)