In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from bs4 import BeautifulSoup
import codecs
import regex as re
import requests
from time import sleep
import time
import json

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait       
from selenium.webdriver.common.by import By       
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common import action_chains

# Define functions

In [27]:
def get_unique_hrefs():
    '''Collect Forbes links from the page currently loaded in the global ``driver``.

    Every element carrying an ``href`` attribute is inspected. Video links,
    ``/amp/`` links and ``/advisor/`` links are discarded, then the remaining
    links are split by URL length:

    * article links start with ``https://www.forbes.com/sites/`` and are at
      least 80 characters long;
    * nested links start with ``https://www.forbes.com/`` and are 30 to 79
      characters long (pages that themselves contain many other links).

    Returns
    -------
    tuple of numpy.ndarray
        ``(unique_hrefs, nested_hrefs)``, each sorted and de-duplicated by
        ``np.unique``.
    '''
    hrefs = []
    for element in driver.find_elements_by_xpath("//*[@href]"):
        try:
            link = element.get_attribute("href")
        except Exception:
            # Best effort: skip an element whose attribute cannot be read
            # (for example one that disappeared while the page was updating).
            continue
        # get_attribute may hand back None/empty; those would break the
        # startswith/endswith filters below.
        if link:
            hrefs.append(link)

    # Filtering for unique and nested links specific to forbes
    candidates = [
        x for x in hrefs
        if not x.startswith("https://www.forbes.com/video/")
        and not x.endswith("/amp/")
        and not x.startswith("https://www.forbes.com/advisor/")
    ]
    unique_hrefs = [x for x in candidates
                    if x.startswith("https://www.forbes.com/sites/") and len(x) >= 80]
    nested_hrefs = [x for x in candidates
                    if x.startswith("https://www.forbes.com/") and 30 <= len(x) < 80]
    return np.unique(unique_hrefs), np.unique(nested_hrefs)
    
    
# Write a function to get content per article and return all features of interest
def get_content(href):
    '''Extract the features of interest from the article loaded in the global ``driver``.

    The page must already be open in ``driver``; ``href`` is only recorded
    next to the scraped values. Features are collected once per
    ``<span class="pageviews">`` element, so a page without a view counter
    yields empty lists and ``num_img == 0``.

    Parameters
    ----------
    href : str
        URL of the article that ``driver`` is currently showing.

    Returns
    -------
    tuple
        ``(links, titles, para, views, topics, times, num_img)``: six lists
        (link, page title, article text, view count text, topic text,
        publication time text) and the number of ``<img>`` elements on the page.
    '''
    views, links, para, titles, times, topics = [], [], [], [], [], []
    num_img = 0

    # Get views, if there is view, get all the features
    for view_span in driver.find_elements_by_xpath("//span[@class='pageviews']"):
        links.append(href)
        views.append(view_span.text)
        titles.append(driver.title)

        try:
            topic = driver.find_element_by_xpath("//*[@class='remove-underline']")
            topics.append(topic.text)
        except Exception:
            # Best effort: leave the topic out when the element is missing.
            pass

        # Count the images on the page. Assign instead of accumulating so a
        # page with several view counters does not multiply the image count.
        try:
            num_img = len(driver.find_elements_by_tag_name("img"))
        except Exception:
            num_img = 0

        # Get text
        # NOTE(review): paragraphs are concatenated without a separator, which
        # glues the last word of one paragraph to the first word of the next.
        # Kept as-is so new rows stay comparable with already exported data.
        article_p = [p.text for p in driver.find_elements_by_tag_name('p')]
        para.append("".join(article_p))

        # Get time: all <time> elements joined by a single space
        day = [t.text for t in driver.find_elements_by_tag_name('time')]
        times.append(" ".join(day))

    return links, titles, para, views, topics, times, num_img

def add_article(unique_links):
    '''Scrape every link in ``unique_links`` and collect the results in a dataframe.

    Each link is opened in the global ``driver`` and handed to ``get_content``.
    A link that fails to load or to parse is reported and skipped, so one bad
    page does not abort the whole run.

    Parameters
    ----------
    unique_links : iterable of str
        Article URLs to visit.

    Returns
    -------
    pandas.DataFrame
        Columns ``link, title, text, view, topic, time, num_img``. The row
        label is the position of the link in ``unique_links``; skipped links
        leave gaps in the index.
    '''
    df = pd.DataFrame(columns=['link', 'title', 'text', 'view', 'topic', 'time', 'num_img'])
    for i, link in enumerate(unique_links):
        try:
            driver.get(link)
            sleep(5)  # presumably gives the page time to finish loading before scraping
            df.loc[i] = list(get_content(link))
            if i == 5:
                # Progress sample: show one scraped row per call
                print(df.loc[i])
        except Exception as err:
            # Catch Exception rather than everything: KeyboardInterrupt must
            # propagate so a long scrape can still be interrupted.
            print('skipping', link, '-', repr(err))
    return df

def click(num):
    '''Click the "load more" button up to ``num`` times to load more articles.

    Before each click the function waits up to 6 seconds for an element with
    class ``load-more`` to be present, clicks it through JavaScript and then
    pauses 4 seconds. It stops early as soon as the button does not show up
    within the timeout.

    Parameters
    ----------
    num : int
        Maximum number of clicks to perform.
    '''
    wait = WebDriverWait(driver, 6)
    for _ in range(num):
        try:
            # Let the explicit wait do the lookup; an immediate lookup before
            # it would raise right away and make the wait pointless.
            element = wait.until(lambda d: d.find_element_by_xpath("//*[@class='load-more']"))
        except Exception:
            # Button did not appear within the timeout: nothing more to load.
            break
        try:
            driver.execute_script("arguments[0].click();", element)
        except Exception as err:
            # Best effort: report the failed click and try again next round.
            print('load-more click failed:', repr(err))
            continue
        sleep(4)

        #
def get_url(links):
    '''Collect Forbes article URLs from Internet Archive (Wayback Machine) snapshot pages.

    Each snapshot page is downloaded and every ``<a href>`` that points to an
    archived ``https://www.forbes.com/`` page is reduced to the original
    Forbes URL. Video, ``/amp/`` and ``/advisor/`` links are dropped and only
    ``https://www.forbes.com/sites/`` links of at least 80 characters are kept.

    Parameters
    ----------
    links : iterable of str
        Wayback Machine snapshot URLs to download.

    Returns
    -------
    numpy.ndarray
        Sorted, de-duplicated article URLs (``np.unique``).
    '''
    # Archived hrefs look like "[https://web.archive.org]/web/<timestamp>[<xx>_]/<original url>".
    # Matching that prefix instead of slicing a fixed 43 characters also covers
    # relative hrefs and hrefs carrying a modifier such as "if_".
    archived = re.compile(
        r'^(?:https?://web\.archive\.org)?/web/\d+(?:[a-z]+_)?/(https://www\.forbes\.com/.*)$'
    )

    lst = []
    for url in links:
        # The timeout keeps one unresponsive snapshot from hanging the whole run.
        page = requests.get(url, timeout=30).text
        soup = BeautifulSoup(page, 'html.parser')
        for a in soup.find_all('a', href=True):
            match = archived.match(a['href'])
            if match:
                lst.append(match.group(1))

    candidates = [
        x for x in lst
        if not x.startswith("https://www.forbes.com/video/")
        and not x.endswith("/amp/")
        and not x.startswith("https://www.forbes.com/advisor/")
    ]
    unique_hrefs = [x for x in candidates
                    if x.startswith("https://www.forbes.com/sites/") and len(x) >= 80]
    return np.unique(unique_hrefs)


def clean(df):
    '''Strip list/quote residue and unit tokens from every string cell of ``df``.

    The tokens ``[``, ``]``, ``'``, ``EDT``, ``EST``, `` views`` and ``"`` are
    removed as literal text (never as regular expressions). Values that are
    not strings (numbers such as ``num_img``, NaN, ...) are left untouched.

    The dataframe is modified in place and also returned.

    NOTE(review): the tokens are removed from every column, so words in titles
    or article text that contain "EST"/"EDT" or apostrophes are altered too -
    confirm that this is intended.
    '''
    replace_char = ["[", "]", "'", "EDT", "EST", " views", '"']

    def _strip_tokens(value):
        # Only strings are cleaned; .str.replace would turn other values into NaN.
        if not isinstance(value, str):
            return value
        for token in replace_char:
            value = value.replace(token, '')
        return value

    for col in df.columns:
        df[col] = df[col].map(_strip_tokens)
    return df

# Scraping Forbes 
### From main website:

In [None]:
# Ready to scrape
# Setting up path and driver
# Don't run options headless because it will not click, timeout error
# Make sure to set user data dir or you will not be logged in
# Make sure you've already logged in with username and password, click remembered box
PATH = r"C:\chromedriver.exe"  # raw string: "\c" is not a valid escape sequence
options = Options()
options.add_argument('--no-sandbox')
options.add_argument("start-maximized")
options.add_argument("user-data-dir=C:\\Users\\alice\\AppData\\Local\\Google\\Chrome\\User Data")
driver = webdriver.Chrome(executable_path=PATH, options=options)

# Going over subtopics to get content, note most current content
subtopic_topics = ['green-tech', 'retail', 'energy', 'enterprise-tech',
                   'entrepreneurs', 'food-drink', 'healthcare',
                   'hollywood-entertainment', 'etfs-mutual-funds', 'fintech',
                   'science', 'hedge-funds-private-equity']

# Short label printed while a subtopic is being scraped
subtopic_labels = {
    'green-tech': 'green',
    'retail': 'retail',
    'energy': 'energy',
    'enterprise-tech': 'enterprise',
    'entrepreneurs': 'entre',
    'food-drink': 'food',
    'healthcare': 'healthcare',
    'hollywood-entertainment': 'hollywood',
    'etfs-mutual-funds': 'funds',
    'fintech': 'fintech',
    'science': 'science',
    'hedge-funds-private-equity': 'hedge',
}

# Create lists for links storage to use later
# unique links are links that lead to actual articles
# nested links are links inside of each article
unique_links = []
nested_links = []

# One dataframe of scraped articles per subtopic
topic_frames = {}

for topic in subtopic_topics:
    url = "https://www.forbes.com/" + str(topic)
    driver.get(url)
    click(9)

    # Get all hrefs in that fully loaded site after trying 9 clicks
    # For each topic, save the nest links, to go over later
    all_hrefs = get_unique_hrefs()
    unique_hrefs_topics = all_hrefs[0]
    nested_hrefs = all_hrefs[1]
    unique_links.extend(unique_hrefs_topics)
    nested_links.extend(nested_hrefs)

    # Write the links collected so far after every topic, so they survive a
    # browser or site crash in the middle of the loop.
    with open('unique_hrefs_topics.txt', 'w') as filehandle:
        json.dump(unique_links, filehandle)
    with open('nested_links.txt', 'w') as filehandle:
        json.dump(nested_links, filehandle)

    print('adding ' + subtopic_labels[topic])
    topic_frames[topic] = add_article(unique_hrefs_topics)

# Close this browser session; the following cells start their own on the same profile.
driver.quit()

# Keep the per-topic names available for later cells
df_green = topic_frames['green-tech']
df_retail = topic_frames['retail']
df_energy = topic_frames['energy']
df_enter = topic_frames['enterprise-tech']
df_entre = topic_frames['entrepreneurs']
df_food = topic_frames['food-drink']
df_health = topic_frames['healthcare']
df_holly = topic_frames['hollywood-entertainment']
df_etf = topic_frames['etfs-mutual-funds']
df_science = topic_frames['science']
df_fin = topic_frames['fintech']
df_hedge = topic_frames['hedge-funds-private-equity']

# Export in case notebook crash
# df_entre is included here: it was scraped but missing from the export before.
dfs_sub_w_im = pd.concat([df_green, df_retail, df_energy, df_enter, df_entre,
                          df_food, df_health, df_holly,
                          df_etf, df_science, df_fin,
                          df_hedge])

dfs_sub_w_im.to_csv("dfs_sub_w_im.csv")

In [None]:
# Going over main topics and getting articles
PATH = r"C:\chromedriver.exe"  # raw string: "\c" is not a valid escape sequence
options = Options()
options.add_argument('--no-sandbox')
options.add_argument("start-maximized")
options.add_argument("user-data-dir=C:\\Users\\alice\\AppData\\Local\\Google\\Chrome\\User Data")
driver = webdriver.Chrome(executable_path=PATH, options=options)

# Create lists for links storage to use later
# nested links are links inside of each article
# unique links are links that lead to actual articles
unique_links_2 = []
nested_links_2 = []

main_topic = ['innovation', 'business', 'money', 'leadership', 'lifestyle']
for topic in main_topic:
    url = "https://www.forbes.com/" + str(topic)
    driver.get(url)
    click(9)

    # Get all hrefs in that fully loaded site after trying 9 clicks
    # For each topic, save the nest links, to go over later
    all_hrefs = get_unique_hrefs()
    unique_hrefs_subtopics = all_hrefs[0]
    nested_hrefs_subtopics = all_hrefs[1]
    unique_links_2.extend(unique_hrefs_subtopics)
    nested_links_2.extend(nested_hrefs_subtopics)

    # Main topics
    # Scrape the links found on THIS page (unique_hrefs_subtopics); the name
    # unique_hrefs_topics is left over from the subtopic cell and would
    # re-scrape the last subtopic for every main topic.
    if topic == 'lifestyle':
        print('adding life')
        df_life = add_article(unique_hrefs_subtopics)
    elif topic == 'innovation':
        print('adding innovatioin')
        df_inno = add_article(unique_hrefs_subtopics)
    elif topic == 'business':
        print('adding business')
        df_bus = add_article(unique_hrefs_subtopics)
    elif topic == 'money':
        print('adding money')
        df_money = add_article(unique_hrefs_subtopics)
    elif topic == 'leadership':
        print('adding leadership')
        df_lead = add_article(unique_hrefs_subtopics)

# Close this browser session; the following cells start their own on the same profile.
driver.quit()



In [None]:
# Append the links found on the main-topic pages to the link files saved by the
# subtopic cell. Each file holds one JSON list, so "appending" means: load the
# saved list, merge in the new links, write the merged list back.
for path, new_links in (('unique_hrefs_topics.txt', unique_links_2),
                        ('nested_links.txt', nested_links_2)):
    try:
        with open(path) as filehandle:
            saved_links = json.load(filehandle)
    except (FileNotFoundError, ValueError):
        # No file yet, or an empty/corrupt one: start from an empty list.
        saved_links = []
    # dict.fromkeys de-duplicates while keeping order, so re-running this cell
    # does not add the same links twice.
    merged_links = list(dict.fromkeys(saved_links + [str(link) for link in new_links]))
    with open(path, 'w') as filehandle:
        json.dump(merged_links, filehandle)

# Export in case notebook crash
dfs_main_w_img = pd.concat([df_inno, df_bus,
                            df_money, df_lead, df_life], ignore_index=True)
dfs_main_w_img.to_csv("dfs_main_w_img.csv")

### Scraping from nested links found 

In [3]:
# Load the previously scraped articles. Update data_3k.csv every time more
# articles are scraped: later cells compare against its 'link' column to skip
# articles that were already collected.
base_df = pd.read_csv("data_3k.csv", index_col=0)

In [None]:
# Handle the nested links gotten from above
PATH = r"C:\chromedriver.exe"  # raw string: "\c" is not a valid escape sequence
options = Options()
options.add_argument('--no-sandbox')
options.add_argument("start-maximized")
options.add_argument("user-data-dir=C:\\Users\\alice\\AppData\\Local\\Google\\Chrome\\User Data")
driver = webdriver.Chrome(executable_path=PATH, options=options)

# nested_links.txt holds one JSON list (it is written with json.dump), so load
# it as JSON; splitting on spaces left "[" / "]" on the first and last link.
with open('nested_links.txt') as f:
    nested_links_saved = json.load(f)

# NOTE(review): the saved nested links are all shorter than 80 characters (see
# get_unique_hrefs), so the former extra condition len(x) >= 80 could never
# match and this loop never ran. Only the "/sites/" prefix is required now -
# confirm that these are the pages that should be crawled.
nested_filter = [x for x in nested_links_saved if x.startswith("https://www.forbes.com/sites/")]

# Compare against the link VALUES; "x in base_df['link']" tests the index labels.
known_links = set(base_df['link'])

nest_frames = []
for link in nested_filter:
    driver.get(link)
    click(9)
    all_hrefs = get_unique_hrefs()
    unique_hrefs = all_hrefs[0]
    # only get content if the link is different from what we got already
    unique = [x for x in unique_hrefs if x not in known_links]
    # Keep every page's articles instead of overwriting the previous result
    nest_frames.append(add_article(unique))
    known_links.update(unique)

# Close this browser session; the following cells start their own on the same profile.
driver.quit()

if nest_frames:
    nest_df = pd.concat(nest_frames, ignore_index=True)
else:
    nest_df = pd.DataFrame(columns=['link', 'title', 'text', 'view', 'topic', 'time', 'num_img'])

# The cleaning helper defined in this notebook is clean(); replace() does not exist.
nest_df_clean = clean(nest_df)
nest_df_clean.to_csv("nest_df_clean.csv")

In [6]:
# Nested URLs found during the earlier web scrape that belong to individual authors or to paid programs
authors_lifestyle = ['https://www.forbes.com/sites/abigailabesamis/',
 'https://www.forbes.com/sites/alexledsom/',
 'https://www.forbes.com/sites/angelinavillaclarke/',
 'https://www.forbes.com/sites/benbaldanza/',
 'https://www.forbes.com/sites/billroberson/',
 'https://www.forbes.com/sites/brandonschultz/',
 'https://www.forbes.com/sites/brittanyanas/',
 'https://www.forbes.com/sites/carltonreid/',
 'https://www.forbes.com/sites/carolbesler/',
 'https://www.forbes.com/sites/catherinesabino/',
 'https://www.forbes.com/sites/ceciliapelloux/',
 'https://www.forbes.com/sites/ceciliarodriguez/',
 'https://www.forbes.com/sites/cheryltiu/',
 'https://www.forbes.com/sites/chrisobrien/',
 'https://www.forbes.com/sites/christopherelliott/',
 'https://www.forbes.com/sites/chuckbolotin/',
 'https://www.forbes.com/sites/crowe/#488a13636b97',
 'https://www.forbes.com/sites/dalebuss/',
 'https://www.forbes.com/sites/danidiplacido/',
 'https://www.forbes.com/sites/daphneewingchow/',
 'https://www.forbes.com/sites/deloitte/',
 'https://www.forbes.com/sites/emilyprice/',
 'https://www.forbes.com/sites/ericrosen/',
 'https://www.forbes.com/sites/eustaciahuen/',
 'https://www.forbes.com/sites/forbespr/',
 'https://www.forbes.com/sites/geoffreymorrison/',
 'https://www.forbes.com/sites/googlecloud/',
 'https://www.forbes.com/sites/grantmartin/',
 'https://www.forbes.com/sites/honeywell/#168235653d39',
 'https://www.forbes.com/sites/isisbriones/',
 'https://www.forbes.com/sites/jacknerad2/',
 'https://www.forbes.com/sites/jamesmorris/',
 'https://www.forbes.com/sites/jamiewareham/',
 'https://www.forbes.com/sites/japan/',
 'https://www.forbes.com/sites/jaredranahan/',
 'https://www.forbes.com/sites/jasonfogelson/',
 'https://www.forbes.com/sites/jayginsbach/',
 'https://www.forbes.com/sites/jeanneobriencoffey/',
 'https://www.forbes.com/sites/jerylbrunner/',
 'https://www.forbes.com/sites/jimhenry/',
 'https://www.forbes.com/sites/jimrossi/',
 'https://www.forbes.com/sites/johannaread/',
 'https://www.forbes.com/sites/jordilippemcgraw/',
 'https://www.forbes.com/sites/josephdeacetis/',
 'https://www.forbes.com/sites/jumio/',
 'https://www.forbes.com/sites/kaeliconforti/',
 'https://www.forbes.com/sites/katiechang/',
 'https://www.forbes.com/sites/kimwesterman/',
 'https://www.forbes.com/sites/kyleedward/',
 'https://www.forbes.com/sites/lanabortolot/',
 'https://www.forbes.com/sites/lizazimmerman/',
 'https://www.forbes.com/sites/loisaltermark/',
 'https://www.forbes.com/sites/markewing/',
 'https://www.forbes.com/sites/matthewcatellier/',
 'https://www.forbes.com/sites/matthewerskine/',
 'https://www.forbes.com/sites/micheleherrmann/',
 'https://www.forbes.com/sites/michelerobson/',
 'https://www.forbes.com/sites/michellegross/',
 'https://www.forbes.com/sites/michiganeconomicdevelopmentcorporation/',
 'https://www.forbes.com/sites/mikeespindle/',
 'https://www.forbes.com/sites/mitsubishiheavyindustries/',
 'https://www.forbes.com/sites/nancyolson/',
 'https://www.forbes.com/sites/natashagural/',
 'https://www.forbes.com/sites/newyorklifeinvestments/',
 'https://www.forbes.com/sites/nicoletrilivas/',
 'https://www.forbes.com/sites/nikkifrias/',
 'https://www.forbes.com/sites/nomanazish/',
 'https://www.forbes.com/sites/officedepotofficemax/#3ec2327d5512',
 'https://www.forbes.com/sites/peterlyon/',
 'https://www.forbes.com/sites/rachelingram/',
 'https://www.forbes.com/sites/ramseyqubein/',
 'https://www.forbes.com/sites/rebeccahughes/',
 'https://www.forbes.com/sites/roberthart/',
 'https://www.forbes.com/sites/roberthoban/',
 'https://www.forbes.com/sites/robinraven/',
 'https://www.forbes.com/sites/rogersands/',
 'https://www.forbes.com/sites/salesforce/',
 'https://www.forbes.com/sites/sandramacgregor/',
 'https://www.forbes.com/sites/sap/',
 'https://www.forbes.com/sites/sarahturner/',
 'https://www.forbes.com/sites/scottkramer/',
 'https://www.forbes.com/sites/servicenow/',
 'https://www.forbes.com/sites/square/',
 'https://www.forbes.com/sites/stephanrabimov/',
 'https://www.forbes.com/sites/stevebaltin/',
 'https://www.forbes.com/sites/suzannerowankelleher/',
 'https://www.forbes.com/sites/suziedundas/',
 'https://www.forbes.com/sites/tanyaklich/',
 'https://www.forbes.com/sites/tanyamohn/',
 'https://www.forbes.com/sites/tmobile/',
 'https://www.forbes.com/sites/westernbonime/',
 'https://www.forbes.com/sites/willmcgough/',
 'https://www.forbes.com/sites/willyakowicz/']

In [None]:
PATH = r"C:\chromedriver.exe"  # raw string: "\c" is not a valid escape sequence
options = Options()
options.add_argument('--no-sandbox')
options.add_argument("start-maximized")
options.add_argument("user-data-dir=C:\\Users\\alice\\AppData\\Local\\Google\\Chrome\\User Data")
driver = webdriver.Chrome(executable_path=PATH, options=options)

# Compare against the link VALUES; "x in base_df['link']" tests the index labels.
known_links = set(base_df['link'])

author_frames = []
# NOTE(review): starts at index 47, presumably because the first 47 author
# pages were handled in an earlier run - confirm before re-running.
for url in authors_lifestyle[47:]:
    driver.get(url)
    click(4)
    all_hrefs = get_unique_hrefs()
    unique_hrefs = all_hrefs[0]
    unique_hrefs_filter = [x for x in unique_hrefs if x not in known_links]
    # Keep every author's articles instead of overwriting the previous result
    author_frames.append(add_article(unique_hrefs_filter))
    known_links.update(unique_hrefs_filter)

# Quit only after the last author page: quitting inside the loop ended the
# browser session after the first URL.
driver.quit()

if author_frames:
    df_author3 = pd.concat(author_frames, ignore_index=True)
else:
    df_author3 = pd.DataFrame(columns=['link', 'title', 'text', 'view', 'topic', 'time', 'num_img'])
df_author3.to_csv('df_author3.csv')

In [None]:
# Get num_img for those links we got without
# Read every line of the file and split it on single spaces (the token
# positions therefore stay the same as before for a one-line file).
no_num_img = []
with open('no_num_img.txt') as f:
    for line in f:
        no_num_img.extend(line.split(' '))

# Strip surrounding whitespace, quotes, commas and list brackets from each
# token, so the last link no longer keeps a trailing "]" or newline.
no_num_img_clean = [token.strip().strip('[],"') for token in no_num_img]

# NOTE(review): starts at index 38, presumably because the first 38 links were
# handled in an earlier run - confirm before re-running.
no_num_img_rest = [x for x in no_num_img_clean[38:] if x.startswith("https://www.forbes.com/sites/") and len(x) >= 80]

PATH = r"C:\chromedriver.exe"  # raw string: "\c" is not a valid escape sequence
options = Options()
options.add_argument('--no-sandbox')
options.add_argument("start-maximized")
options.add_argument("user-data-dir=C:\\Users\\alice\\AppData\\Local\\Google\\Chrome\\User Data")
driver = webdriver.Chrome(executable_path=PATH, options=options)

# Time the whole scrape (seconds)
start = time.time()
now_num_img_rest = add_article(no_num_img_rest)
end = time.time()
print(end-start)

now_num_img_rest.to_csv('now_num_img_rest.csv')

In [None]:
# Close the browser session held by the global driver.
driver.quit()

### Scraping from Internet Archive:

In [None]:
# Wayback Machine (Internet Archive) snapshot URLs of past Forbes pages
urls_2020 = ["https://web.archive.org/web/20200229010432/https://www.forbes.com",
             "https://web.archive.org/web/20200331062207/https://www.forbes.com",
             "https://web.archive.org/web/20200430000533/https://www.forbes.com",
             "https://web.archive.org/web/20200429234821/https://www.forbes.com/",
             "https://web.archive.org/web/20200531011545/https://www.forbes.com",
             "https://web.archive.org/web/20200630013536/https://www.forbes.com",
             "https://web.archive.org/web/20200731033813/https://www.forbes.com",
             "https://web.archive.org/web/20200831015140/https://www.forbes.com",
             "https://web.archive.org/web/20200930014039/https://www.forbes.com",
            "https://web.archive.org/web/20201031001523/https://www.forbes.com"]
urls_leadership = ["https://web.archive.org/web/20200131024636if_/https://www.forbes.com/leadership/#2a7e3e111d66",
                  "https://web.archive.org/web/20200228181820if_/https://www.forbes.com/leadership/#736555471d66",
                  "https://web.archive.org/web/20200331105222if_/https://www.forbes.com/leadership/#79d88fda1d66",
                  "https://web.archive.org/web/20200430223858if_/https://www.forbes.com/leadership/#257e37361d66",
                  "https://web.archive.org/web/20200531163926if_/https://www.forbes.com/leadership/#462a36ad1d66",
                  "https://web.archive.org/web/20200629232036if_/https://www.forbes.com/leadership/#4de1ed181d66",
                  "https://web.archive.org/web/20200730233119if_/https://www.forbes.com/leadership/#29e0d7e91d66",
                  "https://web.archive.org/web/20200831210608if_/https://www.forbes.com/leadership/#4c771921d66d",
                  "https://web.archive.org/web/20200930230049if_/https://www.forbes.com/leadership/#7fdf47781d66",
                  "https://web.archive.org/web/20201031154325if_/https://www.forbes.com/leadership/?sh=3d4013f1d66d",
                  "https://web.archive.org/web/20200229185802if_/https://www.forbes.com/travel/#2e100999463a",
                  "https://web.archive.org/web/20200525032741if_/https://www.forbes.com/travel/#54cf2eee463a",
                  "https://web.archive.org/web/20200627050939if_/https://www.forbes.com/travel/#79de712b463a",
                  "https://web.archive.org/web/20200831062900if_/https://www.forbes.com/travel/#3ae47577463a",
                  "https://web.archive.org/web/20201118053258if_/https://www.forbes.com/travel/?sh=55b790f7463a"]
urls_life = ["https://web.archive.org/web/20200131033836if_/https://www.forbes.com/lifestyle/#343de15022d1",
            "https://web.archive.org/web/20200222143426if_/https://www.forbes.com/lifestyle/#48f2335b22d1",
            "https://web.archive.org/web/20200331122830/https://www.forbes.com/lifestyle/",
            "https://web.archive.org/web/20200430220427if_/https://www.forbes.com/lifestyle/#7a4ea5e422d1",
            "https://web.archive.org/web/20200531163955if_/https://www.forbes.com/lifestyle/#4eeaa90222d1",
            "https://web.archive.org/web/20200630213502if_/https://www.forbes.com/lifestyle/#65a798e022d1",
            "https://web.archive.org/web/20200725151057if_/https://www.forbes.com/lifestyle/#5b6ffcf22d15",
            "https://web.archive.org/web/20200831155136if_/https://www.forbes.com/lifestyle/#47a7765e22d1",
             "https://web.archive.org/web/20200924192151if_/https://www.forbes.com/lifestyle/#5ab0be7f22d1",
            "https://web.archive.org/web/20201031154333if_/https://www.forbes.com/lifestyle/?sh=4f4381a22d15"]
urls_inno = ["https://web.archive.org/web/20200131024046if_/https://www.forbes.com/innovation/#18f8e74d6834",
             "https://web.archive.org/web/20200229205148if_/https://www.forbes.com/innovation/#19682f336834",
             "https://web.archive.org/web/20200331104725if_/https://www.forbes.com/innovation/#60e402d66834",
             "https://web.archive.org/web/20200430233757if_/https://www.forbes.com/innovation/#6e3447146834",
             "https://web.archive.org/web/20200531101716if_/https://www.forbes.com/innovation/#5dbc622d6834",
             "https://web.archive.org/web/20200630225447if_/https://www.forbes.com/innovation/#12a0a3cb6834",
             "https://web.archive.org/web/20200731215241if_/https://www.forbes.com/innovation/#580cee3d6834",
             "https://web.archive.org/web/20200831211231if_/https://www.forbes.com/innovation/#2c0d4c9a6834",
             "https://web.archive.org/web/20200930020213if_/https://www.forbes.com/innovation/#776f0deb6834",
             "https://web.archive.org/web/20201031002146if_/https://www.forbes.com/innovation/?sh=6e583d4f6834"]
urls_busi = ["https://web.archive.org/web/20200131024950if_/https://www.forbes.com/business/#55113751535f",
            "https://web.archive.org/web/20200228181954if_/https://www.forbes.com/business/#9b6d2f2535fd",
            "https://web.archive.org/web/20200331105430if_/https://www.forbes.com/business/#571a8a34535f",
            "https://web.archive.org/web/20200430042859if_/https://www.forbes.com/business/#7df84e7a535f",
            "https://web.archive.org/web/20200630225244if_/https://www.forbes.com/business/#57ced1f2535f",
            "https://web.archive.org/web/20200930053706if_/https://www.forbes.com/business/#26b3ce28535f"]
urls_money = ["https://web.archive.org/web/20200131141807if_/https://www.forbes.com/money/#64875cd5c19a",
             "https://web.archive.org/web/20200229185510if_/https://www.forbes.com/money/#675d6a5fc19a",
             "https://web.archive.org/web/20200331105345if_/https://www.forbes.com/money/#4269cf93c19a",
             "https://web.archive.org/web/20200430223903if_/https://www.forbes.com/money/#537e51bbc19a",
             "https://web.archive.org/web/20200521232924if_/https://www.forbes.com/money/#da1f780c19aa",
             "https://web.archive.org/web/20201030214257if_/https://www.forbes.com/money/?sh=2af6cd5cc19a"]

# Round 2 of Internet Archive
urls_leader_2 = ["https://web.archive.org/web/20200506220747if_/https://www.forbes.com/leadership/#3707d2141d66",
                "https://web.archive.org/web/20200512065447if_/https://www.forbes.com/leadership/#240a1e721d66",
                "https://web.archive.org/web/20200715221933if_/https://www.forbes.com/leadership/#594fc00c1d66"]
urls_inno_2 = ["https://web.archive.org/web/20200113231742if_/https://www.forbes.com/innovation/#4ce2e2326834",
              "https://web.archive.org/web/20200415172056if_/https://www.forbes.com/innovation/#6e0ba5e36834",
              "https://web.archive.org/web/20200519211431if_/https://www.forbes.com/innovation/#59f9bd296834",
              "https://web.archive.org/web/20200716135325if_/https://www.forbes.com/innovation/#996bd676834a",
              "https://web.archive.org/web/20200817143550if_/https://www.forbes.com/innovation/#63f9166a6834",
              "https://web.archive.org/web/20200915192705if_/https://www.forbes.com/innovation/#6d1c38fb6834"]
urls_mon_2 =["https://web.archive.org/web/20200112081514if_/https://www.forbes.com/money/#5f724306c19a",
            "https://web.archive.org/web/20200309224404if_/https://www.forbes.com/money/#35983630c19a",
            "https://web.archive.org/web/20200413205324if_/https://www.forbes.com/money/#6bb17754c19a",
            "https://web.archive.org/web/20200617155844if_/https://www.forbes.com/money/#e811002c19aa",
            "https://web.archive.org/web/20200916150632if_/https://www.forbes.com/money/#4934116ac19a"]

# Round 3 of Internet Archive
urls_lead_3 = ["https://web.archive.org/web/20200121035820if_/https://www.forbes.com/leadership/#7eb056161d66",
                "https://web.archive.org/web/20200708174348if_/https://www.forbes.com/leadership/#1b76d6601d66",
                "https://web.archive.org/web/20200309224256if_/https://www.forbes.com/leadership/#47372e771d66",
                "https://web.archive.org/web/20200907184232if_/https://www.forbes.com/leadership/#1700a1361d66",
                "https://web.archive.org/web/20201009083341if_/https://www.forbes.com/leadership/#543eafc21d66",
                "https://web.archive.org/web/20200808111343if_/https://www.forbes.com/leadership/#7b16ca291d66"]
urls_inno_3 = ["https://web.archive.org/web/20200121210957if_/https://www.forbes.com/innovation/#2180d3a26834",
                "https://web.archive.org/web/20200317022330if_/https://www.forbes.com/innovation/#5abd9f6d6834",
                "https://web.archive.org/web/20200211223738if_/https://www.forbes.com/innovation/#4d66a4746834",
                "https://web.archive.org/web/20200411194717if_/https://www.forbes.com/innovation/#2e98409b6834",
                "https://web.archive.org/web/20200509164059if_/https://www.forbes.com/innovation/#7a0fbaac6834",
                "https://web.archive.org/web/20200617170224if_/https://www.forbes.com/innovation/#3b4b675b6834",
                "https://web.archive.org/web/20200707232627if_/https://www.forbes.com/innovation/#600a29326834"]
urls_life_3 = ["https://web.archive.org/web/20200308213146if_/https://www.forbes.com/lifestyle/#59ffa73922d1",
              "https://web.archive.org/web/20200708111958if_/https://www.forbes.com/lifestyle/#13a761df22d1",
              "https://web.archive.org/web/20200809113436if_/https://www.forbes.com/lifestyle/#77e04df222d1",
              "https://web.archive.org/web/20201019013144if_/https://www.forbes.com/lifestyle/#6edbefdf22d1"]
urls_mon_3 = ["https://web.archive.org/web/20200609170822if_/https://www.forbes.com/money/#3f1df169c19a",
              "https://web.archive.org/web/20200810004310if_/https://www.forbes.com/money/#32623da8c19a",
              "https://web.archive.org/web/20200907170418if_/https://www.forbes.com/money/#7256ab5ac19a",
              "https://web.archive.org/web/20200809121055if_/https://www.forbes.com/money/#6a5190a2c19a"]
urls_busi_3 = ["https://web.archive.org/web/20200715072547if_/https://www.forbes.com/business/#5bcd898f535f",
              "https://web.archive.org/web/20201019031733if_/https://www.forbes.com/business/#6b3cdeac535f",
              "https://web.archive.org/web/20200818010208if_/https://www.forbes.com/business/#33e67774535f"]

# Wayback Machine snapshots of the Forbes /healthcare/ and /green-tech/ pages.
# NOTE(review): despite the name, no /innovation/ snapshot is listed here.
month_inno = ["https://web.archive.org/web/20200101163634if_/https://www.forbes.com/healthcare/#427537b3b4ea",
      "https://web.archive.org/web/20200115114752if_/https://www.forbes.com/healthcare/#2f880e903b4e",
       "https://web.archive.org/web/20200201025524if_/https://www.forbes.com/healthcare/#1e6cabe53b4e",
       "https://web.archive.org/web/20200229185129if_/https://www.forbes.com/healthcare/#2aea0bd13b4e",
      "https://web.archive.org/web/20200304203446if_/https://www.forbes.com/healthcare/#2b820a653b4e",
      "https://web.archive.org/web/20200325201125if_/https://www.forbes.com/healthcare/#142333f3b4ea",
      "https://web.archive.org/web/20200426232950if_/https://www.forbes.com/healthcare/#78e35e8f3b4e",
      "https://web.archive.org/web/20200517113608if_/https://www.forbes.com/healthcare/#181b7f3b3b4e",
      "https://web.archive.org/web/20200531163827if_/https://www.forbes.com/healthcare/#749954323b4e",
      "https://web.archive.org/web/20200629182743if_/https://www.forbes.com/healthcare/#5d35c5983b4e",
      "https://web.archive.org/web/20200701160049if_/https://www.forbes.com/healthcare/#578c021f3b4e",
      "https://web.archive.org/web/20200418082456if_/https://www.forbes.com/healthcare/#5a84029c3b4e",
      # NOTE(review): same snapshot timestamp (20200517113608) as an entry above; only the fragment differs
      "https://web.archive.org/web/20200517113608if_/https://www.forbes.com/healthcare/#181b79ca3b4e",
      "https://web.archive.org/web/20200606081749if_/https://www.forbes.com/healthcare/#751521f43b4e",
      # NOTE(review): several green-tech entries below use the timestamp as the host
      # ("https://<timestamp>/forbes.com/...") instead of "https://www.forbes.com/..." -
      # this looks malformed; verify that these URLs resolve.
      "https://web.archive.org/web/20200116053443if_/https://20200116053443/forbes.com/green-tech/#18f8c19f48d5",
      "https://web.archive.org/web/20200229185128if_/https://20200229185128/forbes.com/green-tech/#711a4895337d",
      "https://web.archive.org/web/20200329095648if_/https://20200329095648/forbes.com/green-tech/#929096e289ea",
      "https://web.archive.org/web/20200426232947if_/https://www.forbes.com/green-tech/#76e35d4a1f6c",
      "https://web.archive.org/web/20200323164231if_/https://20200323164231/forbes.com/green-tech/#54363c073bb4",
      # NOTE(review): the next two entries are identical
      "https://web.archive.org/web/20200416050347if_/https://20200416050347/forbes.com/green-tech/#133f315f37e4",
      "https://web.archive.org/web/20200416050347if_/https://20200416050347/forbes.com/green-tech/#133f315f37e4",
       "https://web.archive.org/web/20200524232939if_/https://www.forbes.com/green-tech/#38f540101f6c",
        "https://web.archive.org/web/20200602162152if_/https://www.forbes.com/green-tech/#7426daba1f6c"]

# Wayback Machine snapshots of the Forbes /worlds-billionaires/ and /fintech/ pages.
# NOTE(review): despite the name, no /leadership/ snapshot is listed here.
month_lead = ["https://web.archive.org/web/20200113230000if_/https://www.forbes.com/worlds-billionaires/#4192b5b25864",
      "https://web.archive.org/web/20200307211918if_/https://www.forbes.com/worlds-billionaires/#7f9de71e5864",
      "https://web.archive.org/web/20200413202739if_/https://www.forbes.com/worlds-billionaires/#6f2a081e5864",
      "https://web.archive.org/web/20200713090633if_/https://www.forbes.com/worlds-billionaires/#154d06ee5864",
      "https://web.archive.org/web/20200331103200if_/https://www.forbes.com/worlds-billionaires/#2975e3605864",
      "https://web.archive.org/web/20200519010416if_/https://www.forbes.com/worlds-billionaires/#45f5bcaa5864",
      "https://web.archive.org/web/20200609170941if_/https://www.forbes.com/worlds-billionaires/#488f25475864",
      "https://web.archive.org/web/20200629194739if_/https://www.forbes.com/worlds-billionaires/#630ef1695864",
      "https://web.archive.org/web/20200815200153if_/https://www.forbes.com/worlds-billionaires/#7016ebe25864",
      "https://web.archive.org/web/20200914002404if_/https://www.forbes.com/worlds-billionaires/#310fb3cc5864",
      "https://web.archive.org/web/20200327143404if_/https://www.forbes.com/worlds-billionaires/#2f0a0d735864",
      "https://web.archive.org/web/20200108181506if_/https://www.forbes.com/fintech/#fee068513f1a",
      "https://web.archive.org/web/20200328095309if_/https://www.forbes.com/fintech/#610f3a1613f1",
      "https://web.archive.org/web/20200430025831if_/https://www.forbes.com/fintech/#2cb57c9613f1",
      "https://web.archive.org/web/20200412192957if_/https://www.forbes.com/fintech/#6dbee02c13f1",
      "https://web.archive.org/web/20200512122602if_/https://www.forbes.com/fintech/#58395dfa13f1",
      "https://web.archive.org/web/20200609051935if_/https://www.forbes.com/fintech/#69541d313f1a",
      "https://web.archive.org/web/20200701050146if_/https://www.forbes.com/fintech/#7790dc8013f1",
      "https://web.archive.org/web/20200803204133if_/https://www.forbes.com/fintech/#441f45d013f1",
      "https://web.archive.org/web/20200902213205if_/https://www.forbes.com/fintech/#600be60b13f1"]
    
# Wayback Machine snapshot URLs of the forbes.com "spirits", "manufacturing" and
# "education" section pages (only one education snapshot; the rest are in month_2).
# Passed to get_url in the link-collection cell below (result stored as `month_third`).
month_1 = ["https://web.archive.org/web/20200112224558if_/https://www.forbes.com/spirits/#22cf82eaa74f",
      "https://web.archive.org/web/20200129011732if_/https://www.forbes.com/spirits/#612ffbb5a74f",
      "https://web.archive.org/web/20200229205231if_/https://www.forbes.com/spirits/#60d8f6bca74f",
      "https://web.archive.org/web/20200309224609if_/https://www.forbes.com/spirits/#1bca284fa74f",
      "https://web.archive.org/web/20200331105636if_/https://www.forbes.com/spirits/#76fc7406a74f",
      "https://web.archive.org/web/20200401204137if_/https://www.forbes.com/spirits/#524a6780a74f",
      "https://web.archive.org/web/20200430102421if_/https://www.forbes.com/spirits/#30adafe2a74f",
      "https://web.archive.org/web/20200531192553if_/https://www.forbes.com/spirits/#e29f94a74f41",
      "https://web.archive.org/web/20200518124221if_/https://www.forbes.com/spirits/#417e7777a74f",
      "https://web.archive.org/web/20200723235644if_/https://www.forbes.com/spirits/#474ba28ca74f",
      "https://web.archive.org/web/20200112224425if_/https://www.forbes.com/manufacturing/#657e126460c1",
      "https://web.archive.org/web/20200605014911if_/https://www.forbes.com/manufacturing/#1fbaf18160c1",
      "https://web.archive.org/web/20200629062104if_/https://www.forbes.com/manufacturing/#372c7b8860c1",
      "https://web.archive.org/web/20200902004342if_/https://www.forbes.com/manufacturing/#135500fa60c1",
      "https://web.archive.org/web/20200109164436if_/https://www.forbes.com/education/#2a51860261de"]

# Wayback Machine snapshot URLs of the forbes.com "education", "policy" and
# "diversity-inclusion" section pages.
# Passed to get_url in the link-collection cell below (result stored as `month_fourth`).
month_2 = ["https://web.archive.org/web/20200225173746if_/https://www.forbes.com/education/#79ad198161de",
      "https://web.archive.org/web/20200303201242if_/https://www.forbes.com/education/#5547756361de",
      "https://web.archive.org/web/20200522171031if_/https://www.forbes.com/education/#552d743761de",
      "https://web.archive.org/web/20200701203622if_/https://www.forbes.com/education/#6f983f6361de",
      "https://web.archive.org/web/20200617011834if_/https://www.forbes.com/education/#2b3b421461de",
      "https://web.archive.org/web/20200815183533if_/https://www.forbes.com/education/#17f7d22061de",
      "https://web.archive.org/web/20200112161256if_/https://www.forbes.com/policy/#5c07a5eb2c48",
      "https://web.archive.org/web/20200130020240if_/https://www.forbes.com/policy/#1d8faa232c48",
      "https://web.archive.org/web/20200229185629if_/https://www.forbes.com/policy/#70ce9a782c48",
      "https://web.archive.org/web/20200216032426if_/https://www.forbes.com/policy/#7836a9372c48",
      "https://web.archive.org/web/20200115114726if_/https://www.forbes.com/diversity-inclusion/#73b7b34b46d2",
      "https://web.archive.org/web/20200201025917if_/https://www.forbes.com/diversity-inclusion/#62b034e846d2",
      "https://web.archive.org/web/20200303201241if_/https://www.forbes.com/diversity-inclusion/#7f477b9846d2",
      "https://web.archive.org/web/20200914033314if_/https://www.forbes.com/diversity-inclusion/#56ccfcec46d2",
      "https://web.archive.org/web/20201007233305if_/https://www.forbes.com/diversity-inclusion/#67d970d546d2"]

# Wayback Machine snapshot URLs of the forbes.com "diversity-inclusion", "big-data"
# and "careers" section pages.
# Passed to get_url in the link-collection cell below (result stored as `month_fifth`).
month_3 = ["https://web.archive.org/web/20200229185436if_/https://www.forbes.com/diversity-inclusion/#7b9cdf7046d2",
      "https://web.archive.org/web/20200522080751if_/https://www.forbes.com/diversity-inclusion/#755c9f8046d2",
      "https://web.archive.org/web/20200630013207if_/https://www.forbes.com/diversity-inclusion/#779a547246d2",
      "https://web.archive.org/web/20200820083312if_/https://www.forbes.com/diversity-inclusion/#72402ce146d2",
      "https://web.archive.org/web/20200225155617if_/https://www.forbes.com/diversity-inclusion/#3710436746d2",
      "https://web.archive.org/web/20200229205132if_/https://www.forbes.com/big-data/#3987ef3216d1",
      "https://web.archive.org/web/20200304203151if_/https://www.forbes.com/big-data/#62cf5ebd16d1",
      "https://web.archive.org/web/20200115020109if_/https://www.forbes.com/big-data/#28aeddd916d1",
      "https://web.archive.org/web/20200526131936if_/https://www.forbes.com/big-data/#4123767e16d1",
      "https://web.archive.org/web/20200316015017if_/https://www.forbes.com/big-data/#6078e21b16d1",
      "https://web.archive.org/web/20200227173654if_/https://www.forbes.com/careers/#49690ac61d29",
      "https://web.archive.org/web/20200202034301if_/https://www.forbes.com/careers/#67be98e81d29",
      "https://web.archive.org/web/20200110200537if_/https://www.forbes.com/careers/#29ffec641d29",
      "https://web.archive.org/web/20200728041350if_/https://www.forbes.com/careers/#641071411d29"]
# Wayback Machine snapshot URLs of the forbes.com "careers", "social-media" and
# "arts" section pages.
# Passed to get_url in the link-collection cell below (result stored as `month_sixth`).
# NOTE(review): the careers snapshots stamped 20200227173654 and 20200110200537 also
# appear in month_3; only the trailing "#..." fragment differs, so they look like
# repeats of the same snapshot -- confirm whether they are needed twice.
month_4 = ["https://web.archive.org/web/20200914191504if_/https://www.forbes.com/careers/#46aad4501d29",
      "https://web.archive.org/web/20200227173654if_/https://www.forbes.com/careers/#5e9903f41d29",
      "https://web.archive.org/web/20200110200537if_/https://www.forbes.com/careers/#15dfebe01d29",
      "https://web.archive.org/web/20200225155518if_/https://www.forbes.com/social-media/#658f4c5c410f",
      "https://web.archive.org/web/20200723235633if_/https://www.forbes.com/social-media/#156b745e410f",
      "https://web.archive.org/web/20200621214840if_/https://www.forbes.com/social-media/#2f9ad99a410f",
      "https://web.archive.org/web/20200803204115if_/https://www.forbes.com/social-media/#69be9cad410f",
      "https://web.archive.org/web/20200111215904if_/https://www.forbes.com/social-media/#4b2e2bd8410f",
      "https://web.archive.org/web/20200822163320if_/https://www.forbes.com/social-media/#42479c1410fd",
      "https://web.archive.org/web/20200303203555if_/https://www.forbes.com/social-media/#415cb621410f",
      "https://web.archive.org/web/20200206210612if_/https://www.forbes.com/arts/#14432ffd1b64",
      "https://web.archive.org/web/20200125232831if_/https://www.forbes.com/arts/#6ab916941b64",
      "https://web.archive.org/web/20200722202132if_/https://www.forbes.com/arts/#6bb04e3a1b64",
      "https://web.archive.org/web/20200521035918if_/https://www.forbes.com/arts/#2e72aea01b64",
      "https://web.archive.org/web/20200818003735if_/https://www.forbes.com/arts/#5a7009821b64"]

# Wayback Machine snapshot URLs of the forbes.com "forbeswomen", "sportsmoney" and
# "hollywood-entertainment" section pages.
# NOTE(review): despite the name, the date stamps in these URLs run from 202003 to
# 202010, and the scraped result is stored as `jun_links` in the link-collection
# cell below -- the month in the name does not describe the contents.
jul = ["https://web.archive.org/web/20200922013405if_/https://www.forbes.com/forbeswomen/#6f52687b621e",
      "https://web.archive.org/web/20200907170555if_/https://www.forbes.com/forbeswomen/#a1852b4621ee",
      "https://web.archive.org/web/20201001062204if_/https://www.forbes.com/forbeswomen/#6da35d0e621e",
      "https://web.archive.org/web/20200819190630if_/https://www.forbes.com/forbeswomen/#3f8dad08621e",
      "https://web.archive.org/web/20200728041051if_/https://www.forbes.com/forbeswomen/#435dd23f621e",
      "https://web.archive.org/web/20200914133406if_/https://www.forbes.com/sportsmoney/#6612b5547ff6",
      "https://web.archive.org/web/20201019162747if_/https://www.forbes.com/sportsmoney/#3b20c41b7ff6",
      "https://web.archive.org/web/20200426233554if_/https://www.forbes.com/sportsmoney/#3e48e75f7ff6",
      "https://web.archive.org/web/20200829212139if_/https://www.forbes.com/sportsmoney/#10b902167ff6",
      "https://web.archive.org/web/20200504044328if_/https://www.forbes.com/sportsmoney/#449f04117ff6",
      "https://web.archive.org/web/20200911082956if_/https://www.forbes.com/sportsmoney/#41c930557ff6",
      "https://web.archive.org/web/20200528025748if_/https://www.forbes.com/sportsmoney/#6f6717467ff6",
      "https://web.archive.org/web/20200319040254if_/https://www.forbes.com/sportsmoney/#679560927ff6",
      "https://web.archive.org/web/20200305060928if_/https://www.forbes.com/hollywood-entertainment/#68403c615699"]

# Wayback Machine snapshot URLs of the forbes.com "hollywood-entertainment" and
# "personal-finance" section pages. Passed to get_url in the link-collection cell
# below (result stored as `aug_links`).
# NOTE(review): the name is not a reliable guide to the snapshot dates (the first
# entry is stamped 20200425). The hollywood-entertainment snapshot stamped
# 20200305060928 also appears in `jul` with a different "#..." fragment.
aug = ["https://web.archive.org/web/20200425062356if_/https://www.forbes.com/hollywood-entertainment/#4c41c1155699",
      "https://web.archive.org/web/20200305060928if_/https://www.forbes.com/hollywood-entertainment/#7f902baf5699",
      "https://web.archive.org/web/20200922002243if_/https://www.forbes.com/hollywood-entertainment/#7e61245c5699",
      "https://web.archive.org/web/20200912012831if_/https://www.forbes.com/hollywood-entertainment/#229dddeb5699",
      "https://web.archive.org/web/20200723014258if_/https://www.forbes.com/hollywood-entertainment/#462681ae5699",
      "https://web.archive.org/web/20200603125131if_/https://www.forbes.com/hollywood-entertainment/#7cdcc0c65699",
      "https://web.archive.org/web/20200518080303if_/https://www.forbes.com/hollywood-entertainment/#150ece9c5699",
      "https://web.archive.org/web/20201029205427if_/https://www.forbes.com/hollywood-entertainment/?sh=1f1428fa5699",
      "https://web.archive.org/web/20200506101947if_/https://www.forbes.com/hollywood-entertainment/#41ffae175699",
      "https://web.archive.org/web/20200809025704if_/https://www.forbes.com/hollywood-entertainment/#4d668d305699",
      "https://web.archive.org/web/20200915025341if_/https://www.forbes.com/personal-finance/#39aef548ebb0",
      "https://web.archive.org/web/20201011023817if_/https://www.forbes.com/personal-finance/#13061d88ebb0",
      "https://web.archive.org/web/20200916090741if_/https://www.forbes.com/personal-finance/#113bbeeaebb0",
      "https://web.archive.org/web/20200524164953if_/https://www.forbes.com/personal-finance/#678789b2ebb0"]

# Wayback Machine snapshot URLs of the forbes.com "personal-finance" (one entry),
# "forbes-travel-guide" and "leadership-strategy" section pages. Passed to get_url
# in the link-collection cell below (result stored as `sep_links`).
# NOTE(review): the name is not a reliable guide to the snapshot dates (the first
# entry is stamped 20200806).
sep = ["https://web.archive.org/web/20200806172939if_/https://www.forbes.com/personal-finance/#12d24c1febb0",
      "https://web.archive.org/web/20200522174457if_/https://www.forbes.com/forbes-travel-guide/#66ccf1674700",
      "https://web.archive.org/web/20200914175213if_/https://www.forbes.com/forbes-travel-guide/#5c3f253d4700",
      "https://web.archive.org/web/20201029034206if_/https://www.forbes.com/forbes-travel-guide/#2b7323e84700",
      "https://web.archive.org/web/20200522110747if_/https://www.forbes.com/forbes-travel-guide/#7821543f4700",
      "https://web.archive.org/web/20200701035549if_/https://www.forbes.com/forbes-travel-guide/#5ad443c94700",
      "https://web.archive.org/web/20200818131114if_/https://www.forbes.com/forbes-travel-guide/#67d202444700",
      "https://web.archive.org/web/20201019023306if_/https://www.forbes.com/forbes-travel-guide/#1da449d64700",
      "https://web.archive.org/web/20200913210440if_/https://www.forbes.com/forbes-travel-guide/#2988e1324700",
      "https://web.archive.org/web/20200915174427if_/https://www.forbes.com/leadership-strategy/#304e478c65ab",
      "https://web.archive.org/web/20200914191359if_/https://www.forbes.com/leadership-strategy/#1109dcee65ab",
      "https://web.archive.org/web/20200912201754if_/https://www.forbes.com/leadership-strategy/#4877c10765ab",
      "https://web.archive.org/web/20200225155645if_/https://www.forbes.com/leadership-strategy/#1ee0ae4465ab"]
# Wayback Machine snapshot URLs of the forbes.com "leadership-strategy",
# "style-beauty" and "aerospace-defense" section pages.
# NOTE(review): the second entry has no "if_" marker after its stamp and no "#..."
# fragment, unlike every other entry here -- confirm this is deliberate.
# NOTE(review): this list is not passed to get_url in the link-collection cell
# that follows these definitions -- confirm whether it is scraped elsewhere.
jan1 = ["https://web.archive.org/web/20200110200710if_/https://www.forbes.com/leadership-strategy/#6751666465ab",
       "https://web.archive.org/web/20200403035428/https://www.forbes.com/leadership-strategy/",
       "https://web.archive.org/web/20200704123844if_/https://www.forbes.com/leadership-strategy/#7486137d65ab",
        "https://web.archive.org/web/20200523094125if_/https://www.forbes.com/leadership-strategy/#6268a9d865ab",
       "https://web.archive.org/web/20200604053621if_/https://www.forbes.com/leadership-strategy/#7ac49f7865ab",
       "https://web.archive.org/web/20200915174342if_/https://www.forbes.com/style-beauty/#58bd9f8e3ef7",
       "https://web.archive.org/web/20200903040953if_/https://www.forbes.com/style-beauty/#34d822083ef7",
       "https://web.archive.org/web/20200523050641if_/https://www.forbes.com/style-beauty/#730d207d3ef7",
       "https://web.archive.org/web/20200826043047if_/https://www.forbes.com/style-beauty/#323862f53ef7",
       "https://web.archive.org/web/20200604053550if_/https://www.forbes.com/style-beauty/#10f4284e3ef7",
       "https://web.archive.org/web/20200418194009if_/https://www.forbes.com/style-beauty/#453e39863ef7",
       "https://web.archive.org/web/20200229205143if_/https://www.forbes.com/aerospace-defense/#6b782f0b2d3f",
       "https://web.archive.org/web/20200131024952if_/https://www.forbes.com/aerospace-defense/#1013b682d3f6",
       "https://web.archive.org/web/20200721190445if_/https://www.forbes.com/aerospace-defense/#4ad391902d3f",
       "https://web.archive.org/web/20200817020628if_/https://www.forbes.com/aerospace-defense/#34bb21b92d3f"]

# Wayback Machine snapshot URLs of the forbes.com "aerospace-defense" and
# "consumer-tech" section pages. Some entries end in "?sh=..." rather than "#...".
# NOTE(review): this list is not passed to get_url in the link-collection cell
# that follows these definitions -- confirm whether it is scraped elsewhere.
feb2 = ["https://web.archive.org/web/20200302194805if_/https://www.forbes.com/aerospace-defense/#1a1a9cc82d3f",
       "https://web.archive.org/web/20200329100118if_/https://www.forbes.com/aerospace-defense/#3b3d3a192d3f",
       "https://web.archive.org/web/20201029174344if_/https://www.forbes.com/aerospace-defense/#79257cfa2d3f",
       "https://web.archive.org/web/20201030174436if_/https://www.forbes.com/aerospace-defense/?sh=66ac9efd2d3f",
       "https://web.archive.org/web/20200919224045if_/https://www.forbes.com/aerospace-defense/#7da70e1f2d3f",
       "https://web.archive.org/web/20200920005007if_/https://www.forbes.com/aerospace-defense/#6e7d73fe2d3f",
        "https://web.archive.org/web/20200609033326if_/https://www.forbes.com/aerospace-defense/#1be3d59b2d3f",
       "https://web.archive.org/web/20200521021645if_/https://www.forbes.com/aerospace-defense/#5454d31c2d3f",
       "https://web.archive.org/web/20200818003417if_/https://www.forbes.com/aerospace-defense/#65dd01f42d3f",
       "https://web.archive.org/web/20200409190537if_/https://www.forbes.com/aerospace-defense/#390560472d3f",
       "https://web.archive.org/web/20200904141325if_/https://www.forbes.com/aerospace-defense/#13873c542d3f",
       "https://web.archive.org/web/20200914211153if_/https://www.forbes.com/consumer-tech/#4055fdb476c6",
       "https://web.archive.org/web/20201030203715if_/https://www.forbes.com/consumer-tech/?sh=4c8ab5ad76c6",
       "https://web.archive.org/web/20200722114616if_/https://www.forbes.com/consumer-tech/#7038867076c6",
       "https://web.archive.org/web/20201019193542if_/https://www.forbes.com/consumer-tech/#5e1c704176c6"]

# Wayback Machine snapshot URLs of the forbes.com "consumer-tech" and
# "transportation" section pages.
# NOTE(review): this list is not passed to get_url in the link-collection cell
# that follows these definitions -- confirm whether it is scraped elsewhere.
mar3 = ["https://web.archive.org/web/20200922002053if_/https://www.forbes.com/consumer-tech/#2a1f9e6576c6",
       "https://web.archive.org/web/20200512225310if_/https://www.forbes.com/consumer-tech/#73d7862376c6",
       "https://web.archive.org/web/20200623005745if_/https://www.forbes.com/consumer-tech/#2c6e534776c6",
        "https://web.archive.org/web/20200223144906if_/https://www.forbes.com/consumer-tech/#7a66038876c6",
       "https://web.archive.org/web/20200304203355if_/https://www.forbes.com/consumer-tech/#9e16d4b76c6d",
       "https://web.archive.org/web/20200317022559if_/https://www.forbes.com/consumer-tech/#6cb0071576c6",
       "https://web.archive.org/web/20200606061918if_/https://www.forbes.com/consumer-tech/#4dc8b14776c6",
       "https://web.archive.org/web/20200125232622if_/https://www.forbes.com/transportation/#15a728e710b3",
       "https://web.archive.org/web/20201030030458if_/https://www.forbes.com/transportation/?sh=2a076a3710b3",
       "https://web.archive.org/web/20200724014918if_/https://www.forbes.com/transportation/#322b2b710b3c",
       "https://web.archive.org/web/20200529223126if_/https://www.forbes.com/transportation/#1fc7e810b3c1",
       "https://web.archive.org/web/20200924192253if_/https://www.forbes.com/transportation/#5fb1c50e10b3",
       "https://web.archive.org/web/20201017094024if_/https://www.forbes.com/transportation/#654ed94410b3",
       "https://web.archive.org/web/20200818004038if_/https://www.forbes.com/transportation/#893322f10b3c"]

# Wayback Machine snapshot URLs of the forbes.com "transportation" section page.
# NOTE(review): this list is not passed to get_url in the link-collection cell
# that follows these definitions -- confirm whether it is scraped elsewhere.
apr4 = ["https://web.archive.org/web/20200321045503if_/https://www.forbes.com/transportation/#26a1e48910b3",
       "https://web.archive.org/web/20200229185646if_/https://www.forbes.com/transportation/#26af09f610b3",
       "https://web.archive.org/web/20200318031612if_/https://www.forbes.com/transportation/#17d464aa10b3",
       "https://web.archive.org/web/20200715074644if_/https://www.forbes.com/transportation/#1a90ae1910b3"]


In [None]:
# Get all the links from those above snapshot links.
# Each statement passes one list of snapshot URLs to get_url (defined outside this
# section of the notebook) and keeps the returned links under a new name.

# First round
url_lead_3 = get_url(urls_lead_3)
url_inno_3 = get_url(urls_inno_3)
# NOTE(review): this reads urls_mon_2, the same input used for url_mon_2 in the
# second round; the neighbouring lines suggest urls_mon_3 may have been intended.
# Left unchanged because urls_mon_3 is not visible here -- confirm before editing.
url_mon_3 = get_url(urls_mon_2)
url_life_3 = get_url(urls_life_3)
url_busi_3 = get_url(urls_busi_3)
url_2020_lst = get_url(urls_2020)
urls_leadership_lst = get_url(urls_leadership)
urls_life_lst = get_url(urls_life)
# NOTE(review): the next three statements overwrite their own inputs, so running
# this cell a second time would pass the scraped links back into get_url.
urls_inno = get_url(urls_inno)
urls_busi = get_url(urls_busi)
urls_money = get_url(urls_money)

# Second round
url_lead_2 = get_url(urls_leader_2)
url_inno_2 = get_url(urls_inno_2)
url_mon_2 = get_url(urls_mon_2)

# Third round
month_first = get_url(month_inno)
# Fixed: a stray trailing backslash used to join this statement with the next one,
# which turned the whole cell into a SyntaxError.
mont_sec = get_url(month_lead)
month_third = get_url(month_1)
month_fourth = get_url(month_2)
month_fifth = get_url(month_3)
month_sixth = get_url(month_4)
# The result names below are consumed by the master_urls cell, so they are kept
# even though `jun_links` is built from the list named `jul`.
jun_links = get_url(jul)
aug_links = get_url(aug)
sep_links = get_url(sep)
# NOTE(review): the jan1, feb2, mar3 and apr4 lists are not scraped in this cell --
# confirm whether that is intentional.

In [None]:
# Flatten every batch of scraped links into a single list, keeping batch order.
master_urls = [
    link
    for batch in (
        url_lead_3, url_inno_3, url_mon_3, url_life_3, url_busi_3,
        url_2020_lst, urls_leadership_lst, urls_life_lst,
        urls_inno, urls_busi, urls_money,
        url_lead_2, url_inno_2, url_mon_2,
        month_first, mont_sec, month_third,
        month_fourth, month_fifth, month_sixth,
        jun_links, aug_links, sep_links,
    )
    for link in batch
]

In [None]:
# Persist the collected links as a one-column table so later cells can reload
# them from disk instead of scraping again.
master_urls_save = pd.DataFrame(data=master_urls)
master_urls_save.columns = ['link']
# The default index is written too, so the file gets a leading unnamed column.
master_urls_save.to_csv(path_or_buf="master_urls.csv")

In [21]:
# Reload the saved link table and keep only its 'link' column for scraping.
archive = pd.read_csv(filepath_or_buffer="master_urls.csv")
arc_links = archive.loc[:, 'link']

In [15]:
# Start a Selenium-driven Chrome session, scrape every archived article link and
# save the result to disk.

# Raw string: "\c" is not a valid escape sequence, so the plain literal only worked
# by accident and triggers an invalid-escape warning on newer Python versions.
# The resulting value is unchanged.
PATH = r"C:\chromedriver.exe"
options = Options()
options.add_argument('--no-sandbox')
options.add_argument("start-maximized")
# Points Chrome at an existing local user profile (machine-specific path).
options.add_argument("user-data-dir=C:\\Users\\alice\\AppData\\Local\\Google\\Chrome\\User Data")
# `driver` is a notebook-level global; the helper functions at the top of the
# notebook use it directly, so it is deliberately left open here.
driver = webdriver.Chrome(executable_path=PATH, options=options)
# add_article is defined outside this section; its result is saved with to_csv,
# so it is presumably a DataFrame -- TODO confirm.
arc_df = add_article(arc_links)
arc_df.to_csv("archive_art.csv")

link       [https://www.forbes.com/sites/alanohnsman/2020...
title      [Tesla’s Effort To Short-Circuit Nikola Lawsui...
text       [TrucksNikola, an upstart maker of electric tr...
view                                          [70,847 views]
topic                                       [Transportation]
time                             [Apr 20, 2020, 06:18pm EDT]
num_img                                                   10
Name: 5, dtype: object


## Final cleaning & saving the data

In [None]:
# Concat all the dataframes of articles found above.
# NOTE(review): `archive_art` is not defined in this section (the scrape cell above
# names its result `arc_df` and writes "archive_art.csv") -- confirm where it is
# loaded.
final_data = pd.concat(
    [archive_art, now_num_img_rest, df_author3, dfs_main_w_img, nest_df_clean, dfs_sub_w_im],
    axis=0,
)

# Remove duplicated titles, then duplicated links, keeping the last occurrence.
# Fixed: this step used to run on `df` before `df` was derived from final_data, and
# its result was then overwritten, so no duplicates were ever removed.
df = final_data.drop_duplicates(subset='title', keep='last')
df = df.drop_duplicates(subset='link', keep='last')

# Drop rows with any missing value, then renumber the rows 0..n-1.
# reset_index(drop=True) replaces the old reset_index / sort_index / drop('index')
# sequence, and plain assignment replaces inplace=True so the cell can be re-run.
df = df.dropna(how='any')
df = df.reset_index(drop=True)

In [None]:
# Write the cleaned article table to disk.
# Fixed: `index_col` is a read_csv argument; passing it to to_csv raises TypeError.
# The index is still written (the to_csv default), so the file can be read back
# with pd.read_csv(..., index_col=0).
# The result is no longer assigned to `final_data`: to_csv returns None when given
# a path, which would have replaced the concatenated frame with None.
df.to_csv("forbes_7k_final.csv")