## Final Search Scraper

Author: Miraya Gupta\
Date: 17/04

In [22]:
import requests 
import selenium
from seleniumbase import Driver
from selenium import webdriver
from selenium.webdriver.common.by import By # contains operators for the type of search we want to do
import time
from seleniumbase import BaseCase
from random import randint
#from selenium.common.exceptions import ElementClickInterceptedException, StaleElementReferenceException, NoSuchElementException
import html
#import re
import numpy as np
from datetime import datetime
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import json
import csv
import os

## 1. Dividing Search Terms into Chunks

In [23]:
# Resolve the search-term file relative to the parent of the working directory.
parent_dir = os.path.dirname(os.getcwd())
file_path = os.path.join(parent_dir, 'NYT_Data/suggested_words.json')

# Load the JSON that maps each year to its list of suggested search terms.
with open(file_path, 'r') as f:
    search_terms_by_year = json.loads(f.read())

# Sanity check: show the available years and how many terms each one holds.
print(f'Keys: {search_terms_by_year.keys()}')
for yr in ("2020", "2021", "2022", "2023", "2024"):
    print(f'Articles from {yr}: {len(search_terms_by_year[yr])}')

# 2024 holds more entries than the other years; keep only the first 60.
terms_2024_trimmed = search_terms_by_year['2024'][:60]
search_terms_by_year['2024'] = terms_2024_trimmed
print(f'New articles from 2024: {len(terms_2024_trimmed)}')

Keys: dict_keys(['2020', '2021', '2022', '2023', '2024'])
Articles from 2020: 60
Articles from 2021: 60
Articles from 2022: 60
Articles from 2023: 60
Articles from 2024: 62
New articles from 2024: 60


In [24]:
# Pull each year's list of search terms out of the mapping into its own name.
(
    search_terms_2020,
    search_terms_2021,
    search_terms_2022,
    search_terms_2023,
    search_terms_2024,
) = (
    search_terms_by_year[year_key]
    for year_key in ("2020", "2021", "2022", "2023", "2024")
)

## 2. Scraping TikTok Search Results

In [25]:
def get_50_urls(search_term, no_of_results=10):
    """Collect up to 50 unique video URLs from TikTok search for one term.

    Uses the notebook-level globals ``driver`` (Selenium browser session) and
    ``actions`` (an ``ActionChains`` bound to that driver), so both must exist
    before this function is called.

    Args:
        search_term: Text to search for. It is percent-encoded before being
            placed in the search URL, so it should be passed un-encoded.
        no_of_results: Value sent as the ``t`` query parameter of the search
            URL.

    Returns:
        A ``(search_term, urls)`` tuple. ``urls`` contains at most 50 links,
        without duplicates, in the order they were first seen. If scraping
        fails part-way, the links collected up to that point are returned.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from urllib.parse import quote

    scroll_rounds = 6
    key_presses_per_round = 30
    max_urls = 50
    # NOTE(review): this class name looks auto-generated by the site and may
    # change between TikTok releases -- confirm it still matches.
    result_xpath = "//div[contains(@class, 'css-1soki6-DivItemContainerForSearch e19c29qe10')]"

    # Encode the term so characters such as '&', '#' or spaces cannot break
    # or truncate the query string.
    encoded_term = quote(search_term, safe='')
    driver.get(f'https://www.tiktok.com/search/video?q={encoded_term}&t={no_of_results}')

    all_urls = []
    seen_urls = set()
    try:
        for _scroll in range(scroll_rounds):
            # Queue a burst of down-arrow presses, then send them in one go
            # to scroll the results page.
            for _press in range(key_presses_per_round):
                actions.send_keys(Keys.ARROW_DOWN)
            actions.perform()
            # Short pause before reading the page again after scrolling.
            time.sleep(0.75)
            videos = driver.find_elements(By.XPATH, result_xpath)
            for video in videos:
                url = video.find_element(By.TAG_NAME, "a").get_attribute('href')
                # Each round re-reads every result on the page, so compare
                # on the URL itself to avoid storing the same link again.
                if url and url not in seen_urls:
                    seen_urls.add(url)
                    all_urls.append(url)
    except Exception:
        # Best effort: report the failure and fall through with whatever was
        # collected, so one bad term does not stop a long scraping run.
        print(f'Error, {search_term}')
        print('All URLs:')
        print(all_urls)

    # return search term and list of urls as a tuple
    return search_term, all_urls[:max_urls]

In [31]:
def run_all_search_terms(search_terms_list, year):
    """Run the TikTok search for every term and save the results as JSON.

    Args:
        search_terms_list: Iterable of search terms; each one is passed to
            ``get_50_urls``.
        year: Label stored alongside every term and used in the output file
            name.

    Side effects:
        Writes ``urls{year}.json`` in the working directory. The file maps
        each search term to ``{'urls': [...], 'year': year}``.
    """
    result_dict = {}
    for search_term in search_terms_list:
        data = get_50_urls(search_term, no_of_results=10)
        # get_50_urls may hand back None on failure; record the term with no
        # URLs instead of crashing and losing the results gathered so far.
        urls = data[1] if data is not None else []
        result_dict[search_term] = {'urls': urls, 'year': year}
    with open(f'urls{year}.json', 'w') as f:
        json.dump(result_dict, f)

## 3. Calling Functions for Scraping

In [51]:
# Open a single browser session up front and land on TikTok's home page so the
# login only has to be done once; the scraping functions reuse this session.
driver = webdriver.Chrome()
driver.get('https://www.tiktok.com')
# Shared action chain used by the scraper to send scroll key presses.
actions = ActionChains(driver)

In [52]:
# Edit both arguments for each run (2020, 2021, 2022, 2023, 2024).
# Re-run the driver cell above before each run of this cell in order to log in
# with a new account.
start_time = time.time()
run_all_search_terms(search_terms_2024, '2024')
end_time = time.time()
elapsed_seconds = end_time - start_time
print(f"Execution time: {elapsed_seconds} seconds")

Execution time: 377.0831289291382 seconds


## 4. Analysing Missing Search Results

In [54]:
def summarise_no_results(dct, fallback_year=None):
    """Summarise which search terms in one results file returned no URLs.

    Args:
        dct: Mapping of search term -> ``{'urls': [...], 'year': ...}``, as
            loaded from one of the ``urls<year>.json`` files.
        fallback_year: Year label to use when ``dct`` has no entries to read
            a year from.

    Returns:
        A ``(year, summary)`` tuple. ``year`` is the ``'year'`` value of the
        first entry (or ``fallback_year`` for an empty mapping) and
        ``summary`` is
        ``{'no_result_terms': [...], 'no_result_count': <int>}``.
    """
    no_result_terms = [term for term, entry in dct.items() if not entry['urls']]
    year = next(iter(dct.values()))['year'] if dct else fallback_year
    summary = {
        'no_result_terms': no_result_terms,
        'no_result_count': len(no_result_terms),
    }
    return year, summary


all_no_results = {}
for file in ['urls2020.json', 'urls2021.json', 'urls2022.json', 'urls2023.json', 'urls2024.json']:
    try:
        with open(file, 'r') as inF:
            dct = json.load(inF)
    except FileNotFoundError:
        # A year that has not been scraped yet should not stop the analysis
        # of the remaining years.
        print('File not found')
        continue
    # The file name carries the year ('urls2020.json' -> '2020'); it is only
    # used when the file contains no entries.
    year, year_dct = summarise_no_results(
        dct, fallback_year=file[len('urls'):-len('.json')]
    )
    all_no_results[year] = year_dct

# Always save the summary, not only when a file was missing.
with open('no_search_results.json', 'w') as outF:
    json.dump(all_no_results, outF)

## Dividing into Chunks for Pyktok