In [1]:
import os
import time
import pandas as pd
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium import webdriver
# database python file
import mysql.connector 
from mysql.connector import Error
from sqlalchemy import create_engine
from datetime import date, datetime,timedelta

In [2]:
#utilization for selenium browser
#all folder and files
dir_path = os.path.dirname(os.path.realpath("assesment_cct"))

def new_browser():

    #Global Variables
    PATH = "chromedriver.exe"
    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_argument("start-maximized")

    # Chrome is controlled by automated test software
    chromeOptions.add_experimental_option("excludeSwitches", ["enable-automation"])
    chromeOptions.add_experimental_option('useAutomationExtension', False)

    # avoiding detection
    chromeOptions.add_argument('--disable-blink-features=AutomationControlled')

    # Default User Profile
    chromeOptions.add_argument("--profile-directory=Default")
    chromeOptions.add_argument("--user-data-dir=C:/ChromeProfiles/User Data")
    #chromeOptions.add_argument("--user-data-dir=C:/Users/Admin/AppData/Local/Google/Chrome/User Data")

    #Setting up webdriver
    #prefs = {"download.default_directory" : dir_path + "/files/}
    prefs = {"download.default_directory" : dir_path}
    chromeOptions.add_experimental_option("prefs",prefs)
    driver = webdriver.Chrome(executable_path=PATH, options=chromeOptions)
    driver.set_script_timeout(1000)
    return driver

In [3]:
# database functions
## Create connection
def get_connection():
    return mysql.connector.connect(host='127.0.0.1',
                                   user='root',
                                   password='',
                                   charset="utf8mb4")

## create a Assesment database
def create_db():
    try:
        connection = get_connection()
        if connection.is_connected():
            cursor = connection.cursor()
            cursor.execute("CREATE DATABASE IF NOT EXISTS assesment;")
            print("Database is created...")
            connection.commit()
    except Exception as e:
        print("Error while connecting to MySQL", e)


## create tables 
def create_table():
    try:
        connection = get_connection()
        if connection.is_connected():
            cursor = connection.cursor()
            cursor.execute("USE assesment;")
            cursor.execute(
                """CREATE TABLE IF NOT EXISTS urls( video_url VARCHAR(255),keyword VARCHAR(255)) ;""") 
            # comment table
            #'primary_keyword','secondary_keyword','video_url','content','comment','authors_name','comment_date'
            cursor.execute( """CREATE TABLE IF NOT EXISTS comments(keyword VARCHAR(100),video_url VARCHAR(255), content TEXT, comment TEXT, authors_name VARCHAR(255) ,comment_date VARCHAR(255));""")
            print("Table is created...")
            connection.commit()
    except Exception as e:
        print("Error while connecting to MySQL", e)

## execute any queries on database
def execute_query(query):
    try:
        connection = get_connection()
        if connection.is_connected():
            cursor = connection.cursor()
            cursor.execute("USE assesment;")
            cursor.execute(query)
            connection.commit()
            print("Query finished")
    except Exception as e:
            print("Query is failed:" + query)
    
## get query result as a dataframe
def get_data(query):
    try:
        connection = get_connection()
        if connection.is_connected():
            cursor = connection.cursor()
            cursor.execute("USE assesment;")
            cursor.execute(query)
            cols = list(map(lambda x: x[0], cursor.description))
            #result_list=cursor.fetchall()
    except Exception as e:
            print("Query is failed")
    df=pd.DataFrame(cursor.fetchall(),columns=cols)
    return df

## insert data to table from dataframe
def insertData(df,tablename):
    try:
        conn = create_engine("mysql://root@localhost/assesment")
        df.to_sql(tablename,con=conn,if_exists='append',index=False)
        print("Inserting complete...")
    except Exception as e:
        print("Insertin failed :",e)


In [4]:
# scraping video url about Ireland housing crisis
## HTML elements' class names for scraping
videoBlockAll='eegew6e0'
videoUrl='e1cg0wnj1'

#scroll page 10 times
def scrollPage(driver):
    last_height = driver.execute_script("return document.body.scrollHeight")
    for i in range(10):
        # Scroll down to drivertom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        try:
            # Wait to load page
            time.sleep(1)
            # Calculate new scroll height and compare with last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        except Exception as err:
            print("There is no video...")
            return err

## get video urls 
def getVideoUrl(driver,srcKeyword):
    print("Scraping urls...")
    links=[]
    #all videos element
    blk=driver.find_element(By.CLASS_NAME,videoBlockAll)
    videos=blk.find_elements(By.TAG_NAME,'a')
    links=[]
    for a in videos:
        if str(a.get_attribute('href')).startswith("https://www.tiktok.com/@") and a.get_attribute('href'): 
            if "/video/" in str(a.get_attribute('href')):
                links.append(a.get_attribute('href'))
    #print(len(links))
    df = pd.DataFrame()  
    df["video_url"]=links
    df["keyword"]=srcKeyword
    return df

def searchKeywords(driver,srcKeyword):
    print("searching keywords :" + srcKeyword)
    search_url =  str.replace(srcKeyword,' ','%20')
    driver.get("https://www.tiktok.com/search?q=" + search_url)
    time.sleep(5)
    scrollPage(driver)

In [None]:
## create db and tables
create_db()
create_table()

In [None]:
# save urls at the db
driver=new_browser()
srcKeyword="house building Ireland"
searchKeywords(driver,srcKeyword)
df=getVideoUrl(driver,srcKeyword)
insertData(df,'urls')
driver.close()

In [14]:
## scrap comments from urls
#All XPATHs and class names
videoContent='efbd9f0'
videoHastag='ejg0rhn1'
firstLevelComments='eo72wou0'

def getCommentDate(strDate): 
        f=str(strDate)
        weekAgo='w ago'
        dayAgo='d ago'
        hourAgo='h ago'
        minuteAgo='m ago'
        cDate=''
        if (minuteAgo in f):
            stop=f.find('m')
            mins=int(f[0:stop])
            cDate=str(datetime.now()-timedelta(minutes=mins))[:10]
        elif (hourAgo in f):
            stop=f.find('h')
            hour=int(f[0:stop])
            cDate=str(datetime.now()-timedelta(hours=hour))[:10]
        elif (dayAgo in f):
            stop=f.find('d')
            day=int(f[0:stop])
            cDate=str(date.today()-timedelta(days=day))
        elif (weekAgo in f):
            stop=f.find('w')
            week=int(f[0:stop])
            cDate=str(date.today()-timedelta(weeks=week))
        elif (len(f)<6):
            today=date.today()
            year=today.strftime("%Y")
            cDate=year+"-"+f
        else:
            cDate=f

        return cDate


def getAllComments(driver,url):
    #read the video content
    cnx=driver.find_elements(By.CLASS_NAME,videoContent)
    htg=driver.find_elements(By.CLASS_NAME,videoHastag)
    hashtag=''
    content=''
    for x in cnx:
        if(x.text != ''):
            content=content+x.text
    for y in htg:
        if(y.text != ''):
            hashtag=hashtag+y.text
    content=content+' '+ hashtag        
    content=content.replace(',','')
    #video_id,content,comment,authors_name,comment_date
    cols=['video_url','content','comment','authors_name','comment_date']
    df=pd.DataFrame(columns=cols)
    #level-1 coments class
    a=driver.find_elements(By.CLASS_NAME,firstLevelComments)
    for i in a:
        user=i.find_element(By.XPATH,'.//div[1]/div[1]/a/span').text
        comment=i.find_element(By.XPATH,'.//div[1]/div[1]/p[1]/span').text
        comment_date=i.find_element(By.XPATH,'.//div[1]/div[1]/p[2]/span[1]').text
        cd=getCommentDate(comment_date)
        row={'video_url':url,'content':content,'comment':comment,'authors_name':user,'comment_date':cd}
        df = pd.concat([df, pd.DataFrame.from_records([row])])
    
    return df 

def scrollPage(driver,cnt):
    scrollCount=cnt
    for i in range(scrollCount):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        #print("....scroll...")
        time.sleep(3)
        try:
            captcha=driver.find_element(By.CLASS_NAME,"zGYIR")
            if captcha.is_displayed():
                print("Captcha is visible, please solve it in 10!...")
                os.system('say "hey"')
                time.sleep(10)
        except Exception as err:
            continue

def loadComments(driver,url):
    print("going to :"+url)
    driver.get(url)
    time.sleep(3)
    cols=['video_url','content','comment','authors_name','comment_date']
    df_comment = pd.DataFrame(columns=cols)
    numberComments_text='10'
    scrollCount=0
    try:
        #comments number label for scrolling
        numberComments_text=driver.find_element(By.XPATH,'//*[@id="main-content-video_detail"]/div/div[2]/div[1]/div[1]/div[1]/div[4]/div/button[2]/strong').text
        print("Comment count... : ",numberComments_text)
        if ('K' in numberComments_text ):
            numberComments=200
        else:
            numberComments=int(numberComments_text)
    
        if (numberComments==0):
            print("There is no comments")
            pass
        else:
            scrollCount=int(numberComments/20)
            if (scrollCount<1):
                scrollCount=1
                time.sleep(3)
                scrollPage(driver,scrollCount)
                df_temp=getAllComments(driver,url)
                df_comment = pd.concat([df_comment, df_temp],ignore_index=True)
            elif (scrollCount>=1 and scrollCount<5):
                #driver.find_element(By.CLASS_NAME,'edu4zum1').click()
                time.sleep(3)
                scrollPage(driver,scrollCount)
                df_temp=getAllComments(driver,url)
                df_comment = pd.concat([df_comment, df_temp],ignore_index=True)
            else:
                #driver.find_element(By.CLASS_NAME,'edu4zum1').click()
                time.sleep(3)
                scrollCount=4
                scrollPage(driver,scrollCount)
                df_temp=getAllComments(driver,url)
                df_comment = pd.concat([df_comment, df_temp],ignore_index=True)
    except Exception as err:
        #count_not_catch +=1
        a=str(err)
        stop=a.find('Session')-4
        #print(start,stop, a[start:stop])
        print('Error reading video ' + url)
        print(a)
    return df_comment

In [15]:
## get urls from db and scrap comments
# get comments and save as temp

# query to get urls from db
query_get_urls=""" select distinct video_url from urls; """
driver=new_browser()

url_df=get_data(query_get_urls)
url_list=list(url_df["video_url"])

for url in url_list:
    print(url)
    df=loadComments(driver,url)
    insertData(df,'comments')
driver.close()


  driver = webdriver.Chrome(executable_path=PATH, options=chromeOptions)


https://www.tiktok.com/@karll1994/video/6992919612174241029
going to :https://www.tiktok.com/@karll1994/video/6992919612174241029
Comment count... :  196
Inserting complete...
https://www.tiktok.com/@castlequarterhouse/video/6958057152850185477
going to :https://www.tiktok.com/@castlequarterhouse/video/6958057152850185477
Comment count... :  1786
Inserting complete...
https://www.tiktok.com/@simonava91/video/7185324986792529157
going to :https://www.tiktok.com/@simonava91/video/7185324986792529157
Comment count... :  2
Inserting complete...
https://www.tiktok.com/@orla.leech3/video/7207933566767009030
going to :https://www.tiktok.com/@orla.leech3/video/7207933566767009030
Comment count... :  24
Inserting complete...
https://www.tiktok.com/@briancampbell870/video/7045193820157316357
going to :https://www.tiktok.com/@briancampbell870/video/7045193820157316357
Comment count... :  214
Inserting complete...
https://www.tiktok.com/@linahome98/video/7036853442735820038
going to :https://www.t