In [None]:
#Data: This section will describe my procedures for collecting my data.
#These lines are designed to import both pandas and selenium in order to operate the scraper as well as store the data.
#This code is adapted from Professor Jeong's code provided to us in class.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Start a Chrome session and point it at the YouTube search results for "climate".
# Per the original note, the `sp` query parameter restricts the results to videos only.
driver = webdriver.Chrome()
url = "https://www.youtube.com/results?search_query=climate&sp=EgIQAQ%253D%253D"
driver.get(url)

#This function, taken from Professor Jeong, keeps scrolling to the bottom of the page until the scroll position stops changing, so that the video data can be collected afterwards.
def scroll_to_bottom(driver, pause=5):
    """Scroll the page to the bottom repeatedly until the scroll offset stops changing.

    Each pass reads the current vertical offset, jumps to the bottom of the
    scrolling element, and reads the offset again. The loop ends as soon as a
    jump no longer moves the page.

    Args:
        driver: Object exposing ``execute_script(js)``, e.g. a Selenium WebDriver.
        pause: Seconds to sleep before and after each jump (presumably to give
            the page time to load more results). Defaults to 5, the delay that
            was previously hard-coded.

    Returns:
        None. The only effect is on the page controlled by ``driver``.
    """
    # A single script is shared by both offset reads so they cannot drift apart
    # (one copy used to reference the undefined name `documentBody`). The
    # fallback branch reads `.scrollTop` so that a number is always returned,
    # never a DOM element.
    read_offset_js = (
        "return (window.pageYOffset !== undefined) ?"
        " window.pageYOffset : (document.documentElement ||"
        " document.body.parentNode || document.body).scrollTop;"
    )
    jump_to_bottom_js = (
        "var scrollingElement = (document.scrollingElement ||"
        " document.body);scrollingElement.scrollTop = "
        " scrollingElement.scrollHeight"
    )

    # Always run at least one pass, then stop once a jump leaves the offset unchanged.
    while True:
        old_position = driver.execute_script(read_offset_js)
        time.sleep(pause)
        driver.execute_script(jump_to_bottom_js)
        time.sleep(pause)
        new_position = driver.execute_script(read_offset_js)
        if new_position == old_position:
            break
# Scroll the results page until it stops moving before reading the video elements.
scroll_to_bottom(driver)

#This following segment, adapted from Professor Jeong, will tally up the links of the various videos.
user_data = driver.find_elements(by=By.XPATH, value='//*[@id="video-title"]')
# Read each href exactly once (instead of asking the browser for the same attribute twice)
# and keep only the elements that actually carry a link.
hrefs = (element.get_attribute('href') for element in user_data)
links = [href for href in hrefs if href is not None]
#Next, the link, title, view count, like count, and dislike count of every video are collected.
#Rows are accumulated in a plain list and turned into a dataframe once at the end,
#which avoids growing the dataframe one row at a time.
video_columns = ['link', 'title', 'views', 'likes', 'dislikes']
records = []

#Explicit wait used to block until the title element is present on each watch page.
#NOTE(review): 10 seconds is an assumed timeout -- tune it to the connection speed.
wait = WebDriverWait(driver, 10)

#This next segment uses the links in order to extract the remainder of the data.
#The try/finally guarantees the browser is closed even if a page fails to load
#or one of the elements cannot be found.
try:
    for link in links:
        driver.get(link)
        v_title = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "h1.style-scope.ytd-watch-metadata yt-formatted-string"))).text
        #find_element(By.CSS_SELECTOR, ...) is the Selenium 4 form, matching the
        #find_elements(by=By.XPATH, ...) call used to gather the links.
        #NOTE(review): these selectors depend on YouTube's page markup -- confirm they still match.
        view_count = driver.find_element(By.CSS_SELECTOR, ".view-count").text
        like_count = driver.find_element(
            By.CSS_SELECTOR, ".like-button-renderer-like-button span").text
        dislike_count = driver.find_element(
            By.CSS_SELECTOR, ".like-button-renderer-dislike-button span").text

        #This next line adds all the variables as one row.
        records.append([link, v_title, view_count, like_count, dislike_count])
finally:
    driver.quit()

df = pd.DataFrame(records, columns=video_columns)
print(df)

#This saves the CSV file.
df.to_csv('youtubeVideoData.csv', index=False)

In [None]:
#Data Preparation: This section will describe my procedures for preparing my data.

#import the NLTK libraries and download the resources used in this section:
#the stopwords that will be removed later, plus the tokenizer data that nltk.word_tokenize needs
#("punkt" for older NLTK releases, "punkt_tab" for newer ones -- NOTE(review): confirm which one your version uses)
import nltk
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)

#load csv file into pandas dataframe
df = pd.read_csv("youtubeVideoData.csv")

#multipliers for abbreviated counts such as "12K" or "1.2M"
_COUNT_MULTIPLIERS = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}


def _parse_count(raw):
    """Turn one scraped count into an int, or pd.NA when it cannot be parsed.

    Accepts plain numbers (57, "57"), numbers with thousands separators and a
    trailing label ("1,234 views"), and abbreviated values ("12K", "1.2M").
    NOTE(review): these formats are assumed from how counts are usually shown
    on the page -- confirm against the actual CSV contents.
    """
    if pd.isna(raw):
        return pd.NA
    parts = str(raw).replace(",", "").split()
    if not parts:
        return pd.NA
    token = parts[0].upper()
    multiplier = _COUNT_MULTIPLIERS.get(token[-1], 1)
    if multiplier != 1:
        token = token[:-1]
    try:
        return int(round(float(token) * multiplier))
    except (ValueError, OverflowError):
        return pd.NA


#convert the string columns into int columns, easier to do it here than when you initially collect it.
#The nullable "Int64" dtype is used so that a count that could not be parsed stays missing
#instead of aborting the whole conversion.
for column in ("views", "likes", "dislikes"):
    df[column] = df[column].map(_parse_count).astype("Int64")


#alter the dataframe to insert two new columns that consist of the ratios of the likes, dislikes, and views
#(the column names are lowercase, exactly as they were written to the CSV)
df["Like/view ratio"] = df["likes"] / df["views"]
df["Dislike/view ratio"] = df["dislikes"] / df["views"]

#Using NLTK's built-in stopword list, we define a function that removes all the stopwords, and then apply it to the title column
def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stopwords removed.

    The text is tokenized with ``nltk.word_tokenize``; tokens whose lowercase
    form is in the stopword set are dropped and the remaining tokens are
    joined with single spaces.

    Args:
        text: The string to filter. Non-string values (for example a missing
            title) are returned unchanged.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, NLTK's English stopword list is loaded; pass a prebuilt
            set to avoid reloading the list on every call.

    Returns:
        The filtered text as one space-joined string, or ``text`` itself when
        it is not a string.
    """
    if not isinstance(text, str):
        return text
    if stop_words is None:
        # Local import keeps the function self-contained: `stopwords` is not
        # imported anywhere else in the visible cells.
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words("english"))
    words = nltk.word_tokenize(text)
    #these are all remaining words not in the stopword set
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

#apply remove_stopwords to the title column: the function itself is handed to apply()
#(not the result of calling it), and the column is "title" in lowercase, as written to the CSV
df["title"] = df["title"].apply(remove_stopwords)


In [None]:
#Analysis: This section will describe my procedures for analyzing my data
#Import NLTK, download the VADER lexicon.
import nltk
nltk.download("vader_lexicon")

#import the sentiment analyzer
from nltk.sentiment import SentimentIntensityAnalyzer
#initialize
sentiment = SentimentIntensityAnalyzer()

#add the compound value from the sentiment analysis to the dataframe under "sentiment value"
#we're using the compound value because it is a single value that expresses the overall positive/negative score rather than multiple
#also it only adds one column.
#apply() needs a callable, so every title is scored through a lambda; the column is "title"
#in lowercase, matching the name it was saved under in the CSV.
df["Sentiment Value"] = df["title"].apply(
    lambda title: sentiment.polarity_scores(title)["compound"])

#Now comes correlation analysis. Code adapted from Professor Jeong's code from Google Colab
#import seaborn
import seaborn as sns
#import pyplot
import matplotlib.pyplot as plt

#creating the heatmap, defining the color and size of the image produced.
#The figure is created first so that the size applies to the heatmap itself rather than to a
#new empty figure, and the heatmap is drawn from the correlation matrix of the numeric columns
#(text columns such as the link and title cannot be plotted).
fig, ax = plt.subplots(figsize=(30, 15))
correlations = df.select_dtypes(include="number").corr()
dataplot = sns.heatmap(correlations, cmap="YlGnBu", annot=True, ax=ax)
plt.show()
