In [1]:
import googleapiclient.discovery
from googleapiclient.errors import HttpError

import pandas as pd
import time
import tqdm

In [2]:
# YouTube API key
API_KEY = "YOUR-API-KEY" # YOUR-API-KEY
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

In [4]:
# Get video ids for query. Youtube API allows only up to 50 videos

def get_video_ids(query, max_results=100):
    video_ids = []
    results_per_page = 50  # YouTube API maxResults 
    pages = (max_results + results_per_page - 1) // results_per_page  # calculate #pages
    next_page_token = None
    
    for _ in range(pages): # call api as many times as #pages
        try:
            request = youtube.search().list(
                q=query,
                part="snippet",
                maxResults=results_per_page,
                type="video",
                pageToken=next_page_token
            )
            response = request.execute()      

            # Only check if 'id' key exists and 'videoId' is accessible
            for item in response['items']:
                if isinstance(item, dict) and 'id' in item and 'videoId' in item['id']:
                    video_ids.append(item['id']['videoId'])

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        except HttpError as e:
            error_reason = e.resp.get('reason')
            if error_reason == 'quotaExceeded':
                print("Quota exceeded. Saving collected data...")
                save_data_to_csv(video_comments)
                exit()
            else:
                print(f"An error occurred: {e}")
                
    return video_ids[:max_results]

In [5]:
# Get comments for 1 video. Youtube API allows only up to 100 comments per video
def get_top_korean_comments(video_id, max_results=100):
    comments = []
    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_results,
            textFormat="plainText"
        )
        response = request.execute()

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)  
                
    except HttpError as e:
        error_reason = e.resp.get('reason')
        if error_reason == 'commentsDisabled':
            print(f"Comments are disabled for video {video_id}. Skipping.")
        elif error_reason == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()
        else:
            print(f"An error occurred: {e}")
    
    return comments

In [7]:
# Make data to dataframe
# video_comments looks like: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}
def save_data_to_csv(video_comments):    
    
    data = {"Video_ID": [], "Comment": []}
    
    for video_id, comments in video_comments.items():
        for comment in comments:
            data["Video_ID"].append(video_id)
            data["Comment"].append(comment)

    df = pd.DataFrame(data)
    
    # Export to CSV 
    df.to_csv("youtube_comments.csv", index=False)

In [8]:
participants = ["ASMR", "불면증", "수면유도", "자장가", "빗소리", "백색소음", "롤플레이", "이팅사운드", "귀청소", "no talking"]

In [9]:
video_comments = {}
# Ex: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}

start = time.time()
query_baisic = "ASMR"

for participant in tqdm.tqdm(participants):
    query = query_baisic + " " + participant 

    try:
        video_ids = get_video_ids(query, max_results=50) 

        for video_id in video_ids:
            comments = get_top_korean_comments(video_id)
            video_comments[video_id] = comments
    except HttpError as e:
        if e.resp.get('reason') == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()

    end = time.time()    
    print(f"{end - start}s for query: {query}")    

save_data_to_csv(video_comments)

 10%|█         | 1/10 [00:08<01:16,  8.54s/it]

8.546106576919556s for query: ASMR ASMR
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=w1zYrRMhBks&maxResults=100&textFormat=plainText&key=AIzaSyDehIQD4gav9Syv7H6SNXUkDjPgN0ltBSk&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=O4uiORxFGg0&maxResults=100&textFormat=plainText&key=AIzaSyDehIQD4gav9Syv7H6SNXUkDjPgN0ltBSk&alt=json returned "The video identified by the <code><a href="/youtube/

 20%|██        | 2/10 [00:17<01:09,  8.70s/it]

17.35991144180298s for query: ASMR 불면증
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=-X_rCJFIx38&maxResults=100&textFormat=plainText&key=AIzaSyDehIQD4gav9Syv7H6SNXUkDjPgN0ltBSk&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 30%|███       | 3/10 [00:25<00:58,  8.29s/it]

25.1632182598114s for query: ASMR 수면유도
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=OzhsoSo-4dU&maxResults=100&textFormat=plainText&key=AIzaSyDehIQD4gav9Syv7H6SNXUkDjPgN0ltBSk&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=HvbQM_Pd1e0&maxResults=100&textFormat=plainText&key=AIzaSyDehIQD4gav9Syv7H6SNXUkDjPgN0ltBSk&alt=json returned "The video identified by the <code><a href="/youtube/v

 40%|████      | 4/10 [00:33<00:48,  8.14s/it]

33.06878995895386s for query: ASMR 자장가
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=-X_rCJFIx38&maxResults=100&textFormat=plainText&key=AIzaSyDehIQD4gav9Syv7H6SNXUkDjPgN0ltBSk&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=ltYWNPlmfvs&maxResults=100&textFormat=plainText&key=AIzaSyDehIQD4gav9Syv7H6SNXUkDjPgN0ltBSk&alt=json returned "The video identified by the <code><a href="/youtube/v

 50%|█████     | 5/10 [00:40<00:39,  7.98s/it]

40.762699127197266s for query: ASMR 빗소리
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=-X_rCJFIx38&maxResults=100&textFormat=plainText&key=AIzaSyDehIQD4gav9Syv7H6SNXUkDjPgN0ltBSk&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=ZTgzeACTogw&maxResults=100&textFormat=plainText&key=AIzaSyDehIQD4gav9Syv7H6SNXUkDjPgN0ltBSk&alt=json returned "The video identified by the <code><a href="/youtube/

 60%|██████    | 6/10 [00:48<00:31,  7.83s/it]

An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=w1zYrRMhBks&maxResults=100&textFormat=plainText&key=AIzaSyDehIQD4gav9Syv7H6SNXUkDjPgN0ltBSk&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
48.30150485038757s for query: ASMR 백색소음


 70%|███████   | 7/10 [00:57<00:24,  8.15s/it]

57.10730504989624s for query: ASMR 롤플레이


 80%|████████  | 8/10 [01:04<00:16,  8.03s/it]

64.89784860610962s for query: ASMR 이팅사운드


 90%|█████████ | 9/10 [01:13<00:08,  8.29s/it]

73.75437211990356s for query: ASMR 귀청소


100%|██████████| 10/10 [01:21<00:00,  8.16s/it]

81.61773157119751s for query: ASMR no talking





## Merge youtube_comments with movie_rating_dataset

In [10]:
comments = pd.read_csv("youtube_comments.csv")

In [11]:
comments.head()

Unnamed: 0,Video_ID,Comment
0,qk32KfRxJQo,Sin ofender pero te pareces el del juego del c...
1,qk32KfRxJQo,No
2,qk32KfRxJQo,"I love this as a video concept, like bouncing ..."
3,qk32KfRxJQo,Why is it actually kind of better😅
4,qk32KfRxJQo,If you look closely his right hand. Only has 3...


## SKIP

In [34]:
# import urllib.request
# download naver movie ratings dataset
# urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

('ratings.txt', <http.client.HTTPMessage at 0x7beadba02dd0>)

In [35]:
# movie_data = pd.read_table('ratings.txt')
# movie_data.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [36]:
# comments.head()

Unnamed: 0,Video_ID,Comment
0,vebF7wUQLMo,"《흑백요리사: 요리 계급 전쟁》, 9월 17일 넷플릭스에서 시청하세요: https:..."
1,vebF7wUQLMo,빽햄요리사ㄷㄷ
2,vebF7wUQLMo,0:07
3,vebF7wUQLMo,백수저중에 옴진리교 교주가 있노 ㄷㄷㄷㄷ
4,vebF7wUQLMo,심사위원 등장씬은 대한민국 역대 등장씬 고트중에 하나다 ㄹㅇ


In [37]:
# print(f"movie data length: {len(movie_data)}")
# print(f"comments data length: {len(comments)}")

movie data length: 200000
comments data length: 41783


In [38]:
# Merge two dataset because number of Comments dataset is not big enough to train word vectors.
# df1_text = movie_data[['document']].rename(columns={'document': 'text'})
# df2_text = comments[['Comment']].rename(columns={'Comment': 'text'})

# merge movie_data and yt_comments_data
# merged_df = pd.concat([df1_text, df2_text], ignore_index=True)
# merged_df

Unnamed: 0,text
0,어릴때보고 지금다시봐도 재밌어요ㅋㅋ
1,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산..."
2,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.
3,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...
4,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.
...,...
241778,아무리 봐도 이건 보류 에반거 같음ㅋㅋ
241779,ㅋㅋ음식이 피자같긴 함
241780,정보)모수에서도 아무맛도 안나는 식용금가루가 올라간다
241781,나폴리맛피자는 이때 떨어졌고 나폴리맛피아가 결승갔지..


In [39]:
# NULL check
# print(merged_df.isnull().values.any())

True


In [40]:
# merged_df = merged_df.dropna(how = 'any') # drop rows with null values
# print(merged_df.isnull().values.any()) 

False


In [41]:
# print(len(merged_df)) 

241775


In [42]:
# remove all characters other than Hangeul
# merged_df['text'] = merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['text'] = merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)


In [19]:
# SKIP END

In [12]:
pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m67.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading jpype1-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.1/494.1 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.2 konlpy-0.6.0
Note: you may need to restart the kernel to use updated packages.


In [13]:
from konlpy.tag import Okt
okt = Okt()

In [14]:
# NULL check
print(comments.isnull().values.any()) # => True

comments = comments.dropna(how = 'any') # drop rows with null values

print(comments.isnull().values.any()) # => False

True
False


In [15]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

tokenized_data = []

# merged_df['text'] => comments['Comment']
for sentence in tqdm.tqdm(comments['Comment']): 
    sentence = str(sentence).strip()
    
    if not sentence:  # 빈 문자열이면 건너뛰기
        continue
        
    tokenized_sentence = okt.morphs(sentence, stem=True) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence 
                                  if not word in stopwords # 조건1
                                     and len(word) >= 2 # 조건2   
                                     and word.isalpha()]  # 한글이나 영어 
    
    if stopwords_removed_sentence:  # 빈 리스트가 아니라면 추가
        tokenized_data.append(stopwords_removed_sentence)


100%|██████████| 34599/34599 [00:56<00:00, 613.55it/s] 


In [16]:
pip install gensim

Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully uninstalled scipy-1.14.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.3 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
libpysal 4.9.2 requires packaging>=2

In [18]:
from gensim.models import Word2Vec

model = Word2Vec(sentences = tokenized_data, vector_size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

In [19]:
model.wv.vectors.shape

(6340, 100)

In [22]:
print(model.wv.most_similar("수면"))

[('도움', 0.9930249452590942), ('처음', 0.9922367930412292), ('여기', 0.9922339916229248), ('인데', 0.9920335412025452), ('찾다', 0.9920139908790588), ('채널', 0.9919416904449463), ('음악', 0.9918728470802307), ('불면증', 0.9918643832206726), ('때문', 0.9918344020843506), ('알다', 0.9918280839920044)]


In [23]:
print(model.wv.most_similar("소리"))

[('목소리', 0.9959572553634644), ('듣기', 0.9955896735191345), ('넘다', 0.9943392872810364), ('너무', 0.9937055706977844), ('속삭이다', 0.9866172075271606), ('예쁘다', 0.9850531220436096), ('ㅠㅠ', 0.9848223924636841), ('진짜', 0.9845096468925476), ('제일', 0.9834099411964417), ('노래', 0.9832368493080139)]


## Save W2V model

In [24]:
model.wv.save_word2vec_format('ko_w2v')

In [26]:
!python -m gensim.scripts.word2vec2tensor --input ko_w2v --output ko_w2v

## Visualization for embedding

In [None]:
## Go to https://projector.tensorflow.org/
## and load ko_w2v_tensor.tsv and ko_w2v_metadata.tsv