In [1]:
import googleapiclient.discovery
from googleapiclient.errors import HttpError

import pandas as pd
import time
import tqdm

In [27]:
# YouTube API key
API_KEY = "YOUR-API-KEY" # YOUR-API-KEY
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

In [28]:
# Get video ids for query. Youtube API allows only up to 50 videos

def get_video_ids(query, max_results=100):
    video_ids = []
    results_per_page = 50  # YouTube API maxResults 
    pages = (max_results + results_per_page - 1) // results_per_page  # calculate #pages
    next_page_token = None
    
    for _ in range(pages): # call api as many times as #pages
        try:
            request = youtube.search().list(
                q=query,
                part="snippet",
                maxResults=results_per_page,
                type="video",
                pageToken=next_page_token
            )
            response = request.execute()      

            # Only check if 'id' key exists and 'videoId' is accessible
            for item in response['items']:
                if isinstance(item, dict) and 'id' in item and 'videoId' in item['id']:
                    video_ids.append(item['id']['videoId'])

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        except HttpError as e:
            error_reason = e.resp.get('reason')
            if error_reason == 'quotaExceeded':
                print("Quota exceeded. Saving collected data...")
                save_data_to_csv(video_comments)
                exit()
            else:
                print(f"An error occurred: {e}")
                
    return video_ids[:max_results]

In [29]:
# Get comments for 1 video. Youtube API allows only up to 100 comments per video
def get_top_korean_comments(video_id, max_results=100):
    comments = []
    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_results,
            textFormat="plainText"
        )
        response = request.execute()

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)  
                
    except HttpError as e:
        error_reason = e.resp.get('reason')
        if error_reason == 'commentsDisabled':
            print(f"Comments are disabled for video {video_id}. Skipping.")
        elif error_reason == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()
        else:
            print(f"An error occurred: {e}")
    
    return comments

In [5]:
# Make data to dataframe
# video_comments looks like: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}
def save_data_to_csv(video_comments):    
    
    data = {"Video_ID": [], "Comment": []}
    
    for video_id, comments in video_comments.items():
        for comment in comments:
            data["Video_ID"].append(video_id)
            data["Comment"].append(comment)

    df = pd.DataFrame(data)
    
    # Export to CSV 
    df.to_csv("youtube_comments.csv", index=False)

In [30]:
participants = ["흑백요리사","백종원","안성재","에드워드 리","나폴리 맛피아","트리플스타","요리하는 돌아이","최현석","장호준","여경래","안유성","정지선","최강록","조은주","오세득","파브리치오 페라리","이영숙","선경 롱게스트","김도윤","박준우"]

In [31]:
video_comments = {}
# Ex: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}

start = time.time()
query_baisic = "흑백요리사"

for participant in tqdm.tqdm(participants):
    query = query_baisic + " " + participant 

    try:
        video_ids = get_video_ids(query, max_results=50) 

        for video_id in video_ids:
            comments = get_top_korean_comments(video_id)
            video_comments[video_id] = comments
    except HttpError as e:
        if e.resp.get('reason') == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()

    end = time.time()    
    print(f"{end - start}s for query: {query}")    

save_data_to_csv(video_comments)

  5%|▌         | 1/20 [00:09<02:59,  9.45s/it]

9.453901529312134s for query: 흑백요리사 흑백요리사


 10%|█         | 2/20 [00:17<02:37,  8.74s/it]

17.697648525238037s for query: 흑백요리사 백종원


 15%|█▌        | 3/20 [00:24<02:14,  7.90s/it]

24.6003315448761s for query: 흑백요리사 안성재


 20%|██        | 4/20 [00:31<02:01,  7.59s/it]

31.700552463531494s for query: 흑백요리사 에드워드 리


 25%|██▌       | 5/20 [00:39<01:57,  7.80s/it]

39.89178013801575s for query: 흑백요리사 나폴리 맛피아


 30%|███       | 6/20 [00:46<01:45,  7.52s/it]

46.84978222846985s for query: 흑백요리사 트리플스타


 35%|███▌      | 7/20 [00:55<01:40,  7.73s/it]

55.032273054122925s for query: 흑백요리사 요리하는 돌아이


 40%|████      | 8/20 [01:03<01:34,  7.89s/it]

63.263466119766235s for query: 흑백요리사 최현석


 45%|████▌     | 9/20 [01:09<01:21,  7.45s/it]

69.73033905029297s for query: 흑백요리사 장호준


 50%|█████     | 10/20 [01:17<01:14,  7.48s/it]

77.27377152442932s for query: 흑백요리사 여경래


 55%|█████▌    | 11/20 [01:24<01:07,  7.50s/it]

84.8300850391388s for query: 흑백요리사 안유성


 60%|██████    | 12/20 [01:32<00:59,  7.40s/it]

92.00654578208923s for query: 흑백요리사 정지선


 65%|██████▌   | 13/20 [01:40<00:53,  7.62s/it]

100.13034439086914s for query: 흑백요리사 최강록


 70%|███████   | 14/20 [01:47<00:44,  7.46s/it]

107.20563459396362s for query: 흑백요리사 조은주


 75%|███████▌  | 15/20 [01:53<00:36,  7.25s/it]

113.98473334312439s for query: 흑백요리사 오세득


 80%|████████  | 16/20 [01:59<00:26,  6.63s/it]

119.16011834144592s for query: 흑백요리사 파브리치오 페라리


 85%|████████▌ | 17/20 [02:05<00:19,  6.59s/it]

125.6499764919281s for query: 흑백요리사 이영숙


 90%|█████████ | 18/20 [02:12<00:13,  6.58s/it]

132.20647811889648s for query: 흑백요리사 선경 롱게스트


 95%|█████████▌| 19/20 [02:18<00:06,  6.50s/it]

138.52728939056396s for query: 흑백요리사 김도윤


100%|██████████| 20/20 [02:25<00:00,  7.28s/it]

145.511155128479s for query: 흑백요리사 박준우





## Merge youtube_comments with movie_rating_dataset

In [32]:
comments = pd.read_csv("youtube_comments.csv")

In [33]:
comments.head()

Unnamed: 0,Video_ID,Comment
0,vebF7wUQLMo,"《흑백요리사: 요리 계급 전쟁》, 9월 17일 넷플릭스에서 시청하세요: https:..."
1,vebF7wUQLMo,빽햄요리사ㄷㄷ
2,vebF7wUQLMo,0:07
3,vebF7wUQLMo,백수저중에 옴진리교 교주가 있노 ㄷㄷㄷㄷ
4,vebF7wUQLMo,심사위원 등장씬은 대한민국 역대 등장씬 고트중에 하나다 ㄹㅇ


## SKIP

In [34]:
import urllib.request
# download naver movie ratings dataset
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

('ratings.txt', <http.client.HTTPMessage at 0x7beadba02dd0>)

In [35]:
movie_data = pd.read_table('ratings.txt')
movie_data.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [36]:
comments.head()

Unnamed: 0,Video_ID,Comment
0,vebF7wUQLMo,"《흑백요리사: 요리 계급 전쟁》, 9월 17일 넷플릭스에서 시청하세요: https:..."
1,vebF7wUQLMo,빽햄요리사ㄷㄷ
2,vebF7wUQLMo,0:07
3,vebF7wUQLMo,백수저중에 옴진리교 교주가 있노 ㄷㄷㄷㄷ
4,vebF7wUQLMo,심사위원 등장씬은 대한민국 역대 등장씬 고트중에 하나다 ㄹㅇ


In [37]:
print(f"movie data length: {len(movie_data)}")
print(f"comments data length: {len(comments)}")

movie data length: 200000
comments data length: 41783


In [38]:
# Merge two dataset because number of Comments dataset is not big enough to train word vectors.
df1_text = movie_data[['document']].rename(columns={'document': 'text'})
df2_text = comments[['Comment']].rename(columns={'Comment': 'text'})

# merge movie_data and yt_comments_data
merged_df = pd.concat([df1_text, df2_text], ignore_index=True)
merged_df

Unnamed: 0,text
0,어릴때보고 지금다시봐도 재밌어요ㅋㅋ
1,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산..."
2,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.
3,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...
4,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.
...,...
241778,아무리 봐도 이건 보류 에반거 같음ㅋㅋ
241779,ㅋㅋ음식이 피자같긴 함
241780,정보)모수에서도 아무맛도 안나는 식용금가루가 올라간다
241781,나폴리맛피자는 이때 떨어졌고 나폴리맛피아가 결승갔지..


In [39]:
# NULL check
print(merged_df.isnull().values.any())

True


In [40]:
merged_df = merged_df.dropna(how = 'any') # drop rows with null values
print(merged_df.isnull().values.any()) 

False


In [41]:
print(len(merged_df)) 

241775


In [42]:
# remove all characters other than Hangeul
merged_df['text'] = merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['text'] = merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)


In [19]:
# SKIP END

In [43]:
pip install konlpy

Note: you may need to restart the kernel to use updated packages.


In [44]:
from konlpy.tag import Okt
okt = Okt()

In [45]:
# NULL check
print(comments.isnull().values.any()) # => True

comments = comments.dropna(how = 'any') # drop rows with null values

print(comments.isnull().values.any()) # => False

False
False


In [46]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

tokenized_data = []

# merged_df['text'] => comments['Comment']
for sentence in tqdm.tqdm(comments['Comment']): 
    sentence = str(sentence).strip()
    
    if not sentence:  # 빈 문자열이면 건너뛰기
        continue
        
    tokenized_sentence = okt.morphs(sentence, stem=True) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence 
                                  if not word in stopwords # 조건1
                                     and len(word) >= 2 # 조건2   
                                     and word.isalpha()]  # 한글이나 영어 
    
    if stopwords_removed_sentence:  # 빈 리스트가 아니라면 추가
        tokenized_data.append(stopwords_removed_sentence)


100%|██████████| 41783/41783 [02:15<00:00, 309.34it/s]


In [47]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [48]:
from gensim.models import Word2Vec

model = Word2Vec(sentences = tokenized_data, vector_size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

In [49]:
model.wv.vectors.shape

(6312, 100)

In [50]:
print(model.wv.most_similar("백종원"))

[('저기', 0.9497391581535339), ('참가자', 0.9319582581520081), ('자격', 0.9208540916442871), ('편파', 0.9189201593399048), ('의원', 0.9180790781974792), ('장사꾼', 0.9177151322364807), ('장사치', 0.9107998609542847), ('사기꾼', 0.909364640712738), ('누굴', 0.9088926911354065), ('기준', 0.9041736125946045)]


In [51]:
print(model.wv.most_similar("최현석"))

[('성재', 0.9697535037994385), ('여경', 0.9542917013168335), ('셰프', 0.9492605328559875), ('쉐프', 0.9487934708595276), ('정지선', 0.9341416954994202), ('에드워드', 0.9108306765556335), ('안유', 0.8881122469902039), ('최고', 0.8831294775009155), ('최강', 0.877565324306488), ('명장', 0.876835823059082)]


## Save W2V model

In [52]:
model.wv.save_word2vec_format('ko_w2v')

In [53]:
!python -m gensim.scripts.word2vec2tensor --input ko_w2v --output ko_w2v

## Visualization for embedding

In [None]:
## Go to https://projector.tensorflow.org/
## and load ko_w2v_tensor.tsv and ko_w2v_metadata.tsv