# [빅데이터 5조] 03. EDA
### 목차
- [3.1 코로나 확진자 추이와 스팀 게임 이용자수 데이터](#3.1-코로나-확진자-추이와-스팀-게임-이용자수-데이터)
- [3.2 언어 분리 및 EDA를 위한 언어별 1차 EDA](#3.2-언어-분리-및-EDA를-위한-언어별-1차-EDA)
- [3.3 분석 언어 별 2차 EDA](#3.3-분석-언어-별-2차-EDA)

In [3]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as preprocessing
from datetime import datetime
import gc
import os
import re

---
### 3.1 코로나 확진자 추이와 스팀 게임 이용자수 데이터

In [96]:
# ap is Average Players
ap_df = pd.read_csv("./dataset/interim_data/avg_players_sum.csv", index_col=0, encoding="utf-8-sig")
display(ap_df)

# cv is Covid
cv_df = pd.read_csv("./dataset/interim_data/covid.csv", index_col=0, encoding="utf-8-sig")
display(cv_df)

Unnamed: 0,avg_players
2012-07,527483.97
2012-08,499853.58
2012-09,527756.88
2012-10,502165.89
2012-11,585970.59
...,...
2021-05,4579001.33
2021-06,4356320.79
2021-07,4497112.90
2021-08,4628024.88


Unnamed: 0,total_confirm
2020-01-23,3
2020-01-24,9
2020-01-25,7
2020-01-26,10
2020-01-27,6
...,...
2021-11-25,589051
2021-11-26,591520
2021-11-27,409868
2021-11-28,427210


In [97]:
# 스팀 평균 이용자 수 데이터를 확진자 추이와 비교하기 위한 데이터 처리
start_date = "2018-05" # 데이터 수집 단계에서 계산한 분석 시작일
ap_df = ap_df[ap_df.index >= start_date]

# 시계열 데이터 형식을 수정(월별 평균 수치의 의미를 표현하기 위해 15일로 설정)
ap_df.index = [date+"-15" for date in ap_df.index]

# 최소 이용자수, 최대 이용자수의 범위가 0~1의 범위를 갖는 스케일로 정규화
preprocessor = preprocessing.MinMaxScaler()
preprocessor.fit(ap_df)
output = preprocessor.transform(ap_df)
ap_df = pd.DataFrame(output, columns=ap_df.columns, index=list(ap_df.index.values))

display(ap_df)
ap_df.to_csv("./dataset/interim_data/avg_players_scale.csv", encoding="utf-8-sig")

Unnamed: 0,avg_players
2018-05-15,0.160299
2018-06-15,0.226254
2018-07-15,0.176797
2018-08-15,0.166021
2018-09-15,0.101468
2018-10-15,0.012089
2018-11-15,0.017478
2018-12-15,0.189842
2019-01-15,0.245099
2019-02-15,0.151856


In [98]:
# 더미 값을 추가하여 게임 이용자수 데이터와의 기간을 맞춤
dummy_range = pd.date_range(start_date, cv_df.index.min(), freq="D")[:-1]
dummy_df = pd.DataFrame(columns=["total_confirm"], index=[date.strftime("%Y-%m-%d") for date in dummy_range])
dummy_df = dummy_df.fillna(0)
cv_df = pd.concat([dummy_df, cv_df[cv_df.index <= "2021-09-15"]])

# 확진자 수의 범위가 0~1의 범위를 갖는 스케일로 정규화
preprocessor = preprocessing.MinMaxScaler()
preprocessor.fit(cv_df)
output = preprocessor.transform(cv_df)
cv_df = pd.DataFrame(output, columns=cv_df.columns, index=list(cv_df.index.values))

display(cv_df)
cv_df.to_csv("./dataset/interim_data/covid_scale.csv", encoding="utf-8-sig")

Unnamed: 0,total_confirm
2018-05-01,0.000000
2018-05-02,0.000000
2018-05-03,0.000000
2018-05-04,0.000000
2018-05-05,0.000000
...,...
2021-09-11,0.504235
2021-09-12,0.403825
2021-09-13,0.654788
2021-09-14,0.601144


---
### 3.2 언어 분리 및 언어별 1차 EDA

In [3]:
# tr is Total Review
tr_df = pd.read_csv("./dataset/total_reviews.csv", encoding="utf-8")

display(tr_df)

Unnamed: 0,app_id,app_name,recommendationid,language,review,voted_up,timestamp_created,quadrant
0,292030,The Witcher 3: Wild Hunt,85185598,schinese,不玩此生遗憾，RPG游戏里的天花板，太吸引人了,True,1611381629,1
1,292030,The Witcher 3: Wild Hunt,85185250,schinese,拔DIAO无情打桩机--杰洛特!!!,True,1611381030,1
2,292030,The Witcher 3: Wild Hunt,85185111,schinese,巫师3NB,True,1611380800,1
3,292030,The Witcher 3: Wild Hunt,85184605,english,"One of the best RPG's of all time, worthy of a...",True,1611379970,1
4,292030,The Witcher 3: Wild Hunt,85184287,schinese,大作,True,1611379427,1
...,...,...,...,...,...,...,...,...
32154266,99910,Puzzle Pirates,3692155,english,This is not even a game,True,1314818724,2
32154267,99910,Puzzle Pirates,4498620,english,SHIVER ME TIMBERS! ...Why does steam give me t...,False,1314818654,3
32154268,99910,Puzzle Pirates,276151,english,Highly recommended blend of MMO and puzzles. T...,True,1314814692,2
32154269,99910,Puzzle Pirates,546554,english,Yarrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr!,True,1314812971,2


In [4]:
# 언어권 별 분리
gb_tr_df = tr_df.groupby("language")

display(gb_tr_df.count()[["app_id", "review"]].sort_values(by="review", ascending=False, na_position="last"))

Unnamed: 0_level_0,app_id,review
language,Unnamed: 1_level_1,Unnamed: 2_level_1
english,14765410,14765410
schinese,5049618,5049618
russian,3436895,3436895
portuguese,1385678,1385678
spanish,1251927,1251927
german,1132888,1132888
turkish,938469,938469
koreana,828865,828865
french,811982,811982
polish,745447,745447


In [None]:
# 1차 언어 선별
# 표본수, 언어 체계의 난이도 등을 고려하여 1차 EDA 대상에서 제외
# - turkish, polish, czech, latam, dutch, hungarian, danish, romanian, greek, bulgarian, vietnamese

single_lang_list = [gb_tr_df.get_group(x) for x in gb_tr_df.groups]
target_lang_list = ["koreana", "japanese", "tchinese", "schinese", "thai",
                    "english", "french", "german", "italian", "norwegian", "swedish", "polish"
                    "portuguese", "spanish", "russian"]

In [None]:
# 중간 데이터 형태의 전처리 데이터 디렉토리 생성
try:
    if not os.path.exists("./dataset/preprocessed"):
        os.makedirs("./dataset/preprocessed")
except OSError as e:
        print(e)
        
try:
    if not os.path.exists("./dataset/preprocessed/default"):
        os.makedirs("./dataset/preprocessed/default")
except OSError as e:
        print(e)

# 언어 별 리뷰 내용 자체를 일괄적으로 전처리하는 함수
def preprocess_review(review):
    # 명시적 형변환
    review = str(review)
    
    # 개행문자 제거
    review.replace("\n", " ")
    
    # 스팀 전용 태그 삭제(e.g. [u][\u]...)
    review = re.sub(r"[.+?]", "", review)
    
    # html 태그 삭제
    review = re.sub(r"<.+?>", "", review)
    
    return review if review else None
        
# sl is Single Language
for sl_df in single_lang_list:
    language = sl_df.iloc[0]["language"]       

    sl_df["review"] = sl_df[["review"]].apply(
        lambda x: preprocess_review(x["review"]), axis=1
    )

    sl_df.to_csv(f"./dataset/preprocessed/default/{str(language)}.csv", encoding="utf-8-sig", index=False)

    # 명시적 메모리 관리
    del [[sl_df]]
    gc.collect()
    sl_df = pd.DataFrame()

In [12]:
# 저장 결과 확인
for i, file in enumerate(os.listdir("./dataset/preprocessed/default")):
    print(str(i+1).zfill(2), file)

01 english.csv
02 finnish.csv
03 french.csv
04 german.csv
05 italian.csv
06 japanese.csv
07 koreana.csv
08 norwegian.csv
09 portuguese.csv
10 russian.csv
11 schinese.csv
12 spanish.csv
13 swedish.csv
14 tchinese.csv
15 thai.csv


In [10]:
# 2차 언어 선별
# 언어별 리뷰의 샘플 데이터를 통해 약식 EDA 진행 후, 2차 EDA와 데이터 분석 및 시각화를 진행할 언어 선별
# koreana, japanese, schinese, thai, english, german
target_lang_list = ["koreana", "japanese", "schinese", "thai", "english", "german"]

# 2차 EDA를 진행할 언어별로 분석에 필요한 값들을 데이터프레임 형식으로 작성
lang_data_dict = {
    # 언어 코드
    "code" : {
        "koreana": "ko",
        "japanese": "ja",
        "schinese": "zh-cn",
        "thai": "th",
        "english": "en",
        "german": "de",
    },
    # 언어 별 폰트
    "font_path" : {
        "koreana": "./font/NotoSansKR-Bold.otf",
        "japanese": "./font/NotoSansJP-Bold.otf",
        "schinese": "./font/NotoSansSC-Bold.otf",
        "thai": "./font/Itim-Regular.ttf",
        "english": "./font/NotoSans-Bold.ttf",
        "german": "./font/NotoSans-Bold.ttf",
    },
    # 유니코드를 활용한 정규식 패턴
    "unicode_pattern": {
        "koreana": r"[^\uAC00-\uD7AF\u1100-\u11FF ]+",
        "japanese": r"[^\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u4E00-\u9FD5 ]+",
        "schinese": r"[^\u4E00-\u9FFF ]+",
        "thai": r"[^\u0E00-\u0E7F ]+",
        "english": r"[^A-Za-z' ]+", # 띄어쓰기 중요, 다른 언어도 공백 유지 추가
        "german": r"[^A-Za-z\u00C4\u00D6\u00DC\u00DF\u00E4\u00F6\u00FC ]+",
    }
}

# 원하는 값을 닷 연산자로 직관적으로 접근하기 위한 데이터프레임 생성
lang_data = pd.DataFrame([], columns=lang_data_dict.keys(), index=target_lang_list)

for lang in lang_data.index:
    lang_data.loc[lang] = [lang_data_dict[column][lang] for column in lang_data.columns]

display(lang_data)

# 언어별 자연어 처리 후 중간 데이터 저장 경로 생성
try:
    if not os.path.exists("./dataset/preprocessed/nlp"):
        os.makedirs("./dataset/preprocessed/nlp")
except OSError as e:
        print(e)

Unnamed: 0,code,font_path,unicode_pattern
koreana,ko,./font/NotoSansKR-Bold.otf,[^\uAC00-\uD7AF\u1100-\u11FF ]+
japanese,ja,./font/NotoSansJP-Bold.otf,[^\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u4E0...
schinese,zh-cn,./font/NotoSansSC-Bold.otf,[^\u4E00-\u9FFF ]+
thai,th,./font/Itim-Regular.ttf,[^\u0E00-\u0E7F ]+
english,en,./font/NotoSans-Bold.ttf,[^A-Za-z' ]+
german,de,./font/NotoSans-Bold.ttf,[^A-Za-z\u00C4\u00D6\u00DC\u00DF\u00E4\u00F6\u...


**2차 EDA를 진행하지 않은 언어들의 약식 EDA 자료 목록**
- [finnish](./onlyEDA/finnish.ipynb)
- [french](./onlyEDA/french.ipynb)
- [italian](./onlyEDA/italian.ipynb)
- [norwegian](./onlyEDA/norwegian.ipynb)
- [portuguese](./onlyEDA/portuguese.ipynb)
- [russian](./onlyEDA/russian.ipynb)
- [spanish](./onlyEDA/spanish.ipynb)
- [swedish](./onlyEDA/swedish.ipynb)
- [tchinese](./onlyEDA/tchinese.ipynb)

---
### 3.3 분석 언어 별 2차 EDA

#### 3.2.1 koreana

In [46]:
# 언어 설정
lang = "koreana"

# 언어별 자연어 처리 패키지 설정
import konlpy
from konlpy.tag import Komoran
konlpy.jvm.init_jvm(jvmpath=None, max_heap_size=8192)
komoran = Komoran()

# 전처리된 리뷰 데이터 로드
print(f"[{lang}] - before", end=" ")
review_df = pd.read_csv(f"./dataset/preprocessed/default/{lang}.csv", encoding="utf-8-sig")
print(review_df.shape)
display(review_df.head(5))

# 언어별 명사 추출 함수
def nlp_kr(review):    
    # 텍스트 형태의 데이터가 float으로 불러와지는 경우를 방지하기 위해 명시적으로 형변환
    review = str(review)
    
    # 정규표현식을 활용한 문자 추출
    pattern = lang_data.unicode_pattern[lang]
    review = re.sub(pattern, "", review)
    
    # 명사 추출
    nouns_list = komoran.nouns(review)
    
    if nouns_list:
        return " ".join(nouns_list)
    else:
        # 필터링 결과가 공백인 경우 명시적으로 결측치 처리
        return np.nan

review_df["review"] = review_df[["review"]].apply(
    lambda x: nlp_kr(x["review"]), axis=1
)

# 명사 추출 결과가 결측치인 행 제거
review_df = review_df.dropna(axis=0, how="any")

# 명사 추출 결과
print(f"[{lang}] - after", end=" ")
print(review_df.shape)
display(review_df.head(5))
review_df.to_csv(f"./dataset/preprocessed/nlp/{lang}.csv", index=False, encoding="utf-8-sig")

[koreana] - before (828865, 8)


Unnamed: 0,app_id,app_name,recommendationid,language,review,voted_up,timestamp_created,quadrant
0,292030,The Witcher 3: Wild Hunt,85181114,koreana,가성비 짱,True,1611374195,1
1,292030,The Witcher 3: Wild Hunt,85175979,koreana,말이 필요한가,True,1611366159,1
2,292030,The Witcher 3: Wild Hunt,85152637,koreana,명불허전 최고의 RPG 2회차는 필수!!,True,1611333951,1
3,292030,The Witcher 3: Wild Hunt,85135603,koreana,스토리는 좋다는데 왜 이 하기꺼림직한 느낌은뭐지,False,1611311582,4
4,292030,The Witcher 3: Wild Hunt,85132993,koreana,스토리가 좋네요 부가 퀘스트도 스토리가 좋아 영화를 보는 기분이네요,True,1611307351,1


[koreana] - after (652516, 8)


Unnamed: 0,app_id,app_name,recommendationid,language,review,voted_up,timestamp_created,quadrant
0,292030,The Witcher 3: Wild Hunt,85181114,koreana,성비,True,1611374195,1
1,292030,The Witcher 3: Wild Hunt,85175979,koreana,말 필요,True,1611366159,1
2,292030,The Witcher 3: Wild Hunt,85152637,koreana,명불허전 최고 회차 필수,True,1611333951,1
3,292030,The Witcher 3: Wild Hunt,85135603,koreana,스토리 느낌 지,False,1611311582,4
4,292030,The Witcher 3: Wild Hunt,85132993,koreana,스토리 부가 퀘스트 스토리 영화 기분,True,1611307351,1


#### 3.2.2 japanese

In [91]:
# 언어 설정
lang = "japanese"

# 언어별 자연어 처리 패키지 설정
import nagisa

# 전처리된 리뷰 데이터 로드
print(f"[{lang}] - before", end=" ")
review_df = pd.read_csv(f"./dataset/preprocessed/default/{lang}.csv", encoding="utf-8-sig")
print(review_df.shape)
display(review_df.head(5))

# 언어별 명사 추출 함수
def nlp_ja(review):    
    # 텍스트 형태의 데이터가 float으로 불러와지는 경우를 방지하기 위해 명시적으로 형변환
    review = str(review)
    
    # 정규표현식을 활용한 문자 추출
    pattern = lang_data.unicode_pattern[lang]
    review = re.sub(pattern, "", review)
    
    # 명사 추출
    result = nagisa.tagging(review)
    nouns_list = [w[0] for w in zip(result.words, result.postags) if w[1] in ["名詞"]]
    
    if nouns_list:
        return " ".join(nouns_list)
    else:
        # 필터링 결과가 공백인 경우 명시적으로 결측치 처리
        return np.nan

review_df["review"] = review_df[["review"]].apply(
    lambda x: nlp_ja(x["review"]), axis=1
)

# 명사 추출 결과가 결측치인 행 제거
review_df = review_df.dropna(axis=0, how="any")

# 명사 추출 결과
print(f"[{lang}] - after", end=" ")
print(review_df.shape)
display(review_df.head(5))
review_df.to_csv(f"./dataset/preprocessed/nlp/{lang}.csv", index=False, encoding="utf-8-sig")

[japanese] - before (131979, 8)


Unnamed: 0,app_id,app_name,recommendationid,language,review,voted_up,timestamp_created,quadrant
0,292030,The Witcher 3: Wild Hunt,84878383,japanese,1UI全般の使い勝手がかなり悪い 2西洋RPGとしての自由度が低い、マップの探索と成長システ...,False,1610923369,4
1,292030,The Witcher 3: Wild Hunt,84722763,japanese,ストーリーの量やキャラクターの個性が十分過ぎます。大体2時間ぐらいレベルだと10ぐらいで操作...,True,1610710969,1
2,292030,The Witcher 3: Wild Hunt,84531160,japanese,1,True,1610425484,1
3,292030,The Witcher 3: Wild Hunt,84403813,japanese,大人向け硬派なRPGの傑作 モンスタースレイヤー「ウィッチャー」となり、人々を助け、仲間を増...,True,1610250399,1
4,292030,The Witcher 3: Wild Hunt,84349385,japanese,JRPGしかやったことなく洋ゲーに手を出してみたい方におすすめですね,True,1610187216,1


[japanese] - after (112804, 8)


Unnamed: 0,app_id,app_name,recommendationid,language,review,voted_up,timestamp_created,quadrant
0,292030,The Witcher 3: Wild Hunt,84878383,japanese,全般 勝手 西洋 自由 度 マップ 探索 成長 システム 単調 アクション 操作 戦闘 シス...,False,1610923369,4
1,292030,The Witcher 3: Wild Hunt,84722763,japanese,ストーリー 量 キャラクター 個性 大体 時間 レベル 操作 一気 込み 周 越え プレイ ...,True,1610710969,1
2,292030,The Witcher 3: Wild Hunt,84403813,japanese,大人 硬派 傑作 モンスター スレイヤー ウィッチャー 仲間 娘 剣 魔法 アクション とこ...,True,1610250399,1
3,292030,The Witcher 3: Wild Hunt,84349385,japanese,こと 洋 ゲー 手 方,True,1610187216,1
4,292030,The Witcher 3: Wild Hunt,84272946,japanese,洋 ゲー,True,1610089599,1


#### 3.2.3 tchinese

In [49]:
# 언어 설정
lang = "tchinese"

# 언어별 자연어 처리 패키지 설정
import jieba
import jieba.posseg as pseg

# 전처리된 리뷰 데이터 로드
print(f"[{lang}] - before", end=" ")
review_df = pd.read_csv(f"./dataset/preprocessed/default/{lang}.csv", encoding="utf-8-sig")
print(review_df.shape)
display(review_df.head(5))

# 언어별 명사 추출 함수
def nlp_tc(review):    
    # 명시적 형변환
    review = str(review)
    
    # 정규표현식을 활용한 문자 추출
    pattern = lang_data.unicode_pattern[lang]
    review = re.sub(pattern, "", review)
    
    # 명사 추출
    nouns_list = [w.word for w in pseg.cut(review) if w.flag in ["n", "f", "s", "nr", "ns", "nt", "nw", "nz"]]

    if nouns_list:
        return " ".join(nouns_list)
    else:
        # 필터링 결과가 공백인 경우 명시적으로 결측치 처리
        return np.nan

review_df["review"] = review_df[["review"]].apply(
    lambda x: nlp_tc(x["review"]), axis=1
)

# 명사 추출 결과가 결측치인 행 제거
review_df = review_df.dropna(axis=0, how="any")

# 명사 추출 결과
print(f"[{lang}] - after", end=" ")
print(review_df.shape)
display(review_df.head(5))
review_df.to_csv(f"./dataset/preprocessed/nlp/{lang}.csv", index=False, encoding="utf-8-sig")

[tchinese] - before (302175, 8)


Unnamed: 0,app_id,app_name,recommendationid,language,review,voted_up,timestamp_created,quadrant
0,292030,The Witcher 3: Wild Hunt,85146124,tchinese,"mother fucker,dont need to say",True,1611326262,1
1,292030,The Witcher 3: Wild Hunt,85103489,tchinese,fantastic game,True,1611256305,1
2,292030,The Witcher 3: Wild Hunt,85087359,tchinese,讚!!!!,True,1611233827,1
3,292030,The Witcher 3: Wild Hunt,85086695,tchinese,好玩 ~玩了就上癮,True,1611232883,1
4,292030,The Witcher 3: Wild Hunt,85073603,tchinese,一年前玩完一周目後，仍然會常常想起在巫師3世界中遊歷探險的美好回憶 一周目把問號全踩完，只可...,True,1611212953,1


[tchinese] - after (191855, 8)


Unnamed: 0,app_id,app_name,recommendationid,language,review,voted_up,timestamp_created,quadrant
0,292030,The Witcher 3: Wild Hunt,85073603,tchinese,前 後 巫師 世界 歷探險 問號 昆特 牌 時候 二周目,True,1611212953,1
1,292030,The Witcher 3: Wild Hunt,84906637,tchinese,遊戲 標桿 級別 遊戲 內容 前提 下 魔幻 世界 大量 細節 整體 世界 支線 任務 用心...,True,1610971238,1
2,292030,The Witcher 3: Wild Hunt,84903896,tchinese,世界,True,1610967149,1
3,292030,The Witcher 3: Wild Hunt,84824402,tchinese,世界 问题 剧情,True,1610849388,1
4,292030,The Witcher 3: Wild Hunt,84812134,tchinese,內容 瀾國 寶 白狼 尋女 路,True,1610831753,1


#### 3.2.4 schinese

In [51]:
# 언어 설정
lang = "schinese"

# 언어별 자연어 처리 패키지 설정
import jieba
import jieba.posseg as pseg

# 전처리된 리뷰 데이터 로드
print(f"[{lang}] - before", end=" ")
review_df = pd.read_csv(f"./dataset/preprocessed/default/{lang}.csv", encoding="utf-8-sig")
print(review_df.shape)
display(review_df.head(5))

# 언어별 명사 추출 함수
def nlp_sc(review):    
    # 명시적 형변환
    review = str(review)
    
    # 정규표현식을 활용한 문자 추출
    pattern = lang_data.unicode_pattern[lang]
    review = re.sub(pattern, "", review)
    
    # 명사 추출
    nouns_list = [w.word for w in pseg.cut(review) if w.flag in ["n", "f", "s", "nr", "ns", "nt", "nw", "nz"]]

    if nouns_list:
        return " ".join(nouns_list)
    else:
        # 필터링 결과가 공백인 경우 명시적으로 결측치 처리
        return np.nan

review_df["review"] = review_df[["review"]].apply(
    lambda x: nlp_sc(x["review"]), axis=1
)

# 명사 추출 결과가 결측치인 행 제거
review_df = review_df.dropna(axis=0, how="any")

# 명사 추출 결과
print(f"[{lang}] - after", end=" ")
print(review_df.shape)
display(review_df.head(5))
review_df.to_csv(f"./dataset/preprocessed/nlp/{lang}.csv", index=False, encoding="utf-8-sig")

[schinese] - before (5049618, 8)


Unnamed: 0,app_id,app_name,recommendationid,language,review,voted_up,timestamp_created,quadrant
0,292030,The Witcher 3: Wild Hunt,85185598,schinese,不玩此生遗憾，RPG游戏里的天花板，太吸引人了,True,1611381629,1
1,292030,The Witcher 3: Wild Hunt,85185250,schinese,拔DIAO无情打桩机--杰洛特!!!,True,1611381030,1
2,292030,The Witcher 3: Wild Hunt,85185111,schinese,巫师3NB,True,1611380800,1
3,292030,The Witcher 3: Wild Hunt,85184287,schinese,大作,True,1611379427,1
4,292030,The Witcher 3: Wild Hunt,85183227,schinese,年度最佳的作品，没啥好夸的，神作,True,1611377703,1


[schinese] - after (3690368, 8)


Unnamed: 0,app_id,app_name,recommendationid,language,review,voted_up,timestamp_created,quadrant
0,292030,The Witcher 3: Wild Hunt,85185598,schinese,游戏 里 天花板 人,True,1611381629,1
1,292030,The Witcher 3: Wild Hunt,85185250,schinese,无情 打桩机 杰,True,1611381030,1
2,292030,The Witcher 3: Wild Hunt,85185111,schinese,巫师,True,1611380800,1
3,292030,The Witcher 3: Wild Hunt,85184287,schinese,大作,True,1611379427,1
4,292030,The Witcher 3: Wild Hunt,85183227,schinese,年度 作品 神作,True,1611377703,1


#### 3.2.5 thai

In [53]:
# 언어 설정
lang = "thai"

# 언어별 자연어 처리 패키지 설정
import pythainlp
from pythainlp.tag import pos_tag

# 전처리된 리뷰 데이터 로드
print(f"[{lang}] - before", end=" ")
review_df = pd.read_csv(f"./dataset/preprocessed/default/{lang}.csv", encoding="utf-8-sig")
print(review_df.shape)
display(review_df.head(5))

# 언어별 명사 추출 함수
def nlp_th(review):    
    # 명시적 형변환
    review = str(review)
    
    # 정규표현식을 활용한 문자 추출
    pattern = lang_data.unicode_pattern[lang]
    review = re.sub(pattern, "", review)
    
    # icu engine를 활용한 토크나이징
    token_list = pythainlp.tokenize.word_tokenize(review, engine="icu")
    
    # 명사 추출
    noun_list = [x[0] for x in pos_tag(token_list,  corpus="pud") if x[1] == "NOUN"]

    if nouns_list:
        return " ".join(nouns_list)
    else:
        # 필터링 결과가 공백인 경우 명시적으로 결측치 처리
        return np.nan

review_df["review"] = review_df[["review"]].apply(
    lambda x: nlp_th(x["review"]), axis=1
)

# 명사 추출 결과가 결측치인 행 제거
review_df = review_df.dropna(axis=0, how="any")

# 명사 추출 결과
print(f"[{lang}] - after", end=" ")
print(review_df.shape)
display(review_df.head(5))
review_df.to_csv(f"./dataset/preprocessed/nlp/{lang}.csv", index=False, encoding="utf-8-sig")

[thai] - before (169125, 8)


Unnamed: 0,app_id,app_name,recommendationid,language,review,voted_up,timestamp_created,quadrant
0,292030,The Witcher 3: Wild Hunt,85105828,thai,หนุกมาก,True,1611259493,1
1,292030,The Witcher 3: Wild Hunt,85089487,thai,เนื้อเรื่องนี่ดีเยี่ยมครับ สมกับเกมส์ขึ้นหิ้ง\n,True,1611236686,1
2,292030,The Witcher 3: Wild Hunt,85072709,thai,เกมเพลย์\nตอนแรกผมรู้สึกขัด ๆ กับความเยอะเอวรี...,True,1611211218,1
3,292030,The Witcher 3: Wild Hunt,85003706,thai,เข้ามาครั้งแรกเลเวล1 ไปหาผีบ่อน้ำเลย โดนผีตบแร...,True,1611100161,1
4,292030,The Witcher 3: Wild Hunt,84965262,thai,เยี่ยมไปเลยฮ้าบบบบบ เดินตีไพ่กับ npc หนุกกว่าล...,True,1611046355,1


[thai] - after (102703, 8)


Unnamed: 0,app_id,app_name,recommendationid,language,review,voted_up,timestamp_created,quadrant
0,292030,The Witcher 3: Wild Hunt,85105828,thai,หนุ,True,1611259493,1
1,292030,The Witcher 3: Wild Hunt,85089487,thai,เนื้อ เรื่อง,True,1611236686,1
2,292030,The Witcher 3: Wild Hunt,85072709,thai,เกม ตอน สึก ความ รี่ ติง เกม เคว สห ยับ เยิน แ...,True,1611211218,1
3,292030,The Witcher 3: Wild Hunt,85003706,thai,ครั้ง ผี บ่อน้ำ ผี ตบ เกม ระบบ ต่อสู้ สุดท้าย ...,True,1611100161,1
4,292030,The Witcher 3: Wild Hunt,84965262,thai,บบบบบ ไพ่ หนุ ล่า มอ นอีกเคว สก็,True,1611046355,1


#### 3.2.5 english

In [56]:
# 언어 설정
lang = "english"

# 언어별 자연어 처리 패키지 설정
import nltk
from nltk.tokenize import TreebankWordTokenizer # 유니코드 대체 대신 토큰화->품사태깅 방식 차용
from nltk.stem import WordNetLemmatizer
tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()

# 전처리된 리뷰 데이터 로드
print(f"[{lang}] - before", end=" ")
review_df = pd.read_csv(f"./dataset/preprocessed/default/{lang}.csv", encoding="utf-8-sig")
print(review_df.shape)
display(review_df.head(5))

# 언어별 명사 추출 함수
def nlp_en(review):    
    # 명시적 형변환
    review = str(review)
    
    # 정규표현식을 활용한 문자 추출
    pattern = lang_data.unicode_pattern[lang]
    review = re.sub(pattern, "", review)
    
    # 소문자 전환
    review = review.lower()

    # 문장 토큰화
    token_list = tokenizer.tokenize(review)
    
    # 표제어 추출(lemmatization)
    lemma_list = [lemmatizer.lemmatize(x) for x in token_list]
    
    # 어간 추출(stemming)
    # nltk의 어간 추출은 단어 원형이 깨지는 경우가 있고, 많은 리뷰에서 일일히 스태머끼리 비교를 할 수 없기에 제외
    
    # 품사 태깅
    pos_list = nltk.pos_tag(lemma_list)
    
    # 명사 추출
    nouns_list = [x[0] for x in pos_list if x[1] in ["NN", "NNS", "NNP", "NNPS"]]
    
    if nouns_list:
        return " ".join(nouns_list)
    else:
        # 필터링 결과가 공백인 경우 명시적으로 결측치 처리
        return np.nan

review_df["review"] = review_df[["review"]].apply(
    lambda x: nlp_en(x["review"]), axis=1
)

# 명사 추출 결과가 결측치인 행 제거
review_df = review_df.dropna(axis=0, how="any")

# 명사 추출 결과
print(f"[{lang}] - after", end=" ")
print(review_df.shape)
display(review_df.head(5))
review_df.to_csv(f"./dataset/preprocessed/nlp/{lang}.csv", index=False, encoding="utf-8-sig")

[english] - before (14765410, 8)


Unnamed: 0,app_id,app_name,recommendationid,language,review,voted_up,timestamp_created,quadrant
0,292030,The Witcher 3: Wild Hunt,85184605,english,"One of the best RPG's of all time, worthy of a...",True,1611379970,1
1,292030,The Witcher 3: Wild Hunt,85184171,english,"good story, good graphics lots to do",True,1611379264,1
2,292030,The Witcher 3: Wild Hunt,85184064,english,"dis gud,",True,1611379091,1
3,292030,The Witcher 3: Wild Hunt,85180436,english,favorite game of all time cant wait for the Ne...,True,1611373086,1
4,292030,The Witcher 3: Wild Hunt,85179753,english,Why wouldn't you get this,True,1611371978,1


[english] - after (13498023, 8)


Unnamed: 0,app_id,app_name,recommendationid,language,review,voted_up,timestamp_created,quadrant
0,292030,The Witcher 3: Wild Hunt,85184605,english,rpg time worthy collection,True,1611379970,1
1,292030,The Witcher 3: Wild Hunt,85184171,english,story graphic lot,True,1611379264,1
2,292030,The Witcher 3: Wild Hunt,85184064,english,dis gud,True,1611379091,1
3,292030,The Witcher 3: Wild Hunt,85180436,english,game time wait nexgen versiion,True,1611373086,1
4,292030,The Witcher 3: Wild Hunt,85179341,english,worth,True,1611371318,1


#### 3.2.6 german

In [61]:
# 언어 설정
lang = "german"

# 언어별 자연어 처리 패키지 설정
from textblob_de import TextBlobDE

# 전처리된 리뷰 데이터 로드
print(f"[{lang}] - before", end=" ")
review_df = pd.read_csv(f"./dataset/preprocessed/default/{lang}2.csv", encoding="utf-8-sig")
print(review_df.shape)
display(review_df.head(5))

# 언어별 명사 추출 함수
def nlp_de(review):    
    # 명시적 형변환
    review = str(review)
    
    # 정규표현식을 활용한 문자 추출
    pattern = lang_data.unicode_pattern[lang]
    review = re.sub(pattern, "", review)
    
    # 소문자 전환
    review = review.lower()
    
    # 명사 추출
    pos_list = TextBlobDE(review).tags
    nouns_list = [w[0].lower() for w in pos_list if w[1] in ["NN", "NNS", "NNP", "NNPS"]]
    
    if nouns_list:
        return " ".join(nouns_list)
    else:
        # 필터링 결과가 공백인 경우 명시적으로 결측치 처리
        return np.nan

review_df["review"] = review_df[["review"]].apply(
    lambda x: nlp_de(x["review"]), axis=1
)

# 명사 추출 결과가 결측치인 행 제거
review_df = review_df.dropna(axis=0, how="any")

# 명사 추출 결과
print(f"[{lang}] - after", end=" ")
print(review_df.shape)
display(review_df.head(5))
review_df.to_csv(f"./dataset/preprocessed/nlp/{lang}2.csv", index=False, encoding="utf-8-sig")

[german] - before (1132888, 8)


Unnamed: 0,app_id,app_name,recommendationid,language,review,voted_up,timestamp_created,quadrant
0,292030,The Witcher 3: Wild Hunt,85170372,german,best game ever,True,1611357157,1
1,292030,The Witcher 3: Wild Hunt,85152206,german,cool,True,1611333436,1
2,292030,The Witcher 3: Wild Hunt,85151764,german,top,True,1611332948,1
3,292030,The Witcher 3: Wild Hunt,85147517,german,si,True,1611327896,1
4,292030,The Witcher 3: Wild Hunt,85131518,german,macht viel spaß und ist anfängerfreundlich,True,1611304864,1


[german] - after (1033582, 8)


Unnamed: 0,app_id,app_name,recommendationid,language,review,voted_up,timestamp_created,quadrant
0,292030,The Witcher 3: Wild Hunt,85170372,german,game ever,True,1611357157,1
1,292030,The Witcher 3: Wild Hunt,85152206,german,cool,True,1611333436,1
2,292030,The Witcher 3: Wild Hunt,85151764,german,top,True,1611332948,1
3,292030,The Witcher 3: Wild Hunt,85147517,german,si,True,1611327896,1
4,292030,The Witcher 3: Wild Hunt,85131518,german,spaß,True,1611304864,1
