### 篩選字數

In [2]:
import re
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, BertConfig

In [2]:
# 原始資料有 13355023
df = pd.read_pickle('data/reviews.pkl')
df.drop(columns=['RefValue', 'VoteUp'], inplace=True)

# 分句
split_rev = []
for review in df['Review']:
    sentences = review.splitlines()
    sentences = list(filter(None, sentences))
    tmp = []
    for sent in sentences:
        sent = re.split(r' *[\.\?!][\'"\)\]]* *', sent)
        tmp.extend(sent)
    min_words_sent = 3
    sentences = list(filter(lambda x:len(x.split())>min_words_sent, tmp))
    split_rev.append(sentences)

df['SplitReview'] = split_rev
len(df)

13355023
13355023


In [None]:
# 刪除[]，剩下10026632。有 24954 Apps/ 4885508 users
empty_list = [ i for i,x in enumerate(df['SplitReview']) if x ==[] ]
df.drop(empty_list, axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)
len(df)

In [8]:
# 選擇類別
# select_cate 中，如果刪掉RPG，在filter_user_app 不管 threshold 怎麼設定都無法穩定
def select_app_category(df):
    select_cate = ['Indie', 'Action', 'Casual', 'Adventure', 'Simulation', 'Strategy', 'RPG']
    select_cate = select_cate[:-1]
    genre = pd.read_csv('data/app_review_summary.csv')
    # game_cate(eg 'Action,Adventure,Strategy')，split 後檢查是否有任一類別在 select_cate 中
    app_list = [ genre['AppID'][i] for i,game_cates in enumerate(genre['Genres']) 
                if ([cate for cate in game_cates.split(',') if cate in select_cate] != []) ]
    df = df[ df['AppID'].isin(app_list) ]

    return df

def filter_user_app(df, app_threshold, user_threshold):
    # 篩選評論數
    app_reviews = df['AppID'].value_counts()
    filter_apps = app_reviews[app_reviews >= app_threshold].index[:]
    df = df[ df['AppID'].isin(filter_apps) ]
    user_reviews = df['UserID'].value_counts()
    filter_users = user_reviews[user_reviews >= user_threshold].index[:]
    df = df[ df['UserID'].isin(filter_users) ]
    df.reset_index(drop=True, inplace=True)

    return df

filter_data = select_app_category(df)
len(filter_data)

9531685

In [9]:
# 反覆檢查使用者評論數、遊戲評論數是否平衡
app_threshold, user_threshold, max_count = 40, 30, 35
count, min_app = 0, 1
while (min_app < app_threshold) and (count < max_count):
    print(str(count)+"\r", end="")
    filter_data = filter_user_app(filter_data, app_threshold, user_threshold)
    min_app = min(filter_data['AppID'].value_counts())
    count += 1

print('Apps: ', len(filter_data['AppID'].unique()), 'Users: ', len(filter_data['UserID'].unique()))
print('total data: ', len(filter_data))
print(min(filter_data['AppID'].value_counts()), min(filter_data['UserID'].value_counts()))

Apps:  485 Users:  882
total data:  38162
36 30


In [6]:
# save file
filter_data['Like'] = filter_data['Like'].apply(lambda x : 0 if x == False else 1)
filter_data['Like'] = filter_data['Like'].astype(int)
filter_data['UserID'] = filter_data['UserID'].astype('int64')
filter_data.to_pickle('data/reviews_38162.pkl')

<font size=5> BERT Embedding </font>

In [3]:
df = pd.read_pickle('data/reviews_38162.pkl')
df.head()

Unnamed: 0,AppID,UserID,Like,Review,SplitReview
0,730,76561197969379991,1,wingman is fun with friends.,[wingman is fun with friends]
1,730,76561198118543045,0,"Awful, toxic community and annoying game play....","[Awful, toxic community and annoying game play..."
2,730,76561197971801273,0,After playing Counterstrike: Source for 12000+...,[After playing Counterstrike: Source for 12000...
3,730,76561198084359238,1,Lots of fun to be had with this one. A lot of ...,"[Lots of fun to be had with this one, A lot of..."
4,730,76561198123845513,0,[h1] This really doesn't have the feel of spor...,[[h1] This really doesn't have the feel of spo...


In [5]:
df = pd.read_pickle('data/reviews_38162.pkl')
device = "cuda" if torch.cuda.is_available() else "cpu"
configuration = BertConfig()
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True).to(device)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# BERT embedding
review_emb_list = []
# 句字中字數的 avg=16, std = 10
MAX_LENGTH = 25

model.eval()
for review in tqdm(df['SplitReview']):
    torch.cuda.empty_cache()
    sent_emb_list = []
    for sent in review:
        sent_encode = bert_tokenizer.encode_plus(
            sent,
            add_special_tokens=True,  # Add [CLS] and [SEP]
            return_attention_mask = True,
            max_length = MAX_LENGTH,
            truncation = True,
            padding = "max_length",
            return_tensors = 'pt'
        )
        with torch.no_grad():
            outputs = model(sent_encode["input_ids"].to(device), sent_encode["attention_mask"].to(device))

        # outputs[2][-1] 與 outputs[0] 是一樣的
        sent_emb = outputs[2][-1]
        # MAX_LENGTH * 768 -> 768
        sent_emb = torch.mean(sent_emb, dim=1)
        sent_emb_list.append(sent_emb)

    review_emb = torch.cat(sent_emb_list, 0)
    # mean 會把維度 1 的地方 squeeze。5*768 算完 mean 後是 768，而非 1*768
    review_emb = torch.mean(review_emb, dim=0)
    review_emb_list.append( np.array(review_emb, dtype=np.float32) )

100%|██████████| 38162/38162 [50:17<00:00, 12.65it/s]  


In [7]:
# save file
df["ReviewEmbedding"] = review_emb_list
df.to_pickle('data/review_embedding.pkl')
df.head()

Unnamed: 0,AppID,UserID,Like,Review,SplitReview,ReviewEmbedding
0,730,76561197969379991,1,wingman is fun with friends.,[wingman is fun with friends],"[tensor(-0.1944), tensor(-0.4107), tensor(0.20..."
1,730,76561198118543045,0,"Awful, toxic community and annoying game play....","[Awful, toxic community and annoying game play...","[tensor(0.1559), tensor(-0.0969), tensor(0.063..."
2,730,76561197971801273,0,After playing Counterstrike: Source for 12000+...,[After playing Counterstrike: Source for 12000...,"[tensor(-0.0666), tensor(-0.1512), tensor(0.57..."
3,730,76561198084359238,1,Lots of fun to be had with this one. A lot of ...,"[Lots of fun to be had with this one, A lot of...","[tensor(0.0143), tensor(-0.0916), tensor(0.084..."
4,730,76561198123845513,0,[h1] This really doesn't have the feel of spor...,[[h1] This really doesn't have the feel of spo...,"[tensor(0.1439), tensor(-0.0896), tensor(0.106..."
