In [95]:
!pip install transformers



In [96]:
!pip install adabelief_pytorch



In [97]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [98]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [99]:
import os
import re
import numpy as np
from glob import glob
import json
import requests
import tensorflow as tf
from transformers import BertModel, TFBertModel, TFRobertaModel, RobertaTokenizer, BertTokenizerFast, AlbertModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, AutoModelForSequenceClassification
import pandas as pd
import matplotlib.pyplot as plt
from adabelief_pytorch import AdaBelief
from transformers.optimization import get_cosine_schedule_with_warmup
from tqdm import tqdm, tqdm_notebook
import shutil
import gc
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from tqdm.notebook import tqdm
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold


In [100]:
L_RATE = 1e-5
MAX_LEN = 45
max_grad_norm=1
log_interval=200
NUM_CORES = os.cpu_count()
device = torch.device("cuda:0")

In [101]:
class TestDataset(Dataset):
    def __init__(self, df,tokenizer):
        self.df_data = df
        self.tokenizer = tokenizer
    def __getitem__(self, index):
        # get the sentence from the dataframe
        sentence = self.df_data.loc[index, 'data']
        encoded_dict = self.tokenizer(
          text = sentence,
          add_special_tokens = True, 
          max_length = MAX_LEN,
          pad_to_max_length = True,
          truncation=True,           # Pad & truncate all sentences.
          return_tensors="pt")

        padded_token_list = encoded_dict['input_ids'][0]
        token_type_id = encoded_dict['token_type_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]
        sample = (padded_token_list, token_type_id , att_mask)
        return sample
    def __len__(self):
        return len(self.df_data)

class NewsSubjectClassifier(nn.Module):
  def __init__(self, n_classes):
    super(NewsSubjectClassifier, self).__init__()
    self.bert = AlbertModel.from_pretrained("kykim/albert-kor-base")
    self.drop = nn.Dropout(p=0.5)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
  def forward(self, input_ids, attention_mask):
    _, pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False
    )
    output = self.drop(pooled_output)
    return self.out(output)
  
class NewsSubjectDataset(Dataset):
  def __init__(self, subjects, targets, tokenizer, max_len):
    self.subjects = subjects
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len
  def __len__(self):
    return len(self.subjects)
  def __getitem__(self, item):
    subject = str(self.subjects[item])
    target = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      subject,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding = 'max_length',
      truncation = True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    return {
      'subject_text': subject,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

def model_load(model_location, bert_or_albert):

  if bert_or_albert == 'bert':
    tokenizer_albert_kor_base = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")
  elif bert_or_albert == 'albert':
    tokenizer_albert_kor_base = BertTokenizerFast.from_pretrained("kykim/albert-kor-base")
  else:
    print('실행 불가')
    return
  device = torch.device("cuda:0")
  model = torch.load(model_location)
  return model


def model_test(data, model, model_type):#데이터 모델 모델타입순 모델타입으론 roberta, electra 사용
    preds = [] 
    if model_type == 'roberta':
        tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large", cache_dir='bert_ckpt', do_lower_case=False)
    elif model_type == 'electra':   
        tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator", cache_dir='bert_ckpt', do_lower_case=False)
    else:
        print("error")
        return 0
    test_data = TestDataset(data,tokenizer)
    test_dataloader = torch.utils.data.DataLoader(test_data,shuffle=False,num_workers=NUM_CORES)
    model.eval()
    torch.set_grad_enabled(False)
    for batch_id, (input_id,token_type_id,attention_mask) in enumerate(tqdm(test_dataloader)):
        input_id = input_id.long().to(device)
        token_type_id = token_type_id.long().to(device)
        attention_mask = attention_mask.long().to(device)
        outputs = model(input_ids=input_id, token_type_ids=token_type_id, attention_mask=attention_mask)
        out = outputs[0]
        for inp in out:
            preds.append(inp.detach().cpu().numpy())
    Preds = np.array(preds)
    return Preds

def model_use(data, model, bert_or_albert):
    model.eval()
    if bert_or_albert == 'bert':
        tokenizer = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")
    elif bert_or_albert == 'albert':
        tokenizer = BertTokenizerFast.from_pretrained("kykim/albert-kor-base")
    else:
        print("error")
        return 0
    
    ds = NewsSubjectDataset(
        subjects=data.to_numpy(),
        targets=np.zeros(len(data)),
        tokenizer=tokenizer,
        max_len=32
        )
    dl = DataLoader(ds)
    device = torch.device("cuda:0")
    
    subject_texts = []
    predictions = []
    prediction_probs = []
    with torch.no_grad():
        for d in tqdm(dl):
            texts = d["subject_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            subject_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(outputs)
    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()

    return prediction_probs

#모델생성
model_location = "/content/drive/My Drive/3조/DS_folder/jupyter notebook/Data/model/"###드라이브 위치 지정
model_albert = model_load(model_location + 'model_albert_kor_base.pth', 'albert')
model_bert = model_load(model_location + 'model_bert_kor_base.pth', 'bert').cuda()
model_Roberta = torch.load(model_location+'Roberta_329.pt')
model_KoELECTRA_113 = torch.load(model_location+'KoELECTRA_113.pt')
model_Roberta_328 = torch.load(model_location+'model_roberta_328.pth')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizerFast'.


In [203]:
# 데이터 입력
data_location = "/content/drive/My Drive/3조/DS_folder/jupyter notebook/Data/"

song_row = pd.read_csv(data_location + "song/songChart.csv", index_col=0)
song = song_row[['lyrics']]
song.rename({'lyrics':'data'}, axis=1, inplace=True)

novel = pd.read_csv(data_location + "novel/novel_total_final.csv")

story = novel[['story']]
story.rename({'story':'data'}, axis=1, inplace=True)

all = novel[['story', 'review', 'piece']]
all.fillna('', inplace=True)
all['data'] = all['story'] + all['review'] + all['piece']
all.drop(['story', 'review', 'piece'], axis=1, inplace=True)

color = pd.read_csv(data_location + "color/colors.csv", index_col=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


가사 : lyrics

In [103]:
albert_lyrics = model_use(song, model_albert, bert_or_albert='albert')
bert_lyrics = model_use(song, model_bert, bert_or_albert = 'bert')
roberta_329_lyrics = model_test(song, model_Roberta,'roberta')
electra_lyrics = model_test(song, model_KoELECTRA_113,'electra')
roberta_328_lyrics = model_test(song, model_Roberta_328,'roberta')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizerFast'.


  0%|          | 0/1029 [00:00<?, ?it/s]

  0%|          | 0/1029 [00:00<?, ?it/s]

  0%|          | 0/1029 [00:00<?, ?it/s]



  0%|          | 0/1029 [00:00<?, ?it/s]



  0%|          | 0/1029 [00:00<?, ?it/s]



In [138]:
total_lyrics = 0.1 * albert_lyrics + 0.6 * bert_lyrics + 0.7 * roberta_329_lyrics + 0.1 * electra_lyrics + 0.3 * roberta_328_lyrics
total_lyrics

tensor([[-1.8690, -1.4631,  5.5460, -0.5388,  1.1711, -1.6434],
        [-2.6258, -1.2032,  3.3391,  0.0059,  1.3008, -0.3640],
        [ 2.4701, -2.3718,  1.6311, -0.5484,  0.6246, -2.1644],
        ...,
        [-2.0750, -1.4682,  3.4152,  0.1170,  1.6216, -1.4194],
        [-0.0260, -1.0443,  2.1726, -0.0969,  0.0769, -1.2987],
        [ 1.5971, -1.6248,  3.0572, -0.2630, -0.1827, -2.9261]])

책소개 : story

In [105]:
albert_story = model_use(story, model_albert, bert_or_albert='albert')
bert_story = model_use(story, model_bert, bert_or_albert = 'bert')
roberta_329_story = model_test(story, model_Roberta,'roberta')
electra_story = model_test(story, model_KoELECTRA_113,'electra')
roberta_328_story = model_test(story, model_Roberta_328,'roberta')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizerFast'.


  0%|          | 0/1118 [00:00<?, ?it/s]

  0%|          | 0/1118 [00:00<?, ?it/s]

  0%|          | 0/1118 [00:00<?, ?it/s]



  0%|          | 0/1118 [00:00<?, ?it/s]



  0%|          | 0/1118 [00:00<?, ?it/s]



In [139]:
total_story = 0.1 * albert_story + 0.6 * bert_story + 0.7 * roberta_329_story + 0.1 * electra_story + 0.3 * roberta_328_story
total_story

tensor([[ 1.2052, -2.9530,  0.7607, -1.8115,  0.0334,  2.2351],
        [ 1.1992, -2.9403,  0.7695, -1.8336,  0.0199,  2.2435],
        [ 1.7748, -2.2971,  1.0467, -1.5268,  1.2262, -1.1644],
        ...,
        [ 3.9567, -2.6445,  1.1177, -2.0727, -0.1051, -1.3132],
        [-2.8161, -2.3280,  2.8055, -1.0780,  2.6980,  0.5799],
        [ 1.5693, -1.2741,  0.3965, -0.9171, -0.2661, -0.2053]])

책소개 + 서평 + 책내용일부 : all

In [107]:
albert_all = model_use(all, model_albert, bert_or_albert='albert')
bert_all = model_use(all, model_bert, bert_or_albert = 'bert')
roberta_329_all = model_test(all, model_Roberta,'roberta')
electra_all = model_test(all, model_KoELECTRA_113,'electra')
roberta_328_all = model_test(all, model_Roberta_328,'roberta')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizerFast'.


  0%|          | 0/1118 [00:00<?, ?it/s]

  0%|          | 0/1118 [00:00<?, ?it/s]

  0%|          | 0/1118 [00:00<?, ?it/s]



  0%|          | 0/1118 [00:00<?, ?it/s]



  0%|          | 0/1118 [00:00<?, ?it/s]



In [140]:
total_all = 0.1 * albert_all + 0.6 * bert_all + 0.7 * roberta_329_all + 0.1 * electra_all + 0.3 * roberta_328_all
total_all

tensor([[ 1.2052, -2.9530,  0.7607, -1.8115,  0.0334,  2.2351],
        [ 1.1992, -2.9403,  0.7695, -1.8336,  0.0199,  2.2435],
        [ 1.7748, -2.2971,  1.0467, -1.5268,  1.2262, -1.1644],
        ...,
        [ 3.9567, -2.6445,  1.1177, -2.0727, -0.1051, -1.3132],
        [-2.8161, -2.3280,  2.8055, -1.0780,  2.6980,  0.5799],
        [ 1.5693, -1.2741,  0.3965, -0.9171, -0.2661, -0.2053]])

감성배색 : color

In [112]:
albert_color = model_use(color, model_albert, bert_or_albert='albert')
bert_color = model_use(color, model_bert, bert_or_albert = 'bert')
roberta_329_color = model_test(color, model_Roberta,'roberta')
electra_color = model_test(color, model_KoELECTRA_113,'electra')
roberta_328_color = model_test(color, model_Roberta_328,'roberta')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizerFast'.


  0%|          | 0/1529 [00:00<?, ?it/s]

  0%|          | 0/1529 [00:00<?, ?it/s]

  0%|          | 0/1529 [00:00<?, ?it/s]



  0%|          | 0/1529 [00:00<?, ?it/s]



  0%|          | 0/1529 [00:00<?, ?it/s]



In [141]:
total_color = 0.1 * albert_color + 0.6 * bert_color + 0.7 * roberta_329_color + 0.1 * electra_color + 0.3 * roberta_328_color
total_color

tensor([[ 2.0933, -1.5250, -0.4180, -0.9976, -0.5964,  0.4777],
        [ 2.2190, -1.4676, -0.4928, -1.0759, -0.6416,  0.4623],
        [ 2.2323, -1.5186, -0.4799, -1.1084, -0.6011,  0.4868],
        ...,
        [-0.6459,  0.9303, -0.1608,  0.4487, -0.2980, -0.3490],
        [-0.6165,  0.9137, -0.1689,  0.4453, -0.2954, -0.3533],
        [-0.6112,  0.9243, -0.1744,  0.4291, -0.3189, -0.3529]])

---

In [215]:
lyrics_ls = []
for lyrics in total_lyrics:
  lyrics_ls.append(torch.Tensor(lyrics))
song['vector'] = lyrics_ls

story_ls = []
for s in total_story:
  story_ls.append(torch.Tensor(s))
story['vector'] = story_ls

all_ls = []
for a in total_all:
  all_ls.append(torch.Tensor(a))
all['vector'] = all_ls

color_ls = []
for c in total_color:
  color_ls.append(torch.Tensor(c))
color['vector'] = color_ls

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [204]:
song.reset_index(inplace=True)
story.reset_index(inplace=True)
all.reset_index(inplace=True)
color.reset_index(inplace=True)

In [208]:
song.rename({'index':'id'}, axis=1, inplace=True)
story.rename({'index':'id'}, axis=1, inplace=True)
all.rename({'index':'id'}, axis=1, inplace=True)
color.rename({'index':'id'}, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [222]:
# cosSimilarity.py

from tqdm import tqdm
import operator

def cos_similarity(v1, v2):
  dot_product = np.dot(v1, v2)
  norm = (np.sqrt(sum(np.square(v1))) * np.sqrt(sum(np.square(v2))))
  similarity = dot_product / norm

  return similarity

def novel_to_sing(novel_df, song_df):
  v1 = novel_df['vector']
  v2 = song_df['vector']

  story_to_song = pd.DataFrame(columns=['novel', 'song'])

  for n in tqdm(range(len(v1))):

    cos_sim_dict = {}
    for i in range(len(v2)):
      cos_sim_dict[i] = cos_similarity(v1[n], v2[i])
    cos_dict = sorted(cos_sim_dict.items(), key=operator.itemgetter(1), reverse=True)

    cos_ls_20 = [idx for idx, tensor in cos_dict[:20]]
    for idx, cos in enumerate(cos_ls_20):
      story_to_song = story_to_song.append({'novel':novel_df['id'][n],'song':song_df['id'][cos]}, ignore_index=True)

  return story_to_song

def novel_to_color(novel_df, color_df):
  v1 = novel_df['vector']
  v2 = color_df['vector']

  story_to_color = pd.DataFrame(columns=['novel', 'color'])

  for n in tqdm(range(len(v1))):

    cos_sim_dict = {}
    for i in range(len(v2)):
      cos_sim_dict[i] = cos_similarity(v1[n], v2[i])
    cos_dict = sorted(cos_sim_dict.items(), key=operator.itemgetter(1), reverse=True)

    cos_ls_20 = [idx for idx, tensor in cos_dict[:10]]
    for idx, cos in enumerate(cos_ls_20):
      story_to_color = story_to_color.append({'novel':novel_df['id'][n],'color':color_df['id'][cos]}, ignore_index=True)

  return story_to_color

In [None]:
story_to_song = novel_to_sing(story, song)
story_to_color = novel_to_color(story, color)

 52%|█████▏    | 578/1118 [01:45<01:39,  5.44it/s]

In [219]:
story_to_song

Unnamed: 0,novel,song
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
22355,1117,15
22356,1117,16
22357,1117,17
22358,1117,18


In [220]:
story_to_color

Unnamed: 0,novel,color
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
22355,1117,15
22356,1117,16
22357,1117,17
22358,1117,18
