In [None]:
from typing import Union, Tuple, List
from transformers import BertConfig, BertForPreTraining, BertTokenizerFast
import numpy as np
import random
import pandas as pd
from datetime import datetime, date
from tqdm.notebook import tqdm
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score
from tqdm import tqdm
# from IPython.display import Image
from joblib import Parallel, delayed
import torch
import torch.nn as nn
from torch.nn.init import normal_
from torch.utils.data import TensorDataset, DataLoader
import torch
from transformers import BertTokenizer, BertModel
import re
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)
from torch.utils.data.dataset import Dataset
import matplotlib.pyplot as plt
from transformers import DataCollatorForLanguageModeling
import os
from transformers import Trainer, TrainingArguments
from transformers.utils import logging
logger = logging.get_logger(__name__)
from filelock import FileLock
import time
import pickle

In [None]:
# Load the raw review table and keep only the columns used by the later cells.
review_columns = ['user_url', 'rating', 'text', 'wine_url']
review_df = pd.read_csv(
    '/opt/ml/wine/data/review_df_total.csv',
    encoding='utf-8-sig',
)[review_columns]

In [None]:
import json

# Read the lookup table that the later `wine_url -> wine_id` mapping relies on.
with open('/opt/ml/wine/code/data/feature_map/item2idx.json', 'r') as f:
    item2idx = json.loads(f.read())

In [None]:
# Drop reviews that have no text.
# `.copy()` detaches the filtered result from the original frame so that the
# column assignments performed on `review_df` in later cells write to an
# independent DataFrame instead of a slice (avoids SettingWithCopyWarning).
review_df = review_df[review_df['text'].notna()].copy()

In [None]:
def _ensure_trailing_period(review):
    """Return `review` unchanged if its last character is '.', else append '.'."""
    if review[-1] == '.':
        return review
    return review + '.'


# Make every review text end with a period.
review_df['text'] = review_df['text'].apply(_ensure_trailing_period)

In [None]:
def keep_english_and_digits(text):
    """Strip every character that is not an ASCII letter, digit, whitespace, '.' or ','.

    Periods and commas are kept wherever they occur in the string, not only at
    the end of sentences.

    Args:
        text: The string to clean.

    Returns:
        A copy of `text` with all disallowed characters removed.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s.,]')
    return disallowed.sub('', text)

In [None]:
def merge_text(data):
    """Concatenate an iterable of strings into one string, one item per line.

    Args:
        data: Iterable of strings.

    Returns:
        The items joined with a newline between consecutive items (no trailing
        newline); an empty string when `data` is empty.
    """
    line_break = '\n'
    return line_break.join(data)


In [None]:
# Run the character filter over every review and store the cleaned text back.
cleaned_text = review_df['text'].apply(keep_english_and_digits)
review_df['text'] = cleaned_text

In [None]:
# Translate each wine URL through `item2idx`; URLs missing from the mapping
# become NaN and those rows are dropped.
review_df['wine_id'] = review_df['wine_url'].map(item2idx)
# `.copy()` detaches the filtered frame so the dtype conversion below assigns
# into an independent DataFrame rather than a slice (avoids
# SettingWithCopyWarning).
review_df = review_df[review_df['wine_id'].notna()].copy()
# Cast to int first (the mapped column may hold floats because of the NaNs),
# then store as a categorical.
review_df['wine_id'] = review_df['wine_id'].astype('int').astype('category')

In [None]:
# Load the wine table from disk.
wine_csv_path = '/opt/ml/wine/data/wine_df.csv'
wine_df = pd.read_csv(wine_csv_path)

In [None]:
# Keep only the columns whose name contains '_child'.
child_columns = [name for name in wine_df.columns if '_child' in name]
note_df = wine_df[child_columns]

In [None]:
notes = {}
import ast
def str2dict(x):
    """Parse the string form of a dict literal, falling back to an empty dict.

    Args:
        x: Usually a string such as "{'cherry': 3}". A value that is already a
            dict is returned as-is, so re-running the parsing cell does not
            discard previously parsed data.

    Returns:
        The parsed dict, or `{}` when `x` cannot be parsed (for example NaN or
        malformed text) or parses to something that is not a dict.
    """
    if isinstance(x, dict):
        return x
    try:
        parsed = ast.literal_eval(x)
    # These are the failure modes of `ast.literal_eval`; a bare `except:` would
    # also swallow KeyboardInterrupt / SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return {}
    # Callers use the result as a mapping (`.keys()`), so reject other literals.
    return parsed if isinstance(parsed, dict) else {}
def get_keys(x):
    """Return the keys of the mapping `x` as a new set."""
    key_set = set()
    key_set.update(x.keys())
    return key_set

# For every '*_child' column: parse each cell into a dict, then store the union
# of all dict keys seen in that column under the column name with '_child'
# removed.
for col in note_df.columns:
    note_df.loc[:,col] = note_df.loc[:,col].apply(str2dict)
    sub_note = set()
    # Iterate the column's values directly instead of `note_df[col][i]`: that
    # form is a chained, label-based lookup driven by a positional counter, so
    # it is slow per row and only correct when the index is exactly 0..n-1.
    for parsed in tqdm(note_df[col]):
        # In-place union avoids rebuilding the accumulated set on every row.
        sub_note |= get_keys(parsed)
    notes[col.replace('_child','')] = sub_note

In [39]:
# Start the annotated-review frame from the reviews without the rating and
# wine_id columns.
unused_columns = ['rating', 'wine_id']
text_with_note = review_df.drop(columns=unused_columns)

In [None]:
def check_note_in_review(text, notes_data):
    """Flag which note categories are mentioned in a review.

    Matching is a case-insensitive substring test: a category is flagged when
    any of its keywords occurs anywhere in the text (so 'oak' also matches
    inside 'soak').

    Args:
        text: The review text.
        notes_data: Mapping of category name -> iterable of keyword strings.

    Returns:
        A list of 0/1 ints, one per category, in the iteration order of
        `notes_data`.
    """
    text = text.lower()
    return [
        # Lower-case the keyword too: the text has been lower-cased, so a
        # keyword containing any capital letter could otherwise never match.
        int(any(word.lower() in text for word in keywords))
        # Use the `notes_data` argument, not the notebook-level `notes` global.
        for keywords in notes_data.values()
    ]

def marking_data(df, notes_data):
    """Append one 0/1 indicator column per note category to a frame of reviews.

    Args:
        df: DataFrame with a 'text' column holding the review strings. It is
            not modified.
        notes_data: Mapping of category name -> iterable of keywords, passed
            through to `check_note_in_review`.

    Returns:
        A new DataFrame made of `df` with its index reset (the previous index
        is kept as a column, which is the `reset_index()` default) followed by
        one indicator column per key of `notes_data`.
    """
    # Re-index a new frame rather than using `reset_index(inplace=True)`, which
    # would alter the caller's frame and insert another index column on each
    # repeated call.
    df = df.reset_index()
    flags = [
        check_note_in_review(text, notes_data)
        for text in tqdm(df['text'], total=len(df))
    ]

    # Share `df`'s index so the column-wise concat lines the rows up one-to-one.
    note_df = pd.DataFrame(flags, columns=list(notes_data.keys()), index=df.index)
    merged_df = pd.concat([df, note_df], axis=1)

    # The original ended in a bare `return`, discarding the merged frame.
    return merged_df


def parallel_dataframe_2input(func, df, notes_data, num_cpu):
    """Apply `func(chunk, notes_data)` to row-chunks of `df` in parallel.

    Args:
        func: Callable taking `(chunk, notes_data)`.
        df: DataFrame to split into `num_cpu` contiguous row chunks.
        notes_data: Second argument forwarded unchanged to every `func` call.
        num_cpu: Number of chunks and of worker processes.

    Returns:
        If every worker returns a DataFrame or Series, the results stacked in
        chunk order with a fresh 0..n-1 index. Otherwise the results are
        combined in chunk order with `+=` (e.g. list concatenation).
    """
    # Split row positions and slice with `iloc` instead of handing the
    # DataFrame itself to `np.array_split`; chunk sizes are identical, and this
    # sidesteps the deprecation warnings that direct splitting is reported to
    # raise on recent pandas releases.
    row_groups = np.array_split(np.arange(len(df)), num_cpu)
    chunks = [df.iloc[rows] for rows in row_groups]

    print('Parallelizing with ' +str(num_cpu)+'cores')
    with Parallel(n_jobs = num_cpu, backend="multiprocessing") as parallel:
        results = parallel(delayed(func)(chunk, notes_data) for chunk in chunks)

    # `+=` on DataFrames is element-wise addition, not row concatenation, so
    # pandas results have to be stacked explicitly.
    if results and all(isinstance(part, (pd.DataFrame, pd.Series)) for part in results):
        return pd.concat(results, ignore_index=True)

    # Non-pandas results keep the original accumulate-with-`+=` behaviour.
    output = results[0]
    for part in results[1:]:
        output += part

    return output

In [42]:
# Display the annotated review frame (the notebook renders a truncated head/tail view).
text_with_note

Unnamed: 0,text,Red_Fruit,Tropical,Tree_Fruit,Oaky,Ageing,Black_Fruit,Citrus,Dried_Fruit,Earthy,Floral,Microbio,Spices,Vegetal
0,"Loud nose, smoke tar and tobacco. High tannin...",0,0,0,0,0,0,0,0,0,0,0,0,0
1,"LastBottle roomfilling kirsch, and dried cherr...",0,0,0,0,0,0,0,0,0,0,0,0,0
2,"Pale ruby. Pronounced intensity. Raspberry, ch...",0,0,0,0,0,0,0,0,0,0,0,0,0
3,Another single vineyard Barolo from Manzone. 3...,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Bold Barolo with a softer middle and a dry fin...,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7649952,Better second day. Fruit is a bit more muted a...,0,0,0,0,0,0,0,0,0,0,0,0,0
7649953,"A bite on the back end. Black cherry, licorice...",0,0,0,0,0,0,0,0,0,0,0,0,0
7649954,"Solid merlot with smooth tannins, smoke, leath...",0,0,0,0,0,0,0,0,0,0,0,0,0
7649955,American oak prominent. Good acidity. Warm and...,0,0,0,0,0,0,0,0,0,0,0,0,0
