In [None]:
import numpy as np
import pandas as pd
from collections import Counter
import re
from tqdm import tqdm


import matplotlib.pyplot as plt
import matplotlib.image as img
from matplotlib.image import imread
import matplotlib.pyplot as plt

import seaborn as sns 
import os

In [None]:
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Load the raw Art500k table; the path is relative to the current working directory.
df = pd.read_csv('datas/Art500k.csv')

## 0. EDA

In [None]:
# Number of rows in the raw table.
print('Painting total :',len(df))

In [None]:
# Count the distinct style labels.
# Work on an explicit copy so the column assignments below never write into
# a slice of `df` (avoids SettingWithCopyWarning).
df_style_temp = df[['Style']].copy()
df_style_temp['Style'] = df_style_temp['Style'].fillna('')
# A row may hold several comma-separated styles: one row per label after explode.
df_style_temp['new_style'] = df_style_temp['Style'].str.split(',')
df_style_temp = df_style_temp.explode('new_style')

# The '' placeholder introduced for missing values is not a style, so it is
# excluded from the distinct count.
style_labels = df_style_temp['new_style']
print('Style period total :', style_labels[style_labels != ''].nunique())

In [None]:
# Count the distinct artist names.
# Work on an explicit copy so the column assignments below never write into
# a slice of `df` (avoids SettingWithCopyWarning).
df_artist_temp = df[['author_name']].copy()
df_artist_temp['author_name'] = df_artist_temp['author_name'].fillna('')
# A row may hold several comma-separated names: one row per name after explode.
df_artist_temp['new_author_name'] = df_artist_temp['author_name'].str.split(',')
df_artist_temp = df_artist_temp.explode('new_author_name')

# The '' placeholder introduced for missing values is not an artist, so it is
# excluded from the distinct count.
artist_labels = df_artist_temp['new_author_name']
print('Artist total :', artist_labels[artist_labels != ''].nunique())

In [None]:
# Count the distinct nationality labels.
# Work on an explicit copy so the column assignments below never write into
# a slice of `df` (avoids SettingWithCopyWarning).
df_nationality_temp = df[['Nationality']].copy()
df_nationality_temp['Nationality'] = df_nationality_temp['Nationality'].fillna('')
# A row may hold several comma-separated nationalities: one row per label after explode.
df_nationality_temp['new_nationality'] = df_nationality_temp['Nationality'].str.split(',')
df_nationality_temp = df_nationality_temp.explode('new_nationality')

# The '' placeholder introduced for missing values is not a nationality, so it
# is excluded from the distinct count.
nationality_labels = df_nationality_temp['new_nationality']
print('Nationality total :', nationality_labels[nationality_labels != ''].nunique())

In [None]:
# Share (in percent) of rows whose Date text contains any of the marker
# characters below; rows with a missing Date count as "no marker".
date_markers = ['c','C','s','S','-']
has_marker = df['Date'].str.contains('|'.join(date_markers), na=False)
counting = len(df[has_marker]) / len(df) * 100
print(f'Date estimate : {counting}%')

## 1. Path preprocessing

In [None]:
# Independent working copy for the path-filtering stage; `df` stays untouched.
df_path = df.copy(deep=True)

In [None]:
# Collect the index labels of rows whose image file is not present on disk.
image_root = "/home/jinny/datas/art500k/"
noFile = list()
for i, rel_path in tqdm(zip(df_path.index, df_path['Path']), total=len(df_path)):
    # A missing / non-string Path can never point at a file: record the row as
    # "no file" instead of failing on the string concatenation.
    if not isinstance(rel_path, str) or not os.path.isfile(image_root + rel_path):
        noFile.append(i)
print(f'No File : {len(noFile)}')

In [None]:
# Remove the rows without an image file. Rebinding (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: labels that
# were already dropped no longer raise KeyError.
df_path = df_path.drop(index=noFile, errors='ignore')

In [None]:
# Rows remaining after the path stage.
print(f'path preprocessing : {len(df_path)}')

## 2. Date preprocessing

In [None]:
# Working copy for the date stage, re-indexed 0..n-1 (old index discarded).
df_date = df_path.copy()
df_date = df_date.reset_index(drop=True)

#### extract year

In [None]:
# Pull a trailing 4-digit year starting with "1" out of two text columns:
#   new_date1 <- end of the painting title (must be preceded by a space)
#   new_date2 <- end of the Date field
# Rows without a match get the placeholder '0'.
year_sources = {
    'new_date1': ('painting_name', r' (1\d\d\d)$'),
    'new_date2': ('Date', r'(1\d\d\d)$'),
}
for target_col, (source_col, year_pattern) in year_sources.items():
    df_date[target_col] = df_date[source_col].str.extract(year_pattern)
    df_date[target_col] = df_date[target_col].fillna('0')

In [None]:
# Choose one year per row: the year from the Date field wins, the year from
# the title is the fallback, and '' marks rows where neither was found
# ('0' is the "not found" placeholder in both columns).
new_date = [
    from_date if from_date != '0' else (from_title if from_title != '0' else '')
    for from_title, from_date in df_date[['new_date1','new_date2']].values
]

In [None]:
# Attach the selected year and keep only the rows that have one.
df_date['new_date'] = new_date
df_date = df_date[df_date['new_date']!='']
# Non-inplace drop: mutating the filtered slice in place raises
# SettingWithCopyWarning and leaves the frame flagged as a copy for every
# later column assignment.
df_date = df_date.drop(columns=['new_date1','new_date2'])

#### using decade

In [None]:
# Keep the exact year in `new_date_detail` before `new_date` is rounded down to its decade.
df_date = df_date.assign(new_date_detail=df_date['new_date'])

In [None]:
def change_last_character(value):
    """Return ``value`` with its final character replaced by ``'0'``.

    Applied to 4-digit year strings this floors the year to its decade
    (``'1887'`` -> ``'1880'``). An empty value is returned unchanged.
    """
    if len(value) == 0:
        return value
    return value[:-1] + '0'

# Round every year string down to the start of its decade.
df_date['new_date'] = df_date['new_date'].map(change_last_character)

In [None]:
# Both year columns are still strings at this point; cast them to integers in one pass.
df_date = df_date.astype({'new_date': 'int', 'new_date_detail': 'int'})

#### using 1300 - 2000

In [None]:
# Keep rows whose decade and exact year are both 1300 or later.
from_1300 = (df_date.new_date >= 1300) & (df_date.new_date_detail >= 1300)
df_date = df_date[from_1300]

In [None]:
# Rows remaining after the date stage.
print(f'date preprocessing : {len(df_date)}')

## 3. Style preprocessing

In [None]:
# Independent working copy for the style stage.
df_style = df_date.copy(deep=True)

In [None]:
# Style keywords: a row is dropped when its Style text contains any of them.
# Each keyword appears once ('Ink and wash painting' used to be listed twice,
# which also leaked into the exported keyword table).
drop_style = [
    'Byzantine',
    'Gothic',
    'Gongbi',
    'Ink and wash painting',
    'Korean Art',
    'Joseon Dynasty',
    'Japanese',
    'Shin-hanga',
    'Sōsaku hanga',
    'Ukiyo-e',
    'Yamato-e',
    'Kanō school style',
    'Nanga (Bunjinga)',
    'Nihonga',
    'Zen',
    'Ero guro',
    'Islamic Art',
    'Abbasid Period',
    'Ilkhanid',
    'Nas-Taliq',
    'Timurid Period',
    'Mughal',
    'Ottoman Period',
    'Safavid Period',
    'Native Art',
    'Yoruba',
    'Folk art',
    'Pre-Columbian Art',
    'Post-classic',
    'Dictionaries',
    'New Ink Painting',
    'Documentary photography',
    'Street Photography',
    'Sumi-e',
    'Safavid',
    'Japonism',
    'Indian',
]

In [None]:
# Drop every row whose Style contains one of the excluded keywords.
# Keywords are escaped before being joined into a regex alternation: without
# it, 'Nanga (Bunjinga)' is parsed as a capture group and matches
# "Nanga Bunjinga" instead of the literal label, so those rows were never removed.
style_pattern = '|'.join(re.escape(keyword) for keyword in drop_style)
df_style = df_style[~df_style['Style'].str.contains(style_pattern, na=False)]

In [None]:
# Unify the separators used between style labels ('/' and ';' become ',') and
# turn non-breaking spaces into plain spaces.
# regex=False makes the literal replacement explicit (the default of `regex`
# differs between pandas versions); assign() returns a new frame instead of
# writing into the filtered slice.
df_style = df_style.assign(
    Style=df_style['Style']
    .str.replace('/', ',', regex=False)
    .str.replace(';', ',', regex=False)
    .str.replace('\xa0', ' ', regex=False)
)

In [None]:
# Rows remaining after the style stage.
print(f'style preprocessing : {len(df_style)}')

## 4. Field Preprocessing

In [None]:
# Working copy for the field stage; Field is lower-cased so the lowercase
# keyword lists below match regardless of the original capitalisation.
df_field = df_style.copy()
df_field['Field'] = df_field['Field'].str.lower()

In [None]:
# Field keywords: a row is kept only when its Field text contains at least one of them.
using_field = [
    "painting",
    "oil",
    "watercolor",
    "tempera",
    "drawing",
    "sketch",
    "canvas",
    "etching",
    "pastel",
    "gouache",
    "printmaking",
    "lithograph",
    "ink",
    "charcoal",
    "paper",
]

In [None]:
# Field keywords: a row is dropped when its Field text contains any of them.
# Each keyword appears once ('oriental' used to be listed twice, which also
# leaked into the exported keyword table).
drop_field = [
    'porcelain',
    'interior',
    'japanese',
    'oriental',
    'chinese',
    'fresco',
    'architecture',
    'installation',
    'ceiling',
    'sculpture',
    'digital',
    'photo',
    'wall',
    'calligraphy',
    'furniture',
    'ukiyo',
    'woodcut',
    'masonite',
    'silk',
    'glass',
]

In [None]:
# Keep a row only if its Field mentions no excluded keyword and at least one
# wanted keyword; rows with a missing Field fail the second test and are dropped.
field_text = df_field['Field']
mentions_excluded = field_text.str.contains('|'.join(drop_field), na=False)
mentions_wanted = field_text.str.contains('|'.join(using_field), na=False)
df_field = df_field[~mentions_excluded & mentions_wanted]

In [None]:
# Rows remaining after the field stage.
print(f'field preprocessing : {len(df_field)}')

## 5. Genre preprocessing

In [None]:
# Independent working copy for the genre stage.
df_genre = df_field.copy(deep=True)

In [None]:
# Genre keywords: a row is dropped when its Genre text contains any of them.
drop_genre = [
    'bijinga', 'graffiti', 'installation', 'sculpture',
    'tapestry', 'photo', 'advertisement', 'performance',
]

In [None]:
# Drop every row whose Genre contains one of the excluded keywords.
# The keywords are lowercase, and unlike Field the Genre column is never
# lower-cased, so the match is made case-insensitive; otherwise a capitalised
# value such as "Sculpture" would slip through the filter.
genre_pattern = '|'.join(drop_genre)
df_genre = df_genre[~df_genre['Genre'].str.contains(genre_pattern, case=False, na=False)]

In [None]:
# Rows remaining after the genre stage.
print(f'genre preprocessing : {len(df_genre)}')

## 5. Nationality preprocessing

In [None]:
# Independent working copy for the nationality stage.
df_nationality = df_genre.copy(deep=True)

In [None]:
# Nationality keywords: a row is dropped when its Nationality text contains any of them.
drop_nationality = [
    "Indian", "Japanese", "Chinese", "South Korean",
    "Iranian", "Emirati", "Indonesian", "Iraqi",
    "Syrian", "Vietnamese", "Filipino", "Lebanese",
    "Qatari", "Ethiopian", "Sudanese", "Egyptian",
    "Libyan", "Cameroonian", "Nigerian", "Azerbaijani",
    "Georgian", "Armenian",
]

In [None]:
# Drop every row whose Nationality contains one of the excluded keywords;
# rows with a missing Nationality are kept.
nationality_pattern = '|'.join(drop_nationality)
is_excluded = df_nationality['Nationality'].str.contains(nationality_pattern, na=False)
df_nationality = df_nationality[~is_excluded]

In [None]:
# Rows remaining after the nationality stage.
print(f'nationality preprocessing : {len(df_nationality)}')

## 6. Author preprocessing

In [None]:
# Independent working copy for the author stage.
df_author = df_nationality.copy(deep=True)

#### name preprocessing

In [None]:
# Lower-case the author names so the de-duplication below ignores capitalisation.
df_author['author_name'] = df_author['author_name'].str.lower()

In [None]:
# One row per (painting title, author) pair; the first occurrence is kept.
dedupe_keys = ['painting_name', 'author_name']
df_author = df_author.drop_duplicates(subset=dedupe_keys, keep='first')

In [None]:
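# Normalize author names: tokenize, strip diacritical marks from each token, re-join (stopwords is imported but not used in this cell)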
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import unicodedata

def normalize_text(input_text):
    """Strip diacritical marks from ``input_text``.

    The text is decomposed with Unicode NFD, which splits an accented
    character into its base character plus combining marks; every character
    in category ``Mn`` (non-spacing mark) is then dropped. Characters that
    have no such decomposition are returned unchanged, so the result is not
    guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', input_text)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

# Build the normalized author names: tokenize each name, strip diacritics
# from every token and re-join the tokens with single spaces.
temp = list()
for i in tqdm(df_author['author_name']) :
    # A missing author name is a float NaN, which word_tokenize cannot handle;
    # pass such values through unchanged instead of crashing the whole loop.
    if not isinstance(i, str):
        temp.append(i)
        continue
    temp.append(' '.join(normalize_text(word) for word in word_tokenize(i)))

In [None]:
# Replace the author column with the normalized names (same row order as the loop above).
df_author = df_author.assign(author_name=temp)

In [None]:
# Rows remaining after the author stage.
print(f'author preprocessing : {len(df_author)}')

## 7. Shape preprocessing

In [None]:
import cv2

#### shape check

In [None]:
%%time
# Read every image once and record [height, width, relative path].
# Images OpenCV cannot decode are recorded as 0 x 0 and listed in img_shape_no.
img_shape = []
img_shape_no = []
for i in tqdm(df_author.Path):
    # Named `image`, not `img`, so the `matplotlib.image as img` module alias
    # imported at the top of the notebook is not overwritten.
    image = cv2.imread('/home/jinny/datas/art500k/'+i)
    if image is None:
        img_shape.append([0,0,i])
        img_shape_no.append(i)
        continue
    # Only height and width are needed; the channel count is ignored.
    h, w = image.shape[:2]
    img_shape.append([h,w,i])

In [None]:
# Attach image height/width to the metadata and derive aspect ratio and pixel count.
# One shape row per Path: if a Path occurs more than once in df_author, the
# unreduced table would make the left merge multiply those rows.
shape_table = pd.DataFrame(img_shape, columns=['h','w','Path']).drop_duplicates(subset='Path')
df_shape = pd.merge(df_author, shape_table, on='Path', how='left')
# Unreadable images were recorded as 0 x 0; drop them. The copy keeps the
# column assignments below from writing into a slice.
df_shape = df_shape[(df_shape['h'] != 0) & (df_shape['w'] != 0)].copy()
# r = long side / short side, so it is always >= 1.
df_shape['r'] = df_shape[['h','w']].max(axis=1) / df_shape[['h','w']].min(axis=1)
df_shape['size'] = df_shape['h']*df_shape['w']

#### shape preprocessing

In [None]:
# Share (in percent) of images hit by each shape criterion.
is_elongated = df_shape['r'] >= 2
is_small = df_shape['size'] <= 410*410
ratio2 = int(is_elongated.sum())
under512 = int(is_small.sum())
total_images = len(df_shape)
print('ratio 2 over :', ratio2/total_images*100)
print('size 512x512 under :', under512/total_images*100)  # 410 is roughly 80% of 512

In [None]:
# Number of images removed by the shape filter. Counted as the union of both
# criteria: summing the two separate counts would count an image that is both
# too elongated and too small twice.
removed_by_shape = (df_shape['r'] >= 2) | (df_shape['size'] <= 410*410)
print(f'shape preprocessed data : {int(removed_by_shape.sum())}')

In [None]:
# Keep images with aspect ratio below 2 and more than 410*410 pixels, then
# renumber the rows 0..n-1.
passes_shape = (df_shape['r'] < 2) & (df_shape['size'] > 410*410)
df_shape = df_shape[passes_shape].reset_index(drop=True)

In [None]:
# Rows remaining after the shape stage.
print(f'shape preprocessing : {len(df_shape)}')

#### resize test

In [None]:
import cv2

def resize_cv2(image, target_size=(512, 512)):
    """Return ``image`` resized to ``target_size`` using Lanczos interpolation.

    ``target_size`` is handed to ``cv2.resize`` as ``dsize``, which OpenCV
    interprets as ``(width, height)``.
    """
    # INTER_LANCZOS4: high-quality interpolation.
    resized = cv2.resize(image, dsize=target_size, interpolation=cv2.INTER_LANCZOS4)
    return resized

In [None]:
# Root directory of the image files (absolute, machine-specific path).
path = '/home/jinny/datas/art500k/'
# Alias of the filtered table (not a copy); used for the resize preview below.
sample = df_shape

In [None]:
# Preview the first image before and after resizing to 512 x 512.
image = cv2.imread(path+sample.iloc[0]['Path'])
if image is None:
    # cv2.imread signals an unreadable file by returning None instead of raising.
    raise FileNotFoundError(path+sample.iloc[0]['Path'])
# OpenCV loads images as BGR while matplotlib expects RGB. BGR2RGB performs
# the same channel swap the previous RGB2BGR constant did, but states the
# actual direction; the extra np.array() copy was not needed.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
plt.imshow(image)
plt.show()
resized_image = resize_cv2(image, (512, 512))
plt.imshow(resized_image)
plt.show()

## 8. Save

In [None]:
# Persist the fully filtered table; the row index is not written.
df_shape.to_csv('datas/file_info.csv',index=False)

In [None]:
# Gather every keyword list used by the filters above, keyed by stage
# ('field_d' = dropped field keywords, 'field_u' = required field keywords).
prep_keyword = {
    'style': drop_style,
    'field_d': drop_field,
    'field_u': using_field,
    'genre': drop_genre,
    'nationality': drop_nationality,
}

In [None]:
# Export the keyword lists, one row per list (shorter lists are padded with
# empty cells). The index holds the dict keys ('style', 'field_d', ...);
# writing it out keeps each row labelled - with index=False the file contained
# five anonymous rows.
pd.DataFrame.from_dict(prep_keyword, orient='index').to_csv('graph/supple/prep_keyword.csv', index=True, index_label='category')

-------------------------------------------------------------------------

## newstyle ( style sep )

#### style sep

In [None]:
# Reload the table written by the save step above.
df_newstyle = pd.read_csv('datas/file_info.csv')

In [None]:
# Split the comma-separated Style text into one row per style label.
df_newstyle['Style'] = df_newstyle['Style'].fillna('')
# Tokens are stripped and empty tokens discarded, so ' Rococo' and 'Rococo'
# become the same label and a stray comma ("A,,B", "A,") neither creates a
# phantom label nor dilutes the weights. A row without any label keeps the
# single placeholder '' (renamed to 'nostyle' below).
df_newstyle['new_style'] = df_newstyle['Style'].str.split(',').apply(
    lambda parts: [part.strip() for part in parts if part.strip()] or ['']
)
# Each row contributes a total weight of 1, shared equally by its labels.
df_newstyle['weight'] = 1 / df_newstyle['new_style'].apply(len)

df_newstyle = df_newstyle.explode('new_style')
df_newstyle.loc[df_newstyle['new_style']=='','new_style'] = 'nostyle'

#### style peak

In [None]:
# One row per distinct style label.
df_peak = df_newstyle.drop_duplicates(subset='new_style')[['new_style']]
# Number of paintings per (decade, style).
# NOTE: this rebinds `df`, which held the raw table at the top of the notebook.
df = df_newstyle.groupby(['new_date','new_style'])['painting_name'].count().reset_index()
ranked = df.sort_values(by='painting_name',ascending=False)
# peak_by_year: decades in which the style is the most frequent style of that decade.
top_style_per_decade = ranked.drop_duplicates(subset='new_date')[['new_date','new_style']]
# peak_by_style: the decade in which the style itself has its highest count.
# This used to be a second copy of the per-decade table, which made both peak
# columns identical and multiplied rows in the second merge.
peak_decade_per_style = ranked.drop_duplicates(subset='new_style')[['new_date','new_style']]
df_peak = pd.merge(df_peak, top_style_per_decade, how='left', on='new_style')
df_peak = pd.merge(df_peak, peak_decade_per_style, how='left', on='new_style')
# Both merged tables carry `new_date`, so pandas suffixes them _x (first) and _y (second).
df_peak = df_peak.rename(columns={'new_date_x':'peak_by_year','new_date_y':'peak_by_style'})
# A style that never tops a decade has no peak_by_year; it is stored as 0.
df_peak = df_peak.fillna(0)
df_peak['peak_by_year'] = df_peak['peak_by_year'].astype(int)
df_peak['peak_by_style'] = df_peak['peak_by_style'].astype(int)

In [None]:
# Persist the exploded style table and the per-style peak table; row indexes are not written.
df_newstyle.to_csv('datas/df_newstyle.csv',index=False)
df_peak.to_csv('datas/df_peak.csv',index=False)