In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

import matplotlib.pyplot as plt
import matplotlib.image as img
from matplotlib.image import imread
import matplotlib.font_manager as fm
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from matplotlib.patches import Rectangle

import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import unicodedata

import re

## Prompt Setting

## Build Word Counts

In [271]:
from sklearn.feature_extraction.text import CountVectorizer

In [272]:
# Load the prompt table and the file metadata, then attach each file's prompt
# to its date with a left join on 'Path' (files without a prompt get NaN).
# NOTE: `path` is not defined in this cell; it must already exist in the kernel.
prompts = pd.read_csv(path + 'prompts.csv')
file_info = pd.read_csv(path + 'file_info.csv')
file_info_prompts = file_info[['Path', 'new_date']].merge(
    prompts[['Path', 'new_prompts']],
    on='Path',
    how='left',
)

In [273]:
# Binary bag-of-words over every prompt dated 1500 or later: with binary=True
# each prompt contributes at most 1 per word, whatever the repetition count.
from_1500 = file_info_prompts['new_date'] >= 1500
prompt = file_info_prompts.loc[from_1500, 'new_prompts'].tolist()
all_countv = CountVectorizer(binary=True)
all_countv.fit(prompt)
# Dense (prompts x vocabulary) array.
all_countv_matrix = all_countv.transform(prompt).toarray()
# word -> column index of the fitted vocabulary
all_countv_voca = all_countv.vocabulary_

In [274]:
# Vocabulary as a table of (word, column index), ordered by column index.
df_voca = (
    pd.DataFrame(list(all_countv_voca.items()), columns=['word', 'index'])
    .sort_values('index')
    .reset_index(drop=True)
)

In [275]:
# Decades 1500, 1510, ..., 1990.
years = list(range(1500, 2000, 10))

# Number of prompts (dated 1500 or later) that contain each vocabulary word.
all_countv_freq = np.sum(all_countv_matrix, axis=0)
# column index -> word (inverse of the fitted vocabulary)
all_countv_voca_rev = {value: key for key, value in all_countv_voca.items()}

# Per-decade document frequency: one row per vocabulary word (in vocabulary
# index order), one column per decade.
#
# Every decade's prompts are part of the corpus `all_countv` was fitted on, so
# the already-fitted vectorizer can be reused instead of fitting a new one per
# decade and translating its vocabulary back. This yields the same counts,
# avoids the repeated fits, and no longer raises sklearn's "empty vocabulary"
# ValueError when a decade has no prompts.
counts_by_decade = np.zeros((len(all_countv_voca_rev), len(years)), dtype=np.int64)
for col, year in enumerate(tqdm(years)):
    decade_prompts = file_info_prompts['new_prompts'][file_info_prompts['new_date'] == year]
    if len(decade_prompts) == 0:
        # No prompt for this decade: its column stays all zeros.
        continue
    # Sum the sparse matrix directly; there is no need to densify it first.
    decade_counts = all_countv.transform(decade_prompts).sum(axis=0)
    counts_by_decade[:, col] = np.asarray(decade_counts).ravel()

df_count_by_year = pd.DataFrame(counts_by_decade, columns=years)

100%|███████████████████████████████████████████████████████████████████████████████████| 50/50 [00:04<00:00, 10.00it/s]


In [276]:
# Report the vocabulary size (one row of df_voca per word).
n_total_words = len(df_voca)
print(f'total words : {n_total_words}')

total words : 21575


In [277]:
# Write the vocabulary table and the per-decade counts to CSV without the row index.
for frame, filename in (
    (df_voca, 'words/vocas.csv'),
    (df_count_by_year, 'words/vocas_counting.csv'),
):
    frame.to_csv(path + filename, index=False)

## Build Word Tokens

#### Prompt preprocessing (remove style and artist keywords)

In [278]:
# Artist keywords: every whitespace-separated token of every line in the
# BLIP artist list.
with open(path + 'blip_artists.txt', "r") as tf:
    blip_artists = [
        # Lower-case the line and blank out '-', ',', '.', "'", '+' and
        # whitespace. Inside a character class the repeated ',' and the '+'
        # are plain literals, not separators or quantifiers.
        re.sub(r"[-,.,',\s+]", " ", line.lower())
        for line in tf.read().split("\n")
    ]

# `axcii_del` is defined outside this cell; presumably it strips non-ASCII
# characters before the name is split into tokens - TODO confirm.
artists = [token for name in blip_artists for token in axcii_del(name).split()]

In [279]:
# Distinct non-missing style labels, lower-cased, with each run of whitespace
# collapsed to a single space.
styles = [
    re.sub(r"\s+", " ", style.lower())
    for style in file_info['Style'].dropna().unique()
]

In [301]:
# Remove style names and artist-name tokens from every prompt.
#
# Both alternation patterns depend only on `styles` / `artists`, so they are
# built and compiled once here rather than on every loop iteration.
# dict.fromkeys drops duplicate alternatives while keeping first-seen order,
# which leaves the matching behaviour unchanged and the pattern smaller.
style_pattern = re.compile(
    r'\b(' + '|'.join(map(re.escape, dict.fromkeys(styles))) + r')\b'
)
artist_pattern = re.compile(
    r'\b(' + '|'.join(map(re.escape, dict.fromkeys(artists))) + r')\b'
)
whitespace_run = re.compile(r'\s+')

results = list()
for text in tqdm(file_info_prompts['new_prompts']):

    # style del
    result = style_pattern.sub('', text)
    result = whitespace_run.sub(' ', result).strip()

    # blip artist del
    result = artist_pattern.sub('', result)
    result = whitespace_run.sub(' ', result).strip()
    results.append(result)

# Cleaned prompts, aligned row-for-row with file_info_prompts.
file_info_prompts['new_prompt_tokenver'] = results

100%|████████████████████████████████████████████████████████████████████████████| 73814/73814 [10:48<00:00, 113.91it/s]


#### Compute TF-IDF and build tokens

In [302]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the cleaned prompts dated 1500 or later. Row k of tfidfv_matrix
# belongs to the k-th entry of `prompt`, i.e. the k-th row that passes the
# date filter, not the k-th row of file_info_prompts.
prompt = file_info_prompts.loc[
    file_info_prompts['new_date'] >= 1500, 'new_prompt_tokenver'
].tolist()
tfidfv = TfidfVectorizer(stop_words=None, min_df=1, max_df=1.0)
tfidfv.fit(prompt)
tfidfv_matrix = tfidfv.transform(prompt).toarray()  # dense (prompts x vocabulary)
tfidfv_voca = tfidfv.vocabulary_  # word -> column index
tfidfv_voca_rev = {column: word for word, column in tfidfv_voca.items()}

In [303]:
# Inverse document frequency of each vocabulary word, keyed by the word.
idf_values = tfidfv.idf_
idf_dict = {
    word: idf
    for word, idf in zip(tfidfv.get_feature_names_out(), idf_values)
}
# The dense TF-IDF matrix as a frame: default 0..n-1 row labels, one column
# per vocabulary index.
df_tfidfv_matrix = pd.DataFrame(data=tfidfv_matrix)

In [340]:
# ver02 - select century: assign each word to the century in which its summed
# TF-IDF weight is largest.

from collections import defaultdict

# Rows of df_tfidfv_matrix are the vectorised prompts, i.e. the rows of
# file_info_prompts with new_date >= 1500, in order. Their row numbers are
# therefore positions within that filtered subset, NOT index labels of
# file_info_prompts. Selecting by file_info_prompts labels picks the wrong
# documents as soon as the date filter drops a row, so select with a
# positional mask built from the filtered dates instead.
# (This filter must stay identical to the one used to build the TF-IDF corpus.)
fitted_dates = file_info_prompts.loc[
    file_info_prompts['new_date'] >= 1500, 'new_date'
].to_numpy()
if len(fitted_dates) != len(df_tfidfv_matrix):
    raise ValueError(
        'df_tfidfv_matrix has a different number of rows than the prompts '
        'with new_date >= 1500; the TF-IDF matrix and file_info_prompts are out of sync'
    )

tfidf100 = dict()
for year in [1500+i*100 for i in range(5)] :
    # The ten decades that make up this century.
    years = [year+j*10 for j in range(10)]
    in_century = np.isin(fitted_dates, years)
    # Per-word TF-IDF total over this century's prompts.
    tfidf100[year] = df_tfidfv_matrix[in_century].sum()

# Rows are vocabulary indices, columns are centuries; drop words whose total
# is zero in every century.
df_tfidf100 = pd.DataFrame(tfidf100)
df_tfidf100 = df_tfidf100[(df_tfidf100 != 0).any(axis=1)]

# century -> list of [vocabulary index, summed TF-IDF] for the words that peak there
tokens_temp = defaultdict(list)
for i in df_tfidf100.index :
    year = df_tfidf100.loc[i].idxmax()
    tfidf = df_tfidf100.loc[i][year]
    tokens_temp[year].append([i,tfidf])

In [429]:
# For each century from 1600 to 1900, keep the 100 words with the highest
# summed TF-IDF among the words assigned to that century.
tokens = dict()
for year in range(1600, 2000, 100):
    century_words = pd.DataFrame(tokens_temp[year], columns=['word_idx', 'tfidf'])
    century_words['word'] = [tfidfv_voca_rev[word_idx] for word_idx in century_words['word_idx']]
    top_words = century_words.sort_values('tfidf', ascending=False).head(100)
    tokens[year] = top_words['word'].values
    tokens[f'{year}_tfidf'] = top_words['tfidf'].values

In [430]:
# Save the unfiltered top-100 table: a word column and a TF-IDF column per century.
df_tokens_100 = pd.DataFrame(tokens)
df_tokens_100.to_csv(path + 'words/tokens_100.csv', index=False)

In [431]:
# Words to exclude from the per-century token lists. Despite the variable
# name, the entries are not only name-like words: generic terms such as
# 'abstract' / 'portrait' / 'landscape' and purely numeric tokens are listed too.
artist_names = [
    'lieven', 'vemeer', 'carvaggio', 'carravaggio', 'wissing',
    'neoclassicist', 'barocco', 'rokoko', 'portraiit', 'pitt',
    'abstract', 'portrait', 'landscape', 'todorovitch',
    '1647', '1666', '1600s', '1614572159', '1615', '16384k',
    '1759', '1786560639', '2k', '3000', '40s', '640',
]

In [432]:
# Rebuild `tokens` from the top-100 table: per century, drop the rows whose
# word is in `artist_names`, then keep the first 77 remaining rows.
df_tokens = pd.DataFrame(tokens)
tokens = dict()
for year in (1600, 1700, 1800, 1900):
    not_excluded = ~df_tokens[year].isin(artist_names)
    kept = df_tokens.loc[not_excluded].iloc[:77]
    tokens[year] = kept[year].values
    tokens[f'{year}_tfidf'] = kept[f'{year}_tfidf'].values

In [435]:
# Serialise the filtered token dictionary with pickle.
with open(path + 'words/tokens.pkl', mode='wb') as pkl_file:
    pickle.dump(tokens, pkl_file)

In [436]:
# Save the filtered tokens as CSV: a word column and a TF-IDF column per century.
df_tokens_final = pd.DataFrame(tokens)
df_tokens_final.to_csv(path + 'words/tokens.csv', index=False)