In [62]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import cv2
from PIL import Image
import os
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Dataset, random_split
import torchvision.transforms as transforms
from torchinfo import summary
import torchvision.models as models

In [63]:
file = pd.read_csv('./data/data_v2.csv', sep='|')
file.head()

Unnamed: 0,image_name,comment_number,comment
0,1000092795.jpg,0,Two young guys with shaggy hair look at their ...
1,1000092795.jpg,1,Two young white males are outside near many b...
2,1000092795.jpg,2,Two men in green shirts are standing in a yard
3,1000092795.jpg,3,A man in a blue shirt standing in a garden
4,1000092795.jpg,4,Two friends enjoy time spent together


In [64]:
# 토큰 모델 지정
from nltk import tokenize

voca_dict = {}
num = 2
for idx in range(file.shape[0]):
    result = tokenize.word_tokenize(file.iloc[idx][2])
    # print(result)
    for word in result:
        if len(word) >= num :
            if voca_dict.get(word) : voca_dict[word] +=1
            else : voca_dict[word] = 1

print(voca_dict)



In [65]:
# 불용어 제거
# nltk.download('stopwords')   # nltk의 불용어 데이터 다운로드
from nltk.corpus import stopwords

stop_list = stopwords.words('english')   # 영어 불용어

list_for_pop = []
for k, v in voca_dict.items():
    if k in stop_list:
        list_for_pop.append(k)
print(f"제거될 단어 개수 : {len(list_for_pop)}")

print(f"제거 전 : {len(voca_dict.keys())}")
for k in list_for_pop:
    voca_dict.pop(k)
print(f"제거 후 : {len(voca_dict.keys())}")

제거될 단어 개수 : 121
제거 전 : 21771
제거 후 : 21650


In [66]:
voca_dict = dict(sorted(voca_dict.items(), key=lambda x: x[1], reverse=True))
voca_dict

{'man': 40121,
 'woman': 21193,
 'Two': 17253,
 'wearing': 15699,
 'people': 14011,
 'white': 13069,
 'shirt': 12981,
 'black': 12101,
 'young': 11978,
 'blue': 11287,
 'red': 9896,
 'sitting': 9610,
 'girl': 9318,
 'boy': 9142,
 'standing': 9105,
 'dog': 8921,
 'men': 8757,
 'playing': 8616,
 'street': 7922,
 'front': 7601,
 'group': 7506,
 'walking': 7330,
 'holding': 6981,
 'water': 5943,
 'The': 5612,
 'one': 5493,
 'green': 5208,
 'looking': 5053,
 'child': 4920,
 'An': 4906,
 'women': 4875,
 'outside': 4690,
 'large': 4656,
 'Three': 4652,
 'yellow': 4537,
 'two': 4389,
 'little': 4375,
 'brown': 4351,
 'person': 4166,
 'hat': 4037,
 'ball': 3891,
 'children': 3825,
 'next': 3748,
 'small': 3600,
 'dressed': 3455,
 'another': 3371,
 'running': 3366,
 'building': 3359,
 'jacket': 3348,
 'People': 3326,
 'riding': 3307,
 'around': 3131,
 'orange': 3066,
 'near': 3012,
 'field': 2979,
 'stands': 2935,
 'beach': 2900,
 'crowd': 2888,
 'background': 2881,
 'pink': 2859,
 'behind': 282

In [67]:
voca_DF = pd.Series(voca_dict).to_frame()
voca_DF

Unnamed: 0,0
man,40121
woman,21193
Two,17253
wearing,15699
people,14011
...,...
straightbacked,1
Carpenter,1
majestically,1
scrolled,1


In [68]:
# 최종 단어 사전 완성

total_voca_dict = {0 : '<UNK>', 1 : '<PAD>'}

for idx in range(len(voca_DF)):
    total_voca_dict[idx+2] = voca_DF.index[idx]
print(total_voca_dict)



In [69]:
len(total_voca_dict.keys())

21652

In [70]:
# 인코딩

caption_list = []  
count = 1
for i in range(file.shape[0]):
    # 띄어쓰기 단위로 분리 후 공백은 제거함
    result = file.iloc[i][2].split(' ')
    result = list(filter(None, result))

    # 각 단어마다 인코딩 진행
    sentence = []
    for word in result:
        word = word.lower()  # 소문자로 변경
        sentence.append(0)
        for k, v in total_voca_dict.items():
            if v == word:
                sentence[-1] = k
                break
    caption_list.append(sentence)
    count +=1
    if count % 5000 == 0 : print(count, end = ' ')

print('done')

5000 10000 15000 20000 25000 30000 35000 40000 45000 50000 55000 60000 65000 70000 75000 80000 85000 90000 95000 100000 105000 110000 115000 120000 125000 130000 135000 140000 145000 150000 155000 done


In [71]:
# 패딩

# 가장 긴 단어의 길이를 구함
max_length = 0
for cap in caption_list:
    if max_length < len(cap) :
        max_length = len(cap)
print(max_length) 

78


In [72]:
# 패딩값 추가함 
for k in range(len(caption_list)):
    len_result = len(caption_list[k])
    print(len_result)
    pad_num = max_length - len_result
    for pad in range(pad_num):
        caption_list[k].append(1)   # <PAD> : 1
    print(len(caption_list[k]))
    print()

16
78

9
78

10
78

10
78

6
78

11
78

11
78

9
78

8
78

6
78

17
78

12
78

9
78

8
78

7
78

16
78

14
78

12
78

10
78

8
78

16
78

9
78

10
78

8
78

6
78

16
78

14
78

8
78

8
78

5
78

15
78

12
78

10
78

8
78

8
78

14
78

13
78

12
78

10
78

7
78

16
78

12
78

13
78

11
78

6
78

14
78

10
78

10
78

9
78

7
78

16
78

10
78

10
78

9
78

9
78

22
78

15
78

15
78

13
78

12
78

18
78

14
78

12
78

8
78

9
78

17
78

15
78

15
78

12
78

14
78

13
78

10
78

11
78

7
78

4
78

19
78

16
78

11
78

11
78

9
78

20
78

19
78

13
78

12
78

9
78

17
78

14
78

11
78

11
78

12
78

20
78

16
78

9
78

11
78

9
78

17
78

13
78

15
78

15
78

11
78

13
78

11
78

13
78

9
78

11
78

18
78

16
78

10
78

8
78

6
78

26
78

13
78

11
78

8
78

8
78

11
78

10
78

7
78

6
78

6
78

19
78

19
78

19
78

13
78

16
78

12
78

11
78

11
78

9
78

8
78

28
78

20
78

18
78

15
78

8
78

13
78

11
78

7
78

9
78

9
78

11
78

12
78

11
78

7
78

7
78

14
78

14
78

8
78

10
78

8
78


In [73]:
caption_df = pd.DataFrame(caption_list)
caption_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,68,69,70,71,72,73,74,75,76,77
0,37,10,288,0,2075,81,147,0,0,121,...,1,1,1,1,1,1,1,1,1,1
1,37,10,7,662,0,33,55,273,1379,1,...,1,1,1,1,1,1,1,1,1,1
2,37,18,0,28,209,0,16,0,0,438,...,1,1,1,1,1,1,1,1,1,1
3,0,2,0,0,11,8,16,0,0,656,...,1,1,1,1,1,1,1,1,1,1
4,37,456,757,542,10455,104,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [None]:
file

In [74]:
total_caption = pd.concat([file, caption_df], axis = 1)
total_caption.head(2)

Unnamed: 0,image_name,comment_number,comment,0,1,2,3,4,5,6,...,68,69,70,71,72,73,74,75,76,77
0,1000092795.jpg,0,Two young guys with shaggy hair look at their ...,37,10,288,0,2075,81,147,...,1,1,1,1,1,1,1,1,1,1
1,1000092795.jpg,1,Two young white males are outside near many b...,37,10,7,662,0,33,55,...,1,1,1,1,1,1,1,1,1,1


In [75]:
# 전처리 완료한 caption을 csv로 저장
total_caption.to_csv('encoded_data_v2.csv')