<a href="https://colab.research.google.com/github/Zoella-Choi/DataAnalysis/blob/main/%EC%98%A4%ED%94%88%EC%86%8C%EC%8A%A4_%EB%8D%B0%EC%9D%B4%ED%84%B0_%EB%B6%84%EC%84%9D_14%EA%B0%95.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 14강 비정형 데이터 분석 : 패션 사진 데이터 활용

### 목표

- 비정형 데이터를 인공지능 모델로 분석하여 실무에서 활용 가능한 보고서 형태로 가공

- 패션 트렌드라는 구체적인 주제를 통해, 비정형 데이터 분석의 실질적인 활용 방안을 경험하고자 함


### 분석 프로세스 개요

1. 데이터 수집
  - requests를 이용한 RSS 데이터 수집
  - lxml을 이용한 XML 파싱
  - 이미지 데이터 추출
2. VLM을 이용한 이미지 분석
  - 프롬프트를 이용한 이미지 필터링
  - 프롬프트를 이용한 스타일 분석
3. LLM을 이용한 키워드 분석 및 보고서 작성
  - 텍스트 전처리
  - 색상 및 스타일 키워드 추출
  - 워드 클라우드 분석
  - 보고서 작성

# 주의 : 런타임 GPU 로 설정 필요

In [1]:
# 4bit VLM 처리를 위한 bitsandbytes 설치
# LLM 처리를 위한 VLLM 설치 (오래걸리는 작업(>5분)이므로 미리 실행!)
!pip install bitsandbytes==0.45.3 vllm==0.7.3 transformers==4.48.2
# 필요 시 세션 재시작

Collecting bitsandbytes==0.45.3
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting vllm==0.7.3
  Downloading vllm-0.7.3-cp38-abi3-manylinux1_x86_64.whl.metadata (25 kB)
Collecting transformers==4.48.2
  Downloading transformers-4.48.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting numpy>=1.17 (from bitsandbytes==0.45.3)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting blake3 (from vllm==0.7.3)
  Downloading blake3-1.0.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm==0.7.3)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadat

In [1]:
# 한글 처리를 위한 matplotlib 설정 (1)

!sudo apt-get install -y fonts-nanum
!sudo fc-cache –fv
!rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  fonts-nanum
0 upgraded, 1 newly installed, 0 to remove and 38 not upgraded.
Need to get 10.3 MB of archives.
After this operation, 34.1 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-nanum all 20200506-1 [10.3 MB]
Fetched 10.3 MB in 2s (5,938 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package fonts-nanum.
(Reading database ... 126675 files and dire

- 런타임 -> 세션 다시 시작

In [1]:
# 한글 처리를 위한 matplotlib 설정 (2)

import matplotlib.pyplot as plt
plt.rc('font', family='NanumBarunGothic')

# 1. 데이터 수집 및 전처리

## 14-1 RSS 피드에서 이미지 URL 추출

In [2]:
import requests
from lxml import etree
from lxml.html import fromstring
import pandas as pd

def extract_unique_images(rss_url):
    ## 주어진 RSS 피드 URL에서 고유한 이미지 URL들을 추출하는 함수 정의
    try:
        ## requests 라이브러리를 사용하여 RSS 피드 URL로부터 내용을 가져옴
        response = requests.get(rss_url)
        ## 가져온 XML 응답 내용을 lxml의 etree.fromstring으로 파싱하여 XML 트리 root를 생성
        root = etree.fromstring(response.content)

        ## 이미지 URL을 저장할 빈 리스트 생성
        image_urls = set()

        ## XML 트리에서 모든 'item' 태그를 XPath를 사용하여 순회
        for item in root.xpath('//item'):
            description = item.find('description')
            if description is not None and description.text:
                ## description의 텍스트 내용을 lxml.html.fromstring으로 파싱하여 HTML 트리를 생성
                html_tree = fromstring(description.text)
                ## HTML 트리에서 첫 번째 <img> 태그의 'src' 속성 값을 XPath를 사용하여 추출
                img_url = html_tree.xpath('string(//img/@src)')
                if img_url:
                    image_urls.add(img_url)

        return list(image_urls)

    except Exception as e:
        ## 오류 발생 시 오류 메시지를 출력하고 빈 리스트를 반환
        print(f"Error occurred: {e}")
        return []

rss_url = "https://glltn.com/feed/"
## extract_unique_images 함수를 호출하여 고유한 이미지 URL들을 추출
unique_images = extract_unique_images(rss_url)

## 추출된 이미지 URL 리스트를 사용하여 'image'라는 열을 가진 pandas DataFrame을 생성
df = pd.DataFrame(unique_images, columns=["image"])

In [3]:
df

Unnamed: 0,image
0,https://glltn.com/wp-content/blogs.dir/1/files...
1,https://glltn.com/wp-content/blogs.dir/1/files...
2,https://glltn.com/wp-content/blogs.dir/1/files...
3,https://glltn.com/wp-content/blogs.dir/1/files...
4,https://glltn.com/wp-content/blogs.dir/1/files...
5,https://glltn.com/wp-content/blogs.dir/1/files...
6,https://glltn.com/wp-content/blogs.dir/1/files...
7,https://glltn.com/wp-content/blogs.dir/1/files...
8,https://glltn.com/wp-content/blogs.dir/1/files...
9,https://glltn.com/wp-content/blogs.dir/1/files...


## 14-2 수집 데이터 확인

In [4]:
from IPython.display import display, HTML

def path_to_image_html(path):
    ## 이미지 경로를 HTML img 태그로 변환하는 함수
    return f'<img src="{path}" width="300" />'

## DataFrame의 스타일을 설정하여 이미지 너비를 300px로 지정
df.style.set_table_styles([{'selector': 'img', 'props': 'width: 300px;'}])

## DataFrame을 HTML로 변환하여 출력. 이미지 열은 path_to_image_html 함수로 포맷팅
display(HTML(df.to_html(escape=False, formatters=dict(**{'image': path_to_image_html}))))

Unnamed: 0,image
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,


## 2. VLM을 이용한 이미지 분석

## 14-3 VLM 모델 로드

In [5]:
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

## 'openbmb/MiniCPM-V-2_6-int4' 모델을 사전 훈련된 가중치와 함께 로드
## trust_remote_code=True는 허브에서 사용자 정의 코드를 실행할 수 있도록 허용
model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6-int4', trust_remote_code=True)
## 로드된 모델에 해당하는 토크나이저를 로드
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6-int4', trust_remote_code=True)
## 모델을 평가 모드로 설정 (드롭아웃 등 훈련 시에만 필요한 기능 비활성화)
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

configuration_minicpm.py: 0.00B [00:00, ?B/s]

modeling_navit_siglip.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- modeling_navit_siglip.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- configuration_minicpm.py
- modeling_navit_siglip.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_minicpmv.py: 0.00B [00:00, ?B/s]

resampler.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- resampler.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- modeling_minicpmv.py
- resampler.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenization_minicpmv_fast.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- tokenization_minicpmv_fast.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

MiniCPMV(
  (llm): Qwen2ForCausalLM(
    (model): Qwen2Model(
      (embed_tokens): Embedding(151666, 3584)
      (layers): ModuleList(
        (0-27): 28 x Qwen2DecoderLayer(
          (self_attn): Qwen2Attention(
            (q_proj): Linear4bit(in_features=3584, out_features=3584, bias=True)
            (k_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
            (v_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
            (o_proj): Linear4bit(in_features=3584, out_features=3584, bias=False)
          )
          (mlp): Qwen2MLP(
            (gate_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
            (up_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
            (down_proj): Linear4bit(in_features=18944, out_features=3584, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
          (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=

![](https://farm3.staticflickr.com/2677/4434956914_6e95a22940_z.jpg)

## 14-4 이미지 질문 응답 예시

In [None]:
from transformers import set_seed

## 재현성을 위해 시드(seed)를 42로 설정
set_seed(42)
## 예시 이미지 URL 정의
image_url = 'https://farm3.staticflickr.com/2677/4434956914_6e95a22940_z.jpg'
## requests로 이미지 다운로드 후 PIL Image 객체로 열고 RGB 형식으로 변환
image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
## 이미지에 대한 질문 정의
question = 'how many cats in the photo?'
## 모델 입력 형식에 맞춰 메시지 구성 (이미지와 질문 포함)
msgs = [{'role': 'user', 'content': [image, question]}]
## 모델의 chat 함수를 호출하여 이미지와 질문에 대한 응답 생성
result = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
## 모델의 응답 출력
print(result)

In [None]:
set_seed(42)
## 이미지에 대한 질문을 업데이트. 책 표지의 고양이도 포함하도록 요청
question = 'how many cats in the photo? including the books cover.'
## 모델 입력 형식에 맞춰 메시지 구성 (이전에 로드된 이미지와 업데이트된 질문 포함)
msgs = [{'role': 'user', 'content': [image, question]}]
## 모델의 chat 함수를 호출하여 업데이트된 질문에 대한 응답 생성
result = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
## 모델의 응답 출력
print(result)

In [None]:
set_seed(86)
## 이미지에 대한 질문을 'describe the photo'로 설정하여 이미지 내용을 설명하도록 요청
question = 'describe the photo'
## 모델 입력 형식에 맞춰 메시지 구성 (이전에 로드된 이미지와 설명 요청 질문 포함)
msgs = [{'role': 'user', 'content': [image, question]}]
## 모델의 chat 함수를 호출하여 이미지에 대한 설명을 생성
result = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
## 모델의 응답 (이미지 설명) 출력
print(result)

## 14-5 의류 이미지 여부 판단

In [11]:
def is_picture_of_clothing(image_url):
    ## 이미지 URL이 의류 사진인지 판단하는 함수
    # 의류가 포함된 사진인지 확인하는 질문 작성 (영어로)
    question = 'Is this a picture of clothing? MUST say yes or no.'
    image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
    msgs = [{'role': 'user', 'content': [image, question]}]
    result = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, temperature=0.1)
    print(result)
    ## 응답에 'yes'가 포함되어 있는지 확인하여 True/False 반환
    return 'yes' in result.lower()

## DataFrame의 'image' 열에 함수를 적용하여 'is_clothing' 열에 결과 저장
df['is_clothing'] = df['image'].apply(is_picture_of_clothing)

preprocessor_config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

processing_minicpmv.py: 0.00B [00:00, ?B/s]

image_processing_minicpmv.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- image_processing_minicpmv.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- processing_minicpmv.py
- image_processing_minicpmv.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Yes, this image appears to be a display of clothing. The man is wearing a sweater, shorts, and a hat, which are the main subjects of the photograph. The focus on his attire suggests that it could be used for fashion-related purposes such as an advertisement or a catalog showcasing these items.
Yes.
Yes, this image appears to be a picture of clothing. The focus is on the individual's attire, which includes a dark green shirt with a high collar and long sleeves, paired with patterned trousers. The style of the clothing suggests a contemporary fashion look, possibly from a designer collection or a modern streetwear brand. The photograph seems to be styled for display purposes, likely intended to showcase the garments in question.
No, the image is not of clothing. It is a photograph of a book cover featuring a man and a vintage car with an American flag. The content suggests that it is related to photography, specifically the work of Joel Meyerowitz over a span of several decades.
Yes.
Yes

## 14-6 의류 판단 결과 시각화

In [12]:
display(HTML(df.to_html(escape=False, formatters=dict(**{'image': path_to_image_html}))))

Unnamed: 0,image,is_clothing
0,,True
1,,True
2,,True
3,,False
4,,True
5,,True
6,,True
7,,False
8,,True
9,,True


## 14-7 의류 이미지 필터링

In [13]:
## 'is_clothing' 열의 값이 True인 행들만 필터링하여 DataFrame을 업데이트
df = df[df['is_clothing']]

In [14]:
display(HTML(df.to_html(escape=False, formatters=dict(**{'image': path_to_image_html}))))

Unnamed: 0,image,is_clothing
0,,True
1,,True
2,,True
4,,True
5,,True
6,,True
8,,True
9,,True
10,,True
11,,True


## 14-8 의류 스타일 분석

In [15]:
def describe_style(image_url):
    ## 주어진 이미지 URL의 의류 스타일을 분석하는 함수
    question = 'Analyze the style of the clothes. Please let me explain the colors and trend changes.'
    image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
    msgs = [{'role': 'user', 'content': [image, question]}]
    ## 모델의 chat 함수를 호출하여 이미지에 대한 스타일 분석 응답 생성
    result = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
    return result

## 필터링된 DataFrame의 'image' 열에 describe_style 함수를 적용
## 결과는 'style'이라는 새로운 열에 저장
df['style'] = df['image'].apply(describe_style)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['style'] = df['image'].apply(describe_style)


In [16]:
display(HTML(df.to_html(escape=False, formatters=dict(**{'image': path_to_image_html}))))

Unnamed: 0,image,is_clothing,style
0,,True,"The style of the clothes in the image leans towards a casual, possibly outdoor-inspired look. The beige sweater with its textured appearance suggests a preference for comfort and warmth, which is often associated with leisure or travel attire. The use of neutral tones like beige and navy blue indicates a trend that favors understated, versatile colors over bold hues. This can be seen as part of a broader fashion shift towards minimalism and practicality.\n\nThe layered clothing, consisting of a short-sleeve shirt under the sweater, hints at a transitional season wardrobe, suitable for mild weather where additional layers can be added or removed as needed. The shorts paired with this ensemble suggest an intention to stay cool while maintaining a put-together appearance.\n\nOverall, the outfit reflects a contemporary casual style that balances comfort with a touch of ruggedness, potentially inspired by outdoor activities or urban streetwear trends."
1,,True,"The sweater the man is wearing appears to be a classic style with a Nordic or Fair Isle pattern, which is characterized by its intricate and colorful designs. This type of sweater has been popular for many years and often evokes a sense of warmth and coziness, making it suitable for colder seasons. The use of earthy tones such as beige, brown, and green suggests a preference for natural and muted colors that are versatile and timeless.\n\nIn terms of trend changes, sweaters like this one have seen periodic resurgences in popularity due to their comfort and ability to blend with various styles. While they might not follow the latest fashion trends, they remain a staple in casual wear because of their enduring appeal. The patterned design adds a touch of traditional craftsmanship, which can be appealing to those who appreciate artisanal clothing.\n\nOverall, the style of the clothes reflects a blend of practicality and aesthetic appeal, catering to individuals who value both comfort and tradition in their wardrobe choices."
2,,True,"The style of the clothes in the image leans towards a minimalist and perhaps slightly vintage-inspired aesthetic. The dark green button-up shirt has a classic design, which suggests it could be versatile for both casual and semi-formal occasions. The color choice is subdued and earthy, indicating a preference for understated elegance rather than bold fashion statements.\n\nThe maroon collar peeking out from under the shirt adds a subtle contrast to the overall look, suggesting attention to detail in layering. This can be seen as a modern twist on traditional styling, where layers are used not just for warmth but also for creating visual interest.\n\nThe patterned shorts, while not fully visible, seem to have a textured or dotted design, which adds depth and character to the outfit without overwhelming it. This kind of detailing is often found in contemporary men's fashion, blending comfort with a touch of sophistication.\n\nOverall, the clothing items chosen by the individual reflect a blend of timeless elements with modern touches, possibly hinting at an appreciation for classic styles that have been updated with current trends."
4,,True,"The style of the shoes in the image leans towards outdoor or hiking-inspired fashion. The color palette is earthy and muted, which is often associated with natural environments and outdoor activities. This choice of colors not only provides a practical benefit by blending with nature but also suggests a trend that favors understated, versatile, and functional designs over flashy or vibrant ones.\n\nIn recent years, there has been a resurgence of interest in utility and durability in clothing and footwear, particularly within urban streetwear culture. This reflects a broader trend where consumers are seeking products that offer both aesthetic appeal and functionality. The design of these shoes, with their robust construction and practical features like reinforced stitching and durable soles, aligns with this contemporary trend.\n\nFurthermore, the use of materials such as suede and leather indicates a preference for classic, timeless styles rather than fast-changing trends. These materials are known for their longevity and ability to withstand wear and tear, making them popular choices for items meant to be used extensively, such as hiking boots.\n\nOverall, the shoes can be seen as part of a larger movement towards sustainable and versatile fashion, emphasizing quality, comfort, and adaptability across different settings, including outdoor adventures."
5,,True,"The style of the clothes worn by the individual in the image suggests a modern, minimalist aesthetic with an emphasis on simplicity and functionality. The dark color palette is versatile and timeless, often associated with sophistication and understated elegance. The rolled-up sleeves add a casual yet intentional touch to the outfit, indicating a blend of formality and comfort.\n\nIn terms of current fashion trends, this ensemble could be categorized under contemporary urban wear or streetwear. These styles typically prioritize comfort, practicality, and a relaxed silhouette. The use of layering, as seen with the jacket over the shirt, is a common trend that allows for adaptability to changing temperatures and provides depth to the overall look.\n\nThe choice of a belt around the waist serves both a functional purpose and adds a visual break to the garment, enhancing the fit and giving the outfit a more polished appearance. This attention to detail is indicative of a thoughtful approach to personal styling, which has become increasingly popular in recent years.\n\nOverall, the clothing style reflects a modern sensibility that values both aesthetics and practicality, aligning well with current fashion trends that favor minimalism and versatility."
6,,True,"The style of the clothes worn by the individual in the image leans towards a casual and somewhat minimalist aesthetic. The light yellow shirt is a classic color that has been popular for its versatility and ability to convey a sense of freshness and simplicity. It's a timeless choice often associated with spring or summer wardrobes, suggesting a preference for lighter, breathable fabrics suitable for warmer weather.\n\nThe shirt's design features such as the collar, pocket placement, and button-down style are indicative of a traditional men's dress shirt but executed in a relaxed fit, which aligns with contemporary fashion trends favoring comfort and ease of movement over strict formality. This shift reflects a broader trend in fashion where practicality and personal expression take precedence over rigid adherence to conventional styles.\n\nThe light blue jeans complement the shirt well, maintaining the casual yet put-together look. Jeans are an evergreen staple in men's wardrobes due to their durability and versatility, making them a go-to choice for various occasions from casual outings to more laid-back professional settings.\n\nOverall, the clothing choices suggest a modern approach to dressing, balancing between comfort and a touch of sophistication without sacrificing either."
8,,True,"The style of the clothes worn by the individual in the image leans towards a casual yet sophisticated look, often associated with contemporary streetwear or smart-casual fashion. The olive green jacket is practical and versatile, commonly seen in outdoor or military-inspired attire. Its padded design suggests functionality for cooler weather, while the high collar adds an element of formality.\n\nThe layered look, combining a button-up shirt under the jacket, indicates a trend that favors depth and texture in clothing. This layering technique not only provides warmth but also creates visual interest through contrasting colors and fabrics. The blue-striped shirt introduces a classic pattern that contrasts with the solid color of the outerwear, adding to the overall aesthetic appeal.\n\nThe choice of dark pants complements the outfit without drawing too much attention away from the upper garments. Darker tones in clothing are generally considered more versatile and can pair well with various shades, making them a staple in many wardrobes.\n\nOverall, the combination of these elements reflects a modern approach to dressing where comfort meets style, suitable for both casual outings and slightly more formal occasions."
9,,True,"The style of the clothes in the image suggests a contemporary, urban fashion trend that prioritizes comfort and functionality. The olive green color of the parkas is versatile and has been popular in recent years for its earthy tone and ability to match with various colors. This hue often evokes a sense of ruggedness and practicality, which aligns well with outdoor or streetwear aesthetics.\n\nThe oversized fit of the jackets indicates a preference for loose-fitting clothing, which has become a significant trend in modern fashion. This style choice can provide ease of movement and comfort, as seen in the relaxed posture of both individuals. The presence of hoods and drawstrings further emphasizes the practical aspects of these garments, catering to cold weather conditions while also serving as stylish details.\n\nUnderneath the outer layers, the use of neutral shades like grey and black creates a cohesive look that is easy to coordinate with other pieces in a wardrobe. These understated colors are typical of minimalist fashion trends, where simplicity and subtlety are key elements.\n\nOverall, the clothing reflects a blend of casual and utilitarian styles, likely appealing to those who value both comfort and a modern, effortless aesthetic."
10,,True,"The clothing style depicted in the image leans towards a minimalist and monochromatic aesthetic, which is often associated with modern urban fashion. The use of black as the sole color choice suggests a preference for simplicity and versatility, as black is a classic color that can be easily paired with other colors or accessories.\n\nThe henley shirt is a casual yet stylish piece that has seen resurgence in popularity due to its relaxed fit and functional design. Henley shirts are known for their ease of wearability and comfort, making them a staple in contemporary casual wardrobes. The presence of buttons adds a subtle detail that breaks the monotony of the solid color, providing a touch of texture and interest without overwhelming the overall look.\n\nThe high-waisted pants complement the shirt by maintaining the monochromatic theme while also offering a trendy silhouette that has been popular in recent fashion trends. High-waisted pants are often chosen for their flattering effect on the waistline and hips, contributing to a balanced and proportionate appearance.\n\nOverall, the outfit reflects a current trend toward minimalist, monochromatic looks with an emphasis on comfort and casual elegance. This style is versatile enough to transition from daywear to more relaxed evening settings, showcasing a timeless appeal that transcends fleeting fashion cycles."
11,,True,"The style of the clothes worn by the individual in the image reflects a blend of classic and contemporary fashion elements. The black blazer is a timeless piece, often associated with formal or semi-formal attire, but its modern cut suggests a nod to current trends that favor a more relaxed fit over rigid silhouettes. The high-neck white garment underneath adds an unexpected layering element, which has been popular in recent fashion circles for adding depth and interest to an outfit.\n\nThe tie-dye pants are a significant departure from traditional clothing norms, introducing a bold pattern and color contrast that is indicative of a trend towards embracing vibrant prints and unconventional textures. Tie-dye, in particular, has seen a resurgence in popularity as a statement-making fabric choice, often used to inject personality into otherwise standard pieces.\n\nOverall, the combination of these items points to a fashion-forward approach where classic cuts meet modern, eye-catching patterns, creating a look that is both stylish and individualistic."


# 3. LLM을 이용한 키워드 분석 및 보고서 작성

## 14-9 언어 모델(LLM) 로드

In [19]:
from vllm import LLM, SamplingParams

## vLLM 라이브러리를 사용하여 'LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct' 모델을 로드
## gpu_memory_utilization은 GPU 메모리 사용 비율을 0.5로 설정
## max_model_len은 모델이 처리할 수 있는 최대 토큰 길이를 10000으로 설정
llm = LLM(model='LGAI-EXAONE-3.5-2.4B-Instruct', gpu_memory_utilization=0.5, max_model_len=10000)

ERROR 10-18 09:38:43 config.py:102] Error retrieving file list: 401 Client Error. (Request ID: Root=1-68f36023-3df0f80e1b21c64f4f4dd096;e9126732-2532-4d25-97ed-8f021d62c28c)
ERROR 10-18 09:38:43 config.py:102] 
ERROR 10-18 09:38:43 config.py:102] Repository Not Found for url: https://huggingface.co/api/models/LGAI-EXAONE-3.5-2.4B-Instruct/tree/main?recursive=True&expand=False.
ERROR 10-18 09:38:43 config.py:102] Please make sure you specified the correct `repo_id` and `repo_type`.
ERROR 10-18 09:38:43 config.py:102] If you are trying to access a private or gated repo, make sure you are authenticated. For more details, see https://huggingface.co/docs/huggingface_hub/authentication
ERROR 10-18 09:38:43 config.py:102] Invalid username or password., retrying 1 of 2
ERROR 10-18 09:38:45 config.py:100] Error retrieving file list: 401 Client Error. (Request ID: Root=1-68f36025-51671fd4745439eb04ba1dcd;d78e317a-1314-4807-b8eb-6dc4299f6c8e)
ERROR 10-18 09:38:45 config.py:100] 
ERROR 10-18 

RepositoryNotFoundError: 401 Client Error. (Request ID: Root=1-68f36025-51671fd4745439eb04ba1dcd;d78e317a-1314-4807-b8eb-6dc4299f6c8e)

Repository Not Found for url: https://huggingface.co/api/models/LGAI-EXAONE-3.5-2.4B-Instruct/tree/main?recursive=True&expand=False.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated. For more details, see https://huggingface.co/docs/huggingface_hub/authentication
Invalid username or password.

## 14-10 색상 정보 추출

In [None]:
from vllm import SamplingParams ## SamplingParams 임포트가 필요

def extract_color(style):
  ## 주어진 스타일 설명 텍스트에서 색상을 한글로 추출하는 함수
  prompt = [
      {
          "role": "system",
          "content": "You are EXAONE model from LG AI Research, a helpful assistant."
      },
      {
          "role": "user",
          "content": f"" # vlm이 작성한 글에서 색상 정보 추출, 한글로 번역하면서
      }
  ]
  ## 샘플링 파라미터 설정 (온도, top_p, 최대 토큰 수)
  sampling_params = SamplingParams(temperature=0.2, top_p=0.95, max_tokens=1024)
  ## LLM 모델을 사용하여 프롬프트에 대한 응답 생성
  result = llm.chat(prompt, sampling_params)[0].outputs[0].text
  print(result)
  return result

## DataFrame의 'style' 열에 extract_color 함수를 적용
## 결과는 'color'라는 새로운 열에 저장
df['color'] = df['style'].apply(extract_color)

## 14-11 스타일 키워드 추출

In [None]:
from vllm import SamplingParams ## SamplingParams 임포트가 필요

def extract_color(style):
  ## 주어진 스타일 설명 텍스트에서 스타일 키워드를 한글로 추출하는 함수
  prompt = [
      {
          "role": "system",
          "content": "You are EXAONE model from LG AI Research, a helpful assistant."
      },
      {
          "role": "user",
          "content": f"" # vlm이 작성한 글에서 스타일 키워드 추출, 한글로 번역하면서
      }
  ]
  ## 샘플링 파라미터 설정 (온도, top_p, 최대 토큰 수)
  sampling_params = SamplingParams(temperature=0.2, top_p=0.95, max_tokens=1024)
  ## LLM 모델을 사용하여 프롬프트에 대한 응답 생성
  result = llm.chat(prompt, sampling_params)[0].outputs[0].text
  print(result)
  return result

## DataFrame의 'style' 열에 extract_color 함수를 적용 (함수 이름은 이전과 동일하지만 기능 변경)
## 결과는 'keyword'라는 새로운 열에 저장
df['keyword'] = df['style'].apply(extract_color)

In [None]:
display(HTML(df.to_html(escape=False, formatters=dict(**{'image': path_to_image_html}))))

## 14-12 텍스트 데이터 정제

In [None]:
import re

def clean_text(text):
    ## 텍스트에서 특수 문자 및 HTML 태그를 제거하고 소문자로 변환하는 함수
    if isinstance(text, str):
       ## 영문, 숫자, 한글, 공백을 제외한 모든 문자 제거
       text = re.sub(r'[^a-zA-Z0-9가-힣\s]', '', text)
       ## HTML 태그 제거
       text = re.sub(r'<[^>]*>', '', text)
       ## 텍스트를 소문자로 변환
       text = text.lower()
       return text
    else:
        return ""

## 'color' 열의 텍스트 데이터 정제
df['color'] = df['color'].apply(clean_text)
## 'keyword' 열의 텍스트 데이터 정제
df['keyword'] = df['keyword'].apply(clean_text)

## 14-13 워드 클라우드 생성 및 시각화

In [None]:
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def get_word_count(df):
    ## DataFrame의 'color'와 'keyword' 열에서 단어 빈도를 계산하는 함수
    if not df.empty:
        ## 'color' 열의 모든 단어를 리스트로 합침
        all_nouns = df['color'].apply(str.split).sum()
        ## 'keyword' 열의 모든 단어를 추가
        all_nouns += df['keyword'].apply(str.split).sum()
        ## '색상' 단어를 제외한 모든 단어를 필터링
        all_nouns = [word for word in all_nouns if word not in ['색상']]
        ## 단어 빈도를 Counter 객체로 반환
        return Counter(all_nouns)
    return Counter() ## DataFrame이 비어있으면 빈 Counter 반환

def create_wordcloud(word_count):
    ## 단어 빈도수를 기반으로 워드 클라우드를 생성하고 시각화하는 함수
    if not word_count: ## 단어 빈도가 없으면 워드클라우드 생성하지 않음
        print("No words to generate word cloud.")
        return

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap='viridis',
        font_path='/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf' ## 한글 폰트 경로 지정
        ).generate_from_frequencies(word_count)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off") ## 축 표시 제거
    plt.show() ## 워드 클라우드 출력

## DataFrame에서 단어 빈도 계산
word_count = get_word_count(df)
## 계산된 단어 빈도로 워드 클라우드 생성 및 시각화
create_wordcloud(word_count)

## 14-14 트렌드 분석 보고서 생성 프롬프트 구성 및 실행

## 14-15 분석 보고서 시각화

In [None]:
from vllm import SamplingParams ## SamplingParams 임포트가 필요

## 시스템 메시지로 시작하는 프롬프트 리스트 초기화
prompt = [
    {
        "role": "system",
        "content": "You are EXAONE model from LG AI Research, a helpful assistant."
    }
]
## DataFrame의 각 행을 순회하며 '스타일 노트'와 '이미지 URL'을 사용자 메시지로 추가
for row in df.itertuples():
  prompt.append({"role": "user", "content": f""})
## 마지막으로, 종합적인 트렌드 분석 보고서 작성을 요청하는 사용자 메시지 추가
## 보고서 제목, 내용의 전문성, 마크다운 형식, 예시 이미지 포함을 지시
prompt.append({"role": "user", "content": ""})

## 샘플링 파라미터 설정 (온도, top_p, 최대 토큰 수)
sampling_params = SamplingParams(temperature=0.2, top_p=0.95, max_tokens=4096)
## LLM 모델을 사용하여 구성된 프롬프트에 대한 응답 생성
result = llm.chat(prompt, sampling_params)[0].outputs[0].text

In [None]:
from IPython.display import display, Markdown

## LLM으로부터 생성된 결과(Markdown 형식의 보고서)를 Jupyter 환경에 표시
display(Markdown(result))