In [2]:
import os
import pandas as pd
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import time

# 모델 정보
model_name_or_path = "heegyu/EEVE-Korean-Instruct-10.8B-v1.0-GGUF"
model_basename = "ggml-model-Q4_K_M.gguf"

# 모델 파일 경로
model_path = os.path.join(os.getcwd(), model_basename)

# 모델 다운로드 (필요한 경우에만)
if not os.path.exists(model_path):
    print("모델 파일이 로컬에 없습니다. 다운로드를 시작합니다...")
    model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)
    print(f"모델이 다운로드되었습니다: {model_path}")
else:
    print(f"모델 파일이 이미 존재합니다: {model_path}")
# GPU 사용 설정 (코랩 환경에 맞게 조정)
llm = Llama(
    model_path=model_path,
    n_threads=2,  # CPU 코어 수
    n_batch=512,  # 1과 n_ctx 사이여야 하며, GPU VRAM 양을 고려해야 합니다.
    n_gpu_layers=43,  # 모델과 GPU VRAM에 따라 이 값을 조정하세요.
    n_ctx=4096,  # 컨텍스트 윈도우
)

# Excel 파일 읽기 (처음 3개의 행만)
df = pd.read_excel('finalDAta.xlsx', nrows=3)

# 프롬프트 템플릿 정의
template = """다음은 그림에 대한 정보입니다:

제목: {title}
평론: {description}
유형: {type}
상세 설명: {description_plus}

위의 정보를 바탕으로, 그림에 대한 객관적이고 간결한 설명을 100단어 이내로 작성해주세요.
그림의 주요 소재, 스타일, 그리고 전반적인 분위기에 초점을 맞춰주세요.

객관적 설명:
"""

# 각 행에 대해 설명 생성
def generate_description(row):
    prompt = template.format(
        title=row['TITLE'],
        description=row['DESCRIPTION'],
        type=row['TYPE'],
        description_plus=row['description_plus']
    )
    response = llm(
        prompt=prompt,
        max_tokens=256,
        temperature=0.7,
        top_p=0.95,
        top_k=50,
        stop=['Human:', '\n\n'],
        echo=False
    )
    return response['choices'][0]['text'].strip()

# 새로운 설명 생성 및 저장
df['description_final'] = df.apply(generate_description, axis=1)

# 결과를 Excel 파일로 저장
df.to_excel('tokenized_semart_test_combined_with_final_description.xlsx', index=False)

print("처리가 완료되었습니다. 결과가 새 Excel 파일에 저장되었습니다.")

# 결과 출력
print("\n처리된 데이터:")
print(df[['DESCRIPTION', 'TYPE', 'description_plus', 'description_final']])

모델 파일이 로컬에 없습니다. 다운로드를 시작합니다...


ggml-model-Q4_K_M.gguf:   0%|          | 0.00/6.51G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
llama_model_loader: loaded meta data with 24 key-value pairs and 435 tensors from C:\Users\brian\.cache\huggingface\hub\models--heegyu--EEVE-Korean-Instruct-10.8B-v1.0-GGUF\snapshots\9bf4892cf2017362dbadf99bd9a3523387135362\ggml-model-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_lengt

모델이 다운로드되었습니다: C:\Users\brian\.cache\huggingface\hub\models--heegyu--EEVE-Korean-Instruct-10.8B-v1.0-GGUF\snapshots\9bf4892cf2017362dbadf99bd9a3523387135362\ggml-model-Q4_K_M.gguf


llm_load_tensors:        CPU buffer size =  6210.02 MiB
...................................................................................................
llama_new_context_with_model: n_ctx      = 4096
llama_new_context_with_model: n_batch    = 512
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: flash_attn = 0
llama_new_context_with_model: freq_base  = 10000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init:        CPU KV buffer size =   768.00 MiB
llama_new_context_with_model: KV self size  =  768.00 MiB, K (f16):  384.00 MiB, V (f16):  384.00 MiB
llama_new_context_with_model:        CPU  output buffer size =     0.16 MiB
llama_new_context_with_model:        CPU compute buffer size =   296.01 MiB
llama_new_context_with_model: graph nodes  = 1542
llama_new_context_with_model: graph splits = 1
AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | 

FileNotFoundError: [Errno 2] No such file or directory: 'finalDAta.xlsx'

In [3]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List, Optional
from datetime import datetime

class RestaurantData(BaseModel):
    개설일자: str = Field(description="Date of establishment in YYYY-MM-DD format")
    주소: str = Field(description="Address in 제주시 XX동 format")
    가맹점명_포함_텍스트: str = Field(description="Text containing store name")
    이용_건수_상위: int = Field(description="Usage count percentile", le=100, ge=0)
    총_매출_상위: int = Field(description="Total sales percentile", le=100, ge=0)
    건당_이용_금액_상위: int = Field(description="Amount per usage percentile", le=100, ge=0)
    여행_요일: str = Field(description="Preferred days of travel (comma-separated)")
    성별_선호: str = Field(description="Gender preference (남 or 여)")
    선호_나이대: str = Field(description="Preferred age group (20,30,40,50,60)")
    현지인맛집: bool = Field(description="Whether it's a local favorite")
    분류: str = Field(description="Restaurant category (카페, 한식, 중식, 단품요리)")
    오름차순: str = Field(description="Ascending order feature")

    class Config:
        validate_assignment = True

    @classmethod
    def validate_date_format(cls, value: str) -> bool:
        try:
            datetime.strptime(value, '%Y-%m-%d')
            return True
        except ValueError:
            return False

    @classmethod
    def validate_days(cls, value: str) -> bool:
        valid_days = {'월', '화', '수', '목', '금', '토', '일'}
        days = {day.strip() for day in value.split(',')}
        return all(day in valid_days for day in days)

    def validate_fields(self) -> List[str]:
        errors = []
        
        # Validate date format
        if not self.validate_date_format(self.개설일자):
            errors.append("Invalid date format. Use YYYY-MM-DD")
            
        # Validate address format
        if not self.주소.startswith("제주시"):
            errors.append("Address must start with '제주시'")
            
        # Validate percentiles
        for field, value in {
            "이용_건수_상위": self.이용_건수_상위,
            "총_매출_상위": self.총_매출_상위,
            "건당_이용_금액_상위": self.건당_이용_금액_상위
        }.items():
            if not (0 <= value <= 100):
                errors.append(f"{field} must be between 0 and 100")
                
        # Validate days
        if not self.validate_days(self.여행_요일):
            errors.append("Invalid day format")
            
        # Validate gender
        if self.성별_선호 not in ["남", "여"]:
            errors.append("Gender must be either '남' or '여'")
            
        # Validate age group
        if self.선호_나이대 not in ["20", "30", "40", "50", "60"]:
            errors.append("Invalid age group")
            
        # Validate category
        if self.분류 not in ["카페", "한식", "중식", "단품요리"]:
            errors.append("Invalid category")
            
        return errors

def create_restaurant_parser():
    """Create a JSON parser for restaurant data"""
    return JsonOutputParser(pydantic_object=RestaurantData)

def process_restaurant_data(text_output: str) -> Optional[RestaurantData]:
    """Process the text output and return structured restaurant data"""
    try:
        parser = create_restaurant_parser()
        parsed_data = parser.parse(text_output)
        
        # Validate the parsed data
        validation_errors = parsed_data.validate_fields()
        if validation_errors:
            print("Validation errors:", validation_errors)
            return None
            
        return parsed_data
    except Exception as e:
        print(f"Error parsing data: {str(e)}")
        return None

# Example usage:
if __name__ == "__main__":
    sample_output = '''
    {
        "개설일자": "2023-01-15",
        "주소": "제주시 노형동",
        "가맹점명_포함_텍스트": "제주돌담식당",
        "이용_건수_상위": 85,
        "총_매출_상위": 75,
        "건당_이용_금액_상위": 65,
        "여행_요일": "월,화,일",
        "성별_선호": "여",
        "선호_나이대": "30",
        "현지인맛집": true,
        "분류": "한식",
        "오름차순": "성별_선호"
    }
    '''
    
    result = process_restaurant_data(sample_output)
    if result:
        print("Successfully parsed restaurant data:", result.dict())

Error parsing data: 'dict' object has no attribute 'validate_fields'



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
%pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.1.tar.gz (63.9 MB)
     ---------------------------------------- 0.0/63.9 MB ? eta -:--:--
     --------------------------------------- 0.0/63.9 MB 435.7 kB/s eta 0:02:27
     --------------------------------------- 0.1/63.9 MB 980.4 kB/s eta 0:01:06
     ---------------------------------------- 0.5/63.9 MB 3.7 MB/s eta 0:00:18
      --------------------------------------- 1.1/63.9 MB 5.8 MB/s eta 0:00:11
      --------------------------------------- 1.5/63.9 MB 6.5 MB/s eta 0:00:10
     - -------------------------------------- 1.9/63.9 MB 6.9 MB/s eta 0:00:10
     - -------------------------------------- 2.4/63.9 MB 7.2 MB/s eta 0:00:09
     - -------------------------------------- 2.8/63.9 MB 7.5 MB/s eta 0:00:09
     -- ------------------------------------- 3.4/63.9 MB 8.0 MB/s eta 0:00:08
     -- ------------------------------------- 3.9/63.9 MB 8.4 MB/s eta 0:00:08
     -- ------------------------------------- 4.5/63.9

  error: subprocess-exited-with-error
  
  × Building wheel for llama-cpp-python (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [20 lines of output]
      [32m*** [1mscikit-build-core 0.10.7[0m using [34mCMake 3.30.5[39m[0m [31m(wheel)[0m
      [32m***[0m [1mConfiguring CMake...[0m
      loading initial cache file C:\Users\brian\AppData\Local\Temp\tmpnwresmb_\build\CMakeInit.txt
      -- Building for: NMake Makefiles
      CMake Error at CMakeLists.txt:3 (project):
        Running
      
         'nmake' '-?'
      
        failed with:
      
         no such file or directory
      
      
      CMake Error: CMAKE_C_COMPILER not set, after EnableLanguage
      CMake Error: CMAKE_CXX_COMPILER not set, after EnableLanguage
      -- Configuring incomplete, errors occurred!
      [31m
      [1m***[0m [31mCMake configuration failed[0m
      [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR:

In [9]:
import os
import pandas as pd
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import time

def download_model(model_name_or_path, model_basename, model_dir):
    """모델 다운로드 함수"""
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, model_basename)
    
    if not os.path.exists(model_path):
        print("모델 파일이 로컬에 없습니다. 다운로드를 시작합니다...")
        downloaded_path = hf_hub_download(
            repo_id=model_name_or_path,
            filename=model_basename,
            local_dir=model_dir
        )
        print(f"모델이 다운로드되었습니다: {downloaded_path}")
        return downloaded_path
    else:
        print(f"모델 파일이 이미 존재합니다: {model_path}")
        return model_path

def initialize_llm(model_path):
    """LLM 초기화 함수"""
    try:
        return Llama(
            model_path=model_path,
            n_threads=os.cpu_count(),  # 시스템의 CPU 코어 수에 맞게 자동 설정
            n_batch=512,
            n_gpu_layers=43,
            n_ctx=4096,
        )
    except Exception as e:
        print(f"LLM 초기화 중 오류 발생: {str(e)}")
        raise

def generate_description(row, llm, template):
    """개별 행에 대한 설명 생성 함수"""
    try:
        prompt = template.format(
            title=str(row['TITLE']),
            description=str(row['DESCRIPTION']),
            type=str(row['TYPE']),
            description_plus=str(row['description_plus'])
        )
        
        response = llm(
            prompt=prompt,
            max_tokens=256,
            temperature=0.7,
            top_p=0.95,
            top_k=50,
            stop=['Human:', '\n\n'],
            echo=False
        )
        return response['choices'][0]['text'].strip()
    except Exception as e:
        print(f"행 처리 중 오류 발생: {str(e)}")
        return f"오류 발생: {str(e)}"

def main():
    # 설정
    model_name_or_path = "heegyu/EEVE-Korean-Instruct-10.8B-v1.0-GGUF"
    model_basename = "ggml-model-Q4_K_M.gguf"
    model_dir = "C:/model"
    
    # 프롬프트 템플릿
    template = """다음은 그림에 대한 정보입니다:

제목: {title}
평론: {description}
유형: {type}
상세 설명: {description_plus}

위의 정보를 바탕으로, 그림에 대한 객관적이고 간결한 설명을 100단어 이내로 작성해주세요.
그림의 주요 소재, 스타일, 그리고 전반적인 분위기에 초점을 맞춰주세요.

객관적 설명:
"""

    try:
        # 모델 다운로드
        model_path = download_model(model_name_or_path, model_basename, model_dir)
        
        # LLM 초기화
        llm = initialize_llm(model_path)
        
        # Excel 파일 읽기
        print("Excel 파일을 읽는 중...")
        df = pd.read_excel('finalDAta.xlsx', nrows=3)
        
        # 설명 생성
        print("설명을 생성하는 중...")
        descriptions = []
        for idx, row in df.iterrows():
            print(f"행 {idx + 1} 처리 중...")
            description = generate_description(row, llm, template)
            descriptions.append(description)
            time.sleep(1)  # API 요청 간 간격
        
        df['description_final'] = descriptions
        
        # 결과 저장
        output_file = 'tokenized_semart_test_combined_with_final_description.xlsx'
        df.to_excel(output_file, index=False)
        print(f"\n처리가 완료되었습니다. 결과가 {output_file}에 저장되었습니다.")
        
        # 결과 출력
        print("\n처리된 데이터:")
        print(df[['DESCRIPTION', 'TYPE', 'description_plus', 'description_final']])
        
    except Exception as e:
        print(f"프로그램 실행 중 오류 발생: {str(e)}")

if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'llama_cpp'

In [10]:
%pip install llama-cpp-python

^C
Note: you may need to restart the kernel to use updated packages.


Collecting llama-cpp-python
  Using cached llama_cpp_python-0.3.1.tar.gz (63.9 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Using cached diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Using cached diskcache-5.6.3-py3-none-any.whl (45 kB)
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml): started
  Building wheel for llama-cpp-python (pyproject.toml): still running...
  Building wheel for llama-cpp-python (pyproject.toml): finished with status 'done'
  Created wheel for llama-cpp-py