In [23]:
!pip install peft
!pip install datasets

Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Collecting safetensors (from peft)
  Downloading safetensors-0.4.5-cp310-none-win_amd64.whl.metadata (3.9 kB)
Downloading peft-0.14.0-py3-none-any.whl (374 kB)
Downloading accelerate-1.2.1-py3-none-any.whl (336 kB)
Downloading safetensors-0.4.5-cp310-none-win_amd64.whl (285 kB)
Installing collected packages: safetensors, accelerate, peft
  Attempting uninstall: safetensors
    Found existing installation: safetensors 0.4.2
    Uninstalling safetensors-0.4.2:
      Successfully uninstalled safetensors-0.4.2
Successfully installed accelerate-1.2.1 peft-0.14.0 safetensors-0.4.5


  You can safely remove it manually.


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.1.0-cp310-cp310-win_amd64.whl.metadata (3.4 kB)
Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
Downloading pyarrow-18.1.0-cp310-cp310-win_amd64.whl (25.1 MB)
   ---------------------------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
conda-repo-cli 1.0.41 requires requests_mock, which is not installed.
tensorflow 2.10.0 requires libclang>=13.0.0, which is not installed.
tensorflow 2.10.0 requires tensorflow-io-gcs-filesystem>=0.23.1, which is not installed.
conda-repo-cli 1.0.41 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.41 requires nbformat==5.4.0, but you have nbformat 5.7.0 which is incompatible.
conda-repo-cli 1.0.41 requires requests==2.28.1, but you have requests 2.32.3 which is incompatible.
tensorboard 2.10.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.3 which is incompatible.
tensorflow 2.10.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.3 which is incompatible.


In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
from tqdm import tqdm
import os
from collections import Counter
from peft import PeftModel, PeftConfig

class SentimentAnalyzer:
    def __init__(self, base_model_path="nlptown/bert-base-multilingual-uncased-sentiment", 
                 lora_model_path=None):
        # GPU 사용 여부 확인
        device = "cuda" if torch.cuda.is_available() else "cpu"
        
        # Load base model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_path)
        base_model = AutoModelForSequenceClassification.from_pretrained(base_model_path)
        
        if lora_model_path:
            # Load and merge LoRA weights
            config = PeftConfig.from_pretrained(lora_model_path)
            peft_model = PeftModel.from_pretrained(base_model, lora_model_path)
            # Merge LoRA weights with base model
            merged_model = peft_model.merge_and_unload()
            
            self.classifier = pipeline(
                "sentiment-analysis",
                model=merged_model,
                tokenizer=self.tokenizer,
                top_k=None,
                device=device
            )
        else:
            self.classifier = pipeline(
                "sentiment-analysis",
                model=base_model,
                tokenizer=self.tokenizer,
                top_k=None,
                device=device
            )

    def get_sentiments(self, text):
        """연속형과 이산형 점수 모두 반환"""
        scores = self.classifier(text)[0]

        # 연속형 점수 계산 (가중 평균)
        weighted_score = sum(float(score['label'][0]) * score['score'] for score in scores)
        continuous_score = round(weighted_score, 3)

        # 이산형 점수 계산 (가장 높은 확률의 별점)
        discrete_label = max(scores, key=lambda x: x['score'])['label']
        discrete_score = int(discrete_label[0])
        discrete_confidence = round(max(scores, key=lambda x: x['score'])['score'], 3)

        return continuous_score, discrete_score, discrete_confidence

    def analyze_long_text(self, text, max_tokens=450):
        # 원본 텍스트의 토큰 수 확인
        original_tokens = self.tokenizer.encode(text)
        was_split = len(original_tokens) > max_tokens

        if not was_split:
            # 토큰 수가 max_tokens 이하면 그대로 처리
            continuous_score, discrete_score, discrete_conf = self.get_sentiments(text)
            return continuous_score, discrete_score, discrete_conf, was_split

        # 토큰을 청크로 나누기
        chunks = []
        for i in range(0, len(original_tokens), max_tokens):
            chunk_tokens = original_tokens[i:i+max_tokens]
            chunk_text = self.tokenizer.decode(chunk_tokens)
            chunks.append(chunk_text)

        # 각 청크 분석
        continuous_scores = []
        discrete_scores = []
        confidence_scores = []

        for chunk in chunks:
            try:
                cont_score, disc_score, conf = self.get_sentiments(chunk)
                continuous_scores.append(cont_score)
                discrete_scores.append(disc_score)
                confidence_scores.append(conf)
            except Exception as e:
                print(f"Error processing chunk: {str(e)}")
                continue

        if not continuous_scores:  # 모든 청크가 실패한 경우
            raise Exception("Failed to process all chunks")

        # 평균 계산
        avg_continuous = round(sum(continuous_scores) / len(continuous_scores), 3)
        # 가장 빈번한 별점
        most_common_rating = Counter(discrete_scores).most_common(1)[0][0]
        # 평균 confidence
        avg_confidence = round(sum(confidence_scores) / len(confidence_scores), 3)

        return avg_continuous, most_common_rating, avg_confidence, was_split

    def read_file(self, file_path):
        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.csv':
            encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
            for encoding in encodings:
                try:
                    return pd.read_csv(file_path, encoding=encoding)
                except UnicodeDecodeError:
                    continue
                except Exception as e:
                    print(f"Error with {encoding} encoding: {str(e)}")
                    continue
            raise ValueError(f"Could not read file with any of the encodings: {encodings}")
        elif file_extension in ['.xlsx', '.xls']:
            return pd.read_excel(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

    def save_file(self, df, file_path):
        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.csv':
            df.to_csv(file_path, index=False)
        elif file_extension in ['.xlsx', '.xls']:
            df.to_excel(file_path, index=False)

    def process_file(self, file_path, text_column, save_path=None, batch_size=1000):
        df = self.read_file(file_path)
        total_rows = len(df)
        num_batches = (total_rows + batch_size - 1) // batch_size

        # 결과 컬럼만 생성
        df['sentiment_score_continuous'] = None
        df['sentiment_score_discrete'] = None

        print(f"\nCUDA 사용 가능: {torch.cuda.is_available()}")
        print(f"현재 장치: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")
        print(f"총 {total_rows}개 데이터 처리 시작\n")

        for i in tqdm(range(0, total_rows, batch_size), desc="Processing"):
            batch_end = min(i + batch_size, total_rows)

            for idx in range(i, batch_end):
                try:
                    text = str(df.loc[idx, text_column])
                    if pd.isna(text) or text.strip() == '':
                        df.loc[idx, 'sentiment_score_continuous'] = None
                        df.loc[idx, 'sentiment_score_discrete'] = None
                    else:
                        cont_score, disc_score, _, _ = self.analyze_long_text(text)
                        df.loc[idx, 'sentiment_score_continuous'] = cont_score
                        df.loc[idx, 'sentiment_score_discrete'] = disc_score

                except Exception as e:
                    print(f"\nError processing row {idx}: {str(e)}")
                    df.loc[idx, 'sentiment_score_continuous'] = None
                    df.loc[idx, 'sentiment_score_discrete'] = None

            save_path = save_path or file_path
            self.save_file(df, save_path)
            print(f"\n배치 {i // batch_size + 1}/{num_batches} 처리 완료")

        print("\n전체 처리 완료")
        print("\n별점 분포:")
        print(df['sentiment_score_discrete'].value_counts().sort_index())

        return df


if __name__ == "__main__":
    # LoRA 모델을 사용하는 경우
    analyzer = SentimentAnalyzer(
        base_model_path="nlptown/bert-base-multilingual-uncased-sentiment",
        lora_model_path=r"C:\Users\Administrator\Desktop\PADA_LAB\sentimnet\model\hotel_fine_tuned_model"  # LoRA 모델 경로 지정
    )

    df = analyzer.process_file(
    file_path=r'C:\Users\Administrator\Desktop\PADA_LAB\calculated\sampled_hotel_1000_calculated_ctm.csv',
    text_column='Review_Text',
    save_path=r'C:\Users\Administrator\Desktop\PADA_LAB\calculated\sampled_hotel_BERT_results.csv',
    batch_size=1500
)

    # 리소스 해제
    del analyzer.classifier
    del analyzer


CUDA 사용 가능: False
현재 장치: cpu
총 973개 데이터 처리 시작



Processing:   0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
!pip install peft





In [32]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from peft import PeftModel

# 이미 파인튜닝된 LoRA 모델 로드
model_path = r"C:\Users\Administrator\Desktop\PADA_LAB\sentimnet\model\hotel_fine_tuned_model"  # 파인튜닝된 모델 경로
tokenizer = BertTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

# 베이스 모델 로드
base_model = BertForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

# LoRA 모델 결합
model = PeftModel.from_pretrained(base_model, model_path).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# 감정 분석 함수
def analyze_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

    return "Positive" if predictions.item() == 1 else "Negative"

# 예시 텍스트 감정 분석
text = "I love this movie! It's amazing."
result = analyze_sentiment(text)
print(f"Sentiment: {result}")


ImportError: cannot import name 'GatedRepoError' from 'huggingface_hub.errors' (c:\Users\Administrator\anaconda3\lib\site-packages\huggingface_hub\errors.py)

In [19]:
!pip install transformers==4.45.2 adapter-transformers==4.0.0




In [20]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = r"C:\Users\Administrator\Desktop\PADA_LAB\sentimnet\model\hotel_fine_tuned_model"

# Tokenizer 로드
tokenizer = AutoTokenizer.from_pretrained(model_path)

# 모델 로드 (어댑터 없이 일반 모델 사용)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

model.eval()

print("Tokenizer와 모델이 성공적으로 로드되었습니다.")


OSError: C:\Users\Administrator\Desktop\PADA_LAB\sentimnet\model\hotel_fine_tuned_model does not appear to have a file named config.json. Checkout 'https://huggingface.co/C:\Users\Administrator\Desktop\PADA_LAB\sentimnet\model\hotel_fine_tuned_model/tree/None' for available files.