In [4]:
!pip install peft
!pip install datasets



In [None]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from tqdm import tqdm
import os
from collections import Counter
from peft import PeftModel, PeftConfig
import seaborn as sns
import matplotlib.pyplot as plt


class SentimentAnalyzer:
    """Score review text on a 1-5 star scale with a Hugging Face classifier.

    The analyzer wraps a ``sentiment-analysis`` pipeline whose labels start
    with the star digit (the code reads ``label[0]`` as the rating). It can
    score a single text, split texts that exceed a token budget into
    sentence-based chunks, and annotate a whole CSV/Excel file.
    """

    # File extensions that save_file knows how to write.
    _SUPPORTED_SAVE_EXTENSIONS = ('.csv', '.xlsx', '.xls')

    # Collapse a 1-10 rating onto five categories; anything not listed maps to 5.
    _TEN_TO_FIVE = {1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 3, 7: 4, 8: 4}

    def __init__(self, base_model_path="nlptown/bert-base-multilingual-uncased-sentiment",
                 lora_model_path=None):
        """Load the tokenizer and model and build the classification pipeline.

        Args:
            base_model_path: Model name or path passed to ``from_pretrained``.
            lora_model_path: Optional path to LoRA adapter weights. When given,
                the adapter is loaded onto the base model and merged into it.
        """
        # Use the GPU when one is available.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        self.tokenizer = AutoTokenizer.from_pretrained(base_model_path)
        model = AutoModelForSequenceClassification.from_pretrained(base_model_path)

        if lora_model_path:
            # Fold the LoRA weights into the base model so the pipeline sees a
            # plain sequence-classification model.
            model = PeftModel.from_pretrained(model, lora_model_path).merge_and_unload()

        # top_k=None makes the pipeline return a score for every label.
        self.classifier = pipeline(
            "sentiment-analysis",
            model=model,
            tokenizer=self.tokenizer,
            top_k=None,
            device=device,
        )

    def get_sentiments(self, text):
        """Return both a continuous and a discrete star score for ``text``.

        Returns:
            A tuple ``(continuous_score, discrete_score, discrete_confidence)``:
            the probability-weighted mean star value (rounded to 3 places), the
            star value of the most probable label, and that label's probability
            (rounded to 3 places).
        """
        scores = self.classifier(text)[0]

        # Continuous score: expected star value under the predicted distribution.
        weighted_score = sum(float(entry['label'][0]) * entry['score'] for entry in scores)

        # Discrete score: the single most probable label.
        best = max(scores, key=lambda entry: entry['score'])

        return round(weighted_score, 3), int(best['label'][0]), round(best['score'], 3)

    def _split_into_chunks(self, text, max_tokens):
        """Group the sentences of ``text`` into chunks of at most ``max_tokens`` tokens.

        Sentences are delimited by '.', '?' and '!'. Empty fragments (for
        example after a trailing period or inside "...") are dropped. A single
        sentence longer than ``max_tokens`` is kept as its own chunk.
        """
        chunks = []
        current_chunk = []
        current_length = 0

        for fragment in text.replace('?', '.').replace('!', '.').split('.'):
            sentence = fragment.strip()
            if not sentence:
                continue
            sentence += '.'
            sentence_length = len(self.tokenizer.encode(sentence))

            # Close the running chunk before it would overflow the budget.
            if current_chunk and current_length + sentence_length > max_tokens:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0

            current_chunk.append(sentence)
            current_length += sentence_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def analyze_long_text(self, text, max_tokens=450):
        """Score ``text``, splitting it into chunks when it is too long.

        Texts whose token count is within ``max_tokens`` are scored directly.
        Longer texts are split on sentence boundaries, each chunk is scored,
        and the chunk results are combined using weights proportional to the
        chunk token counts. The weights are normalised over the chunks that
        were actually scored, so the combined score stays a true average even
        when a chunk fails.

        Returns:
            ``(continuous_score, discrete_score, confidence, was_split)``.

        Raises:
            RuntimeError: If the text had to be split and no chunk could be scored.
        """
        original_tokens = self.tokenizer.encode(text)
        was_split = len(original_tokens) > max_tokens

        if not was_split:
            continuous_score, discrete_score, discrete_conf = self.get_sentiments(text)
            return continuous_score, discrete_score, discrete_conf, was_split

        # One (continuous, discrete, confidence, weight) entry per scored chunk.
        scored = []
        for chunk in self._split_into_chunks(text, max_tokens):
            try:
                weight = len(self.tokenizer.encode(chunk))
                cont_score, disc_score, conf = self.get_sentiments(chunk)
            except Exception as e:
                # Best effort: a failing chunk is reported and left out.
                print(f"청크 처리 중 오류 발생: {str(e)}")
                continue
            scored.append((cont_score, disc_score, conf, weight))

        total_weight = sum(entry[3] for entry in scored)
        if not scored or total_weight <= 0:
            raise RuntimeError("모든 청크 처리 실패")

        avg_continuous = round(
            sum(cont * weight for cont, _, _, weight in scored) / total_weight, 3
        )

        # Weighted vote for the discrete rating. Summing the weights directly
        # means even very small chunks still count.
        votes = Counter()
        for _, disc, _, weight in scored:
            votes[disc] += weight
        most_common_rating = votes.most_common(1)[0][0]

        avg_confidence = round(
            sum(conf * weight for _, _, conf, weight in scored) / total_weight, 3
        )

        return avg_continuous, most_common_rating, avg_confidence, was_split

    def read_file(self, file_path):
        """Load a CSV or Excel file into a DataFrame.

        CSV files are tried with several encodings in turn; the first one that
        loads wins.

        Raises:
            ValueError: If the extension is unsupported, or no encoding worked.
        """
        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.csv':
            encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
            for encoding in encodings:
                try:
                    return pd.read_csv(file_path, encoding=encoding)
                except UnicodeDecodeError:
                    continue
                except Exception as e:
                    print(f"Error with {encoding} encoding: {str(e)}")
                    continue
            raise ValueError(f"Could not read file with any of the encodings: {encodings}")
        elif file_extension in ['.xlsx', '.xls']:
            return pd.read_excel(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

    def save_file(self, df, file_path = None):
        """Write ``df`` to ``file_path`` as CSV or Excel, chosen by extension.

        Raises:
            ValueError: If ``file_path`` is missing or has an unsupported
                extension (instead of silently writing nothing).
        """
        if file_path is None:
            raise ValueError("file_path is required to save the results")

        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.csv':
            df.to_csv(file_path, index=False)
        elif file_extension in ['.xlsx', '.xls']:
            df.to_excel(file_path, index=False)
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

    def _score_text(self, value):
        """Return ``(continuous, discrete)`` for one cell, or ``(None, None)``.

        Missing values and blank strings are not scored. The missing-value
        check runs before the string conversion, so NaN is not scored as the
        literal text "nan".
        """
        if pd.isna(value):
            return None, None
        text = str(value)
        if text.strip() == '':
            return None, None
        cont_score, disc_score, _, _ = self.analyze_long_text(text)
        return cont_score, disc_score

    def process_file(self, file_path, text_column, save_path=None, batch_size=1000):
        """Score every row of a file and save the annotated table.

        Adds ``sentiment_score_continuous`` and ``sentiment_score_discrete``
        columns. The table is written to ``save_path`` after every batch so an
        interrupted run keeps its progress. When ``save_path`` is omitted the
        input file is overwritten.

        If a ``Rating`` column exists with more than five distinct values it is
        collapsed to five categories (1-2 -> 1, 3-4 -> 2, 5-6 -> 3, 7-8 -> 4,
        everything else -> 5).

        Args:
            file_path: Input CSV/Excel path.
            text_column: Name of the column holding the text to score.
            save_path: Output path; defaults to ``file_path``.
            batch_size: Number of rows between checkpoint saves.

        Returns:
            The annotated DataFrame.

        Raises:
            ValueError: For a non-positive ``batch_size`` or an unsupported
                output extension.
            KeyError: If ``text_column`` is not in the file.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        # Validate the output target up front rather than after the first batch.
        save_path = save_path or file_path
        save_extension = os.path.splitext(save_path)[1].lower()
        if save_extension not in self._SUPPORTED_SAVE_EXTENSIONS:
            raise ValueError(f"Unsupported file format: {save_extension}")

        df = self.read_file(file_path)
        if text_column not in df.columns:
            raise KeyError(f"Column not found in input file: {text_column}")

        if "Rating" in df.columns and df["Rating"].nunique() > 5:
            # Collapse the wider rating scale onto five categories.
            df['Rating'] = df['Rating'].apply(
                lambda rating: self._TEN_TO_FIVE.get(rating, 5)
            )

        total_rows = len(df)
        num_batches = (total_rows + batch_size - 1) // batch_size

        # Result columns start empty; rows that cannot be scored stay None.
        df['sentiment_score_continuous'] = None
        df['sentiment_score_discrete'] = None

        print(f"\nCUDA 사용 가능: {torch.cuda.is_available()}")
        print(f"현재 장치: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")
        print(f"총 {total_rows}개 데이터 처리 시작\n")

        for start in tqdm(range(0, total_rows, batch_size), desc="Processing"):
            stop = min(start + batch_size, total_rows)

            for idx in range(start, stop):
                try:
                    cont_score, disc_score = self._score_text(df.loc[idx, text_column])
                except Exception as e:
                    # Per-row boundary: one bad row must not abort the whole run.
                    print(f"\nError processing row {idx}: {str(e)}")
                    cont_score, disc_score = None, None
                df.loc[idx, 'sentiment_score_continuous'] = cont_score
                df.loc[idx, 'sentiment_score_discrete'] = disc_score

            # Checkpoint after each batch.
            self.save_file(df, save_path)
            print(f"\n배치 {start // batch_size + 1}/{num_batches} 처리 완료")

        print("\n전체 처리 완료")
        print("\n별점 분포:")
        print(df['sentiment_score_discrete'].value_counts().sort_index())

        return df

    def EDA_data(self, data):
        """Plot the original ratings next to the model's predicted scores.

        The left panel is a histogram of ``Rating``. The right panel is a
        histogram of ``sentiment_score_discrete`` with a density curve of
        ``sentiment_score_continuous`` on a secondary axis. Rows without a
        score (None/NaN) are left out of the plots.
        """
        ratings = pd.to_numeric(data["Rating"], errors="coerce").dropna()
        discrete = pd.to_numeric(data["sentiment_score_discrete"], errors="coerce").dropna()
        continuous = pd.to_numeric(data["sentiment_score_continuous"], errors="coerce").dropna()

        fig, (ax_rating, ax_pred) = plt.subplots(1, 2, figsize=(12, 6))

        ax_rating.set_title("original data distribution")
        ax_rating.hist(ratings, color="blue", bins=5)

        ax_pred.set_title("bert sentiment distribution")
        # Predicted stars are integers 1-5; centre one bin on each value.
        ax_pred.hist(discrete, color='green', alpha=0.6,
                     bins=[0.5, 1.5, 2.5, 3.5, 4.5, 5.5])

        ax_density = ax_pred.twinx()
        # A density estimate needs at least two distinct values.
        if continuous.nunique() > 1:
            sns.kdeplot(data=continuous, color='red', ax=ax_density)

        fig.tight_layout()
        plt.show()
        
        


if __name__ == "__main__":
    # Build the analyzer on the stock checkpoint only: lora_model_path=None
    # means no LoRA adapter is loaded or merged.
    analyzer = SentimentAnalyzer(
        lora_model_path=None,
        base_model_path="nlptown/bert-base-multilingual-uncased-sentiment",
    )

    # Score the Review_Text column of the hotel-review file and write the
    # annotated table to a separate CSV, checkpointing every 1500 rows.
    df = analyzer.process_file(
        r'C:\Users\Administrator\Desktop\PADA_LAB\calculated\final\hotel_reviews_with_topics_and_depth_breadth.csv',
        'Review_Text',
        batch_size=1500,
        save_path=r'C:\Users\Administrator\Desktop\PADA_LAB\calculated\sampled_hotel_BERT_test.csv',
    )

    # Compare the original ratings with the predicted scores.
    analyzer.EDA_data(data=df)

    # Release resources: drop the pipeline first, then the analyzer itself.
    del analyzer.classifier
    del analyzer

  from pandas.core import (



CUDA 사용 가능: False
현재 장치: cpu
총 105034개 데이터 처리 시작



Processing:   0%|          | 0/71 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (765 > 512). Running this sequence through the model will result in indexing errors
Processing:   1%|▏         | 1/71 [01:31<1:46:11, 91.02s/it]


배치 1/71 처리 완료


Processing:   1%|▏         | 1/71 [03:15<3:47:33, 195.05s/it]


KeyboardInterrupt: 

In [17]:
# Rank reviews by how far the model's continuous score is from the user's
# rating, and export the ranking for manual inspection.
columns = ['Rating', 'Review_Text', 'sentiment_score_continuous', 'sentiment_score_discrete']
df_select = df[columns].copy()

# The score column is created as object dtype and holds None for unscored rows;
# coerce it to numeric so the deviation is a float column with NaN for those rows.
df_select["deviation"] = df_select["Rating"] - pd.to_numeric(
    df_select["sentiment_score_continuous"], errors="coerce"
)
df_select['abs_deviation'] = df_select['deviation'].abs()
df_select = df_select.sort_values('abs_deviation', ascending=False)

# The original literal ended with a stray space after ".csv"; it is removed so
# the file name is the intended one on every platform.
df_select.to_csv(r'C:\Users\Administrator\Desktop\PADA_LAB\sentimnet\Rating_sample.csv', index=False)

# Kept as the last expression so a notebook actually displays the preview.
df_select.head(20)

### 원본 별점 분포

별점 분포:
sentiment_score_discrete
1     31
2     37
3    111
4    448
5    346
Name: count, dtype: int64

### 기본 모델 별점 분포

별점 분포:
sentiment_score_discrete
1     41
2    105
3    172
4    401
5    254
Name: count, dtype: int64

### LoRA 파인튜닝 모델

별점 분포:
sentiment_score_discrete
1     64
2      2
3     19
4    637
5    251
Name: count, dtype: int64
