In [None]:
import pandas as pd
import numpy as np
import random

# Load the raw review datasets (coursera.csv is read as cp949, the others as UTF-8).
df_amazon = pd.read_csv(r'C:\Users\Administrator\Desktop\PADA_LAB\raw\amazon.csv', encoding='utf-8')
df_audible = pd.read_csv(r'C:\Users\Administrator\Desktop\PADA_LAB\raw\audible.csv', encoding='utf-8')
df_coursera = pd.read_csv(r'C:\Users\Administrator\Desktop\PADA_LAB\raw\coursera.csv', encoding='cp949')
df_hotel = pd.read_csv(r'C:\Users\Administrator\Desktop\PADA_LAB\raw\hotel.csv', encoding='utf-8')

sampled_dfs = []
n = 100  # rows drawn per (dataset, rating level) pair
# Fixed seed so that re-running this cell regenerates exactly the same sample
# that was written to disk; without it every run produces a different file.
sample_seed = 42

# DataFrame.sample raises ValueError if a rating level has fewer than n rows.
for i in range(1, 6):
    # Amazon
    df_1 = df_amazon[df_amazon['Rating'] == i].sample(n, random_state=sample_seed)[['Rating', 'Review_Text']]

    # Audible (TODO: verify the column names)
    df_2 = df_audible[df_audible['Rating'] == i].sample(n, random_state=sample_seed)[['Rating', 'Review_Text']]

    # Coursera (TODO: verify the column names)
    df_3 = df_coursera[df_coursera['Rating'] == i].sample(n, random_state=sample_seed)[['Rating', 'Review_Text']]

    # Hotel: level i selects ratings in the half-open interval (2*(i-1), 2*i].
    # The selected rows keep their original Rating values.
    # NOTE(review): this presumably maps a 10-point hotel scale onto the five
    # levels used for the other sources -- confirm, and confirm whether the
    # stored Rating should be rescaled to match them.
    df_4 = df_hotel[(df_hotel['Rating'] <= 2*i) & (df_hotel['Rating'] > 2*(i-1))].sample(n, random_state=sample_seed)[['Rating', 'Review_Text']]

    sampled_dfs.extend([df_1, df_2, df_3, df_4])

# Combine all samples into one frame with a fresh 0..N-1 index, then save to CSV.
df_sample = pd.concat(sampled_dfs).reset_index(drop=True)
df_sample.to_csv(r'C:\Users\Administrator\Desktop\PADA_LAB\sentimnet\2000_sample.csv', index=False, encoding='utf-8')

In [24]:
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict

class ImprovedSentimentAnalyzer:
    """Match free text against a sentiment lexicon using sentence embeddings.

    The lexicon is a CSV read with pandas; this class uses its ``word``,
    ``valence`` and ``arousal`` columns.
    """

    def __init__(self, lexicon_embeddings_path: str = 'lexicon_embeddings.csv'):
        """Load the embedding model and embed every lexicon word.

        Args:
            lexicon_embeddings_path: Path of the lexicon CSV file.
        """
        # Load the Sentence Transformer model.
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

        # Load the lexicon data.
        self.lexicon_data = pd.read_csv(lexicon_embeddings_path)
        # Embed the lexicon words once up front so that each query only has
        # to embed its own text.
        self.word_embeddings = self.model.encode(self.lexicon_data['word'].tolist())

    def find_similar_words(self, text: str, top_k: int = 5) -> List[Dict]:
        """Find the lexicon words most similar to the embedding of ``text``.

        Args:
            text: The sentence or review to analyse.
            top_k: Maximum number of lexicon words to return.

        Returns:
            A list of at most ``top_k`` dicts ordered from most to least
            similar, each with the keys ``word``, ``similarity``,
            ``valence`` and ``arousal``. Empty when ``top_k`` is not
            positive.
        """
        # Guard first: with top_k == 0 the slice below becomes [-0:], which
        # selects *every* word instead of none, and a negative top_k selects
        # an arbitrary tail of the ranking.
        if top_k <= 0:
            return []

        # Embed the input sentence.
        sentence_embedding = self.model.encode(text)

        # Cosine similarity between the sentence and every lexicon word.
        similarities = cosine_similarity([sentence_embedding], self.word_embeddings)[0]
        # argsort is ascending: take the last top_k indices and reverse them
        # so the most similar word comes first.
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        similar_words = []
        for idx in top_indices:
            entry = self.lexicon_data.iloc[idx]
            similar_words.append({
                'word': entry['word'],
                'similarity': similarities[idx],
                'valence': entry['valence'],
                'arousal': entry['arousal']
            })

        return similar_words

def main():
    """Find the five closest lexicon words for each sampled review and save them.

    Reads the module-level ``df_sample`` DataFrame (its ``Review_Text`` and
    ``Rating`` columns), runs every review through
    ``ImprovedSentimentAnalyzer.find_similar_words`` and writes one row per
    review to a CSV file, then prints the first rows of the result.
    """
    # Initialise the analyzer.
    analyzer = ImprovedSentimentAnalyzer(lexicon_embeddings_path=r"C:\Users\Administrator\Desktop\lexicon_embeddings_ver2.csv")

    # Review texts and their ratings, taken in the same row order.
    test_texts = df_sample['Review_Text'].tolist()
    ratings = df_sample['Rating'].tolist()
    total = len(test_texts)

    # One result dict per review.
    results = []

    # Analyse each text; text_id is 1-based.
    for i, (text, rating) in enumerate(zip(test_texts, ratings), 1):
        print(f"\nProcessing text {i}/{total}")
        similar_words = analyzer.find_similar_words(text, top_k=5)

        result = {
            'text_id': i,
            'original_text': text,
            'rating': rating
        }

        # Flatten the top words into numbered columns (word_1, similarity_1, ...).
        for j, word in enumerate(similar_words, 1):
            result[f'word_{j}'] = word['word']
            result[f'similarity_{j}'] = word['similarity']
            result[f'valence_{j}'] = word['valence']
            result[f'arousal_{j}'] = word['arousal']

        results.append(result)

    # Convert the results to a DataFrame.
    df_results = pd.DataFrame(results)

    # Save to CSV. cp949 cannot represent every Unicode character (e.g. emoji),
    # and with the default strict error handling a single such character would
    # raise UnicodeEncodeError here, after all texts have been processed.
    # errors='replace' keeps the cp949 output and substitutes those characters.
    output_path = r'C:\Users\Administrator\Desktop\PADA_LAB\sentimnet\improved_sentiment_analysis_results.csv'
    df_results.to_csv(output_path, index=False, encoding='cp949', errors='replace')
    print(f"\nResults saved to: {output_path}")

    # Preview the results.
    print("\nFirst few rows of the results:")
    print(df_results.head())

# Run the analysis only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


Processing text 1/40

Processing text 2/40

Processing text 3/40

Processing text 4/40

Processing text 5/40

Processing text 6/40

Processing text 7/40

Processing text 8/40

Processing text 9/40

Processing text 10/40

Processing text 11/40

Processing text 12/40

Processing text 13/40

Processing text 14/40

Processing text 15/40

Processing text 16/40

Processing text 17/40

Processing text 18/40

Processing text 19/40

Processing text 20/40

Processing text 21/40

Processing text 22/40

Processing text 23/40

Processing text 24/40

Processing text 25/40

Processing text 26/40

Processing text 27/40

Processing text 28/40

Processing text 29/40

Processing text 30/40

Processing text 31/40

Processing text 32/40

Processing text 33/40

Processing text 34/40

Processing text 35/40

Processing text 36/40

Processing text 37/40

Processing text 38/40

Processing text 39/40

Processing text 40/40

Results saved to: C:\Users\Administrator\Desktop\PADA_LAB\sentimnet\improved_sentiment_a