In [4]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
import csv

class LexiconProcessor:
    def __init__(self, model_name: str = 'bert-base-uncased'):
        # BERT 모델 초기화
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name).to(self.device)
        self.model.eval()
        
    def get_embedding(self, word: str) -> np.ndarray:
        """단어의 BERT 임베딩을 계산"""
        inputs = self.tokenizer(
            word,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512
        ).to(self.device)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
            embedding = outputs.last_hidden_state[:, 0, :]
        
        return embedding.cpu().numpy()[0]

    def process_lexicon(self, vad_lexicon_path: str, output_file: str):
        """VAD 렉시콘의 단어들을 처리하여 임베딩과 VA 값을 저장"""
        # VAD 렉시콘 읽기
        word_data = {}
        with open(vad_lexicon_path, 'r', encoding='utf-8') as file:
            for line in file:
                parts = line.strip().split()
                if len(parts) == 4:
                    word = parts[0].lower()
                    valence, arousal = float(parts[1]), float(parts[2])
                    word_data[word] = (valence, arousal)
        
        # 각 단어에 대한 임베딩 계산 및 저장
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['word', 'valence', 'arousal', 'embedding'])
            
            for word, (valence, arousal) in word_data.items():
                try:
                    embedding = self.get_embedding(word)
                    writer.writerow([
                        word,
                        valence,
                        arousal,
                        embedding.tolist()
                    ])
                    print(f"Processed: {word}")
                except Exception as e:
                    print(f"Error processing word '{word}': {e}")

def main():
    # 설정
    vad_lexicon_path = "Bipolar_Lexicon.txt"
    output_file = "lexicon_embeddings.csv"
    
    # 렉시콘 처리
    processor = LexiconProcessor()
    processor.process_lexicon(vad_lexicon_path, output_file)
    print("Processing completed!")

if __name__ == "__main__":
    main()

Processed: aaaaaaah
Processed: aaaah
Processed: aardvark
Processed: aback
Processed: abacus
Processed: abalone
Processed: abandon
Processed: abandoned
Processed: abandonment
Processed: abashed
Processed: abate
Processed: abatement
Processed: abba
Processed: abbey
Processed: abbot
Processed: abbreviate
Processed: abbreviation
Processed: abdomen
Processed: abdominal
Processed: abduct
Processed: abduction
Processed: aberrant
Processed: aberration
Processed: abeyance
Processed: abhor
Processed: abhorrence
Processed: abhorrent
Processed: abide
Processed: abiding
Processed: ability
Processed: abject
Processed: ablation
Processed: ablaze
Processed: able
Processed: abnormal
Processed: abnormality
Processed: aboard
Processed: abode
Processed: abolish
Processed: abolition
Processed: abominable
Processed: abominate
Processed: abomination
Processed: aboriginal
Processed: abort
Processed: abortion
Processed: abortive
Processed: abound
Processed: abovementioned
Processed: abracadabra
Processed: abra

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
import pandas as pd

class va_scorer:
    def __init__(self, vad_lexicon_path):
        self.vad_dict = {}
        self._load_lexicon(vad_lexicon_path)

    def _load_lexicon(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                parts = line.strip().split()
                if len(parts) == 4:
                    word = parts[0]
                    scores = [float(score) for score in parts[1:]]
                    self.vad_dict[word] = {
                        'valence': scores[0],
                        'arousal': scores[1],
                        'dominance': scores[2]
                    }

    def get_vad_scores(self, word):
        return self.vad_dict.get(word.lower())

def analyze_emotion_labels():
    # 감정 라벨 리스트
    emotion_labels = [
        'disappointment', 'annoyance', 'sadness', 'disapproval', 'love',
        'anger', 'realization', 'neutral', 'approval', 'disgust',
        'joy', 'admiration', 'embarrassment', 'amusement', 'confusion',
        'nervousness', 'remorse', 'optimism', 'desire', 'relief',
        'caring', 'excitement', 'grief', 'surprise', 'curiosity',
        'pride', 'fear', 'gratitude'
    ]

    # VA Scorer 초기화
    scorer = va_scorer('Bipolar_Lexicon.txt')

    # 결과를 저장할 리스트
    results = []

    # 각 라벨에 대한 VA 값 찾기
    for label in emotion_labels:
        scores = scorer.get_vad_scores(label)
        if scores:
            results.append({
                'emotion': label,
                'valence': scores['valence'],
                'arousal': scores['arousal']
            })
        else:
            # 레이블이 사전에 없는 경우 NaN으로 처리
            results.append({
                'emotion': label,
                'valence': float('nan'),
                'arousal': float('nan')
            })

    # DataFrame 생성
    df = pd.DataFrame(results)
    
    # NaN 값이 있는 행 확인
    missing_labels = df[df.isna().any(axis=1)]['emotion'].tolist()
    if missing_labels:
        print("Following labels were not found in the lexicon:", missing_labels)

    # 결과 저장
    df.to_csv('emotion_va_scores.csv', index=False)
    
    return df

# 실행
df = analyze_emotion_labels()
print("\nEmotion VA Scores DataFrame:")
df.to_csv('emotion_va_scores.csv', index=False)


Emotion VA Scores DataFrame:


In [3]:
import pandas as pd

def calculate_weighted_va(emotion_scores_list, va_scores_file):
    """
    감정 점수와 VA 점수를 이용하여 가중 평균 VA 값을 계산합니다.
    
    Args:
        emotion_scores_list (list): 감정 분석 결과 리스트
        va_scores_file (str): VA 점수가 저장된 CSV 파일 경로
    
    Returns:
        tuple: (최종 valence 값, 최종 arousal 값)
    """
    # VA 점수 읽기
    va_df = pd.read_csv(va_scores_file)
    
    # 가중 평균 계산을 위한 변수 초기화
    weighted_valence = 0
    weighted_arousal = 0
    total_weight = 0
    
    # 각 감정에 대해 가중 평균 계산
    for emotion in emotion_scores_list:
        label = emotion['label']
        score = emotion['score']
        
        va_row = va_df[va_df['emotion'] == label]
        if not va_row.empty and not va_row['valence'].isna().any() and not va_row['arousal'].isna().any():
            valence = va_row['valence'].iloc[0]
            arousal = va_row['arousal'].iloc[0]
            
            weighted_valence += valence * score
            weighted_arousal += arousal * score
            total_weight += score
    
    # 최종 VA 값 계산
    if total_weight > 0:
        final_valence = weighted_valence / total_weight
        final_arousal = weighted_arousal / total_weight
        return final_valence, final_arousal
    else:
        return None, None

# 감정 점수 데이터
emotion_scores = [
    {'label': 'disappointment', 'score': 0.6073453426361084},
    {'label': 'annoyance', 'score': 0.29090917110443115},
    {'label': 'sadness', 'score': 0.11545353382825851},
    {'label': 'disapproval', 'score': 0.09492042660713196},
    {'label': 'love', 'score': 0.0554782971739769},
    {'label': 'anger', 'score': 0.032416511327028275},
    {'label': 'realization', 'score': 0.026335157454013824},
    {'label': 'neutral', 'score': 0.026163052767515182},
    {'label': 'approval', 'score': 0.025927066802978516},
    {'label': 'disgust', 'score': 0.01680212840437889},
    {'label': 'joy', 'score': 0.014748722314834595},
    {'label': 'admiration', 'score': 0.01159504521638155}
]

# 계산 실행
valence, arousal = calculate_weighted_va(emotion_scores, 'emotion_va_scores.csv')
print(f"Weighted VA scores for the sentence:")
print(f"Valence: {valence:.3f}")
print(f"Arousal: {arousal:.3f}")

Weighted VA scores for the sentence:
Valence: -0.592
Arousal: 0.079


In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np
import csv

class LexiconProcessor:
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        # Sentence Transformer 모델 초기화
        self.model = SentenceTransformer(model_name)
    
    def get_embedding(self, word: str) -> np.ndarray:
        """단어의 Sentence Transformer 임베딩을 계산"""
        embedding = self.model.encode(word)
        return embedding
    
    def process_lexicon(self, vad_lexicon_path: str, output_file: str):
        """VAD 렉시콘의 단어들을 처리하여 임베딩과 VA 값을 저장"""
        # VAD 렉시콘 읽기
        word_data = {}
        with open(vad_lexicon_path, 'r', encoding='utf-8') as file:
            for line in file:
                parts = line.strip().split()
                if len(parts) == 4:  # word, valence, arousal, dominance
                    word = parts[0].lower()
                    valence, arousal = float(parts[1]), float(parts[2])
                    word_data[word] = (valence, arousal)
        
        # 각 단어에 대한 임베딩 계산 및 저장
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['word', 'valence', 'arousal', 'embedding'])
            
            # 배치 처리를 위한 준비
            batch_size = 32
            words = list(word_data.keys())
            
            for i in range(0, len(words), batch_size):
                batch_words = words[i:i + batch_size]
                try:
                    # 배치 단위로 임베딩 계산
                    embeddings = self.model.encode(batch_words)
                    
                    # 각 단어와 임베딩 저장
                    for word, embedding in zip(batch_words, embeddings):
                        valence, arousal = word_data[word]
                        writer.writerow([
                            word,
                            valence,
                            arousal,
                            embedding.tolist()
                        ])
                        print(f"Processed: {word}")
                except Exception as e:
                    print(f"Error processing batch starting with word '{batch_words[0]}': {e}")

def main():
    # 설정
    vad_lexicon_path = "Bipolar_Lexicon.txt"
    output_file = r"C:\Users\Administrator\Desktop\lexicon_embeddings_ver2.csv"
    
    # 렉시콘 처리
    processor = LexiconProcessor()
    processor.process_lexicon(vad_lexicon_path, output_file)
    print("Processing completed!")

if __name__ == "__main__":
    main()

Processed: aaaaaaah
Processed: aaaah
Processed: aardvark
Processed: aback
Processed: abacus
Processed: abalone
Processed: abandon
Processed: abandoned
Processed: abandonment
Processed: abashed
Processed: abate
Processed: abatement
Processed: abba
Processed: abbey
Processed: abbot
Processed: abbreviate
Processed: abbreviation
Processed: abdomen
Processed: abdominal
Processed: abduct
Processed: abduction
Processed: aberrant
Processed: aberration
Processed: abeyance
Processed: abhor
Processed: abhorrence
Processed: abhorrent
Processed: abide
Processed: abiding
Processed: ability
Processed: abject
Processed: ablation
Processed: ablaze
Processed: able
Processed: abnormal
Processed: abnormality
Processed: aboard
Processed: abode
Processed: abolish
Processed: abolition
Processed: abominable
Processed: abominate
Processed: abomination
Processed: aboriginal
Processed: abort
Processed: abortion
Processed: abortive
Processed: abound
Processed: abovementioned
Processed: abracadabra
Processed: abra

# 아마존 데이터 감정 분석

In [6]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple, Dict

class SimplifiedSentimentMapper:
    def __init__(self, model_name: str = 'bert-base-uncased', lexicon_embeddings_path: str = 'lexicon_embeddings.csv'):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name).to(self.device)
        self.model.eval()
        
        # 렉시콘 임베딩 로드
        self.lexicon_data = pd.read_csv(lexicon_embeddings_path)
        self.lexicon_embeddings = np.array([np.array(eval(emb)) for emb in self.lexicon_data['embedding']])
    
    def get_cls_embedding(self, text: str) -> np.ndarray:
        # 전처리 및 토큰화
        inputs = self.tokenizer(
            text.lower(),
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512
        ).to(self.device)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
            cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
        
        return cls_embedding
    
    def find_most_similar_word(self, cls_embedding: np.ndarray) -> Dict:
        """CLS 임베딩과 가장 유사한 단어 하나를 찾음"""
        similarities = cosine_similarity([cls_embedding], self.lexicon_embeddings)[0]
        top_idx = np.argmax(similarities)
        
        return {
            'word': self.lexicon_data.iloc[top_idx]['word'],
            'valence': self.lexicon_data.iloc[top_idx]['valence'],
            'arousal': self.lexicon_data.iloc[top_idx]['arousal'],
            'similarity': similarities[top_idx]
        }
    
    def get_sentiment_coordinates(self, text: str) -> Tuple[float, float, Dict]:
        """문장의 CLS 임베딩을 기반으로 VA 좌표와 가장 유사한 단어 반환"""
        cls_embedding = self.get_cls_embedding(text)
        most_similar = self.find_most_similar_word(cls_embedding)
        
        return most_similar['valence'], most_similar['arousal'], most_similar
    
    def process_reviews(self, reviews: List[str], output_file: str = 'review_va_coordinates.csv'):
        """여러 리뷰 텍스트를 처리하여 VA 좌표 저장"""
        results = []
        for i, review in enumerate(reviews):
            print(f"Processing review {i+1}/{len(reviews)}")
            valence, arousal, similar_word = self.get_sentiment_coordinates(review)
            results.append({
                'text': review,
                'valence': valence,
                'arousal': arousal,
                'most_similar_word': similar_word['word'],
                'similarity_score': similar_word['similarity']
            })
        
        df = pd.DataFrame(results)
        df.to_csv(output_file, index=False)
        print(f"Results saved to {output_file}")
        return df

def main():
    # ContextAwareSentimentMapper 초기화
    mapper = SimplifiedSentimentMapper(lexicon_embeddings_path=r"C:\Users\Administrator\Desktop\lexicon_embeddings.csv")
    
    print("텍스트 분석 방식을 선택하세요:")
    print("1. CSV 파일 분석")
    print("2. 테스트 텍스트 분석")
    choice = input("선택 (1 또는 2): ")
    
    if choice == '1':
        # CSV 파일 경로 입력
        file_path_choose = input("원하는 csv 파일을 선택하세요:")
        print("1. Amazon Reviews")
        print("2. Audible Reviews")
        print("3. Hotel Reviews")
        print("4. Coursera Reviews")
        if file_path_choose == '1':
            file_path = r"C:\Users\Administrator\Desktop\PADA_LAB\calculated\final\amazon_reviews_with_topics_and_depth_breadth.csv"
        elif file_path_choose == '2':
            file_path = r"C:\Users\Administrator\Desktop\PADA_LAB\calculated\final\audible_reviews_with_topics_and_depth_breadth.csv"
        elif file_path_choose == '3':
            file_path = r"C:\Users\Administrator\Desktop\PADA_LAB\calculated\final\hotel_reviews_with_topics_and_depth_breadth.csv"
        elif file_path_choose == '4':
            file_path = r"C:\Users\Administrator\Desktop\PADA_LAB\calculated\final\coursera_reviews_with_topics_and_depth_breadth.csv"  
        column_name = "Review_Text"
        
        try:
            # 데이터프레임 로드
            df = pd.read_csv(file_path)
            reviews = df[column_name].tolist()
            
            # VA 좌표 계산 및 결과 저장
            print(f"\nTotal reviews to process: {len(reviews)}")
            results = mapper.process_reviews(reviews)
            print("\nProcessing completed!")
            
            # 원본 데이터프레임에 VA 좌표 추가
            df['valence'] = results['valence']
            df['arousal'] = results['arousal']
            
            # 결과 저장
            output_path = file_path.replace('.csv', '_with_va_score.csv')
            df.to_csv(output_path, index=False)
            print(f"Results saved to {output_path}")
            
        except Exception as e:
            print(f"오류 발생: {str(e)}")
    
    elif choice == '2':
        # 테스트용 텍스트 목록
        test_texts = [
                "The course is well paced and they get you comfortable with the topics even though we do not have any sort of prior exposure in this field. It is very good for the beginners who are new to this field",
                "Information was well organized, easy to learn, and study. with frequent note writing, and some breaks . You can learn a good brief summary of what's to come, and what to research more in the future.",
                "A lot of explanations not provided! Too many peer-graded assignments, really disappointed!",
                "I enjoyed both The Martian and Artemis so I preordered this one and started it immediately It did not disappoint Andy Weir is one of my favorite authors and Ray Porter is one of my favorite narrators so this combination is a winwin The narration is superb and the writing is great I recommend this book Dont over think it This is worth the price of admissionDisclaimer My enjoyment of the narrator is based on my listening speed I only leave 5 stars for books Ive listened to or will listen to multiple times",
                "Great story but a little winded Interesting story and great concept, but it dragged a little too much in the middle I love a long book but it wasnt really adding much value"
        ]
        
        # VA 좌표 계산
        print(f"\nTotal texts to process: {len(test_texts)}")
        results = mapper.process_reviews(test_texts)
    
        print("\nAnalysis Results:")
        for i, row in results.iterrows():
            print(f"\nText {i+1}: {row['text'][:100]}...")
            print(f"Most similar word: {row['most_similar_word']} (similarity: {row['similarity_score']:.4f})")
            print(f"Valence: {row['valence']:.3f}")
            print(f"Arousal: {row['arousal']:.3f}")

if __name__ == "__main__":
    main()

텍스트 분석 방식을 선택하세요:
1. CSV 파일 분석
2. 테스트 텍스트 분석

Total texts to process: 5
Processing review 1/5
Processing review 2/5
Processing review 3/5
Processing review 4/5
Processing review 5/5
Results saved to review_va_coordinates.csv

Analysis Results:

Text 1: The course is well paced and they get you comfortable with the topics even though we do not have any...
Most similar word: motivational (similarity: 0.7716)
Valence: 0.812
Arousal: 0.292

Text 2: Information was well organized, easy to learn, and study. with frequent note writing, and some break...
Most similar word: particulars (similarity: 0.7718)
Valence: 0.208
Arousal: -0.304

Text 3: A lot of explanations not provided! Too many peer-graded assignments, really disappointed!...
Most similar word: overworked (similarity: 0.8551)
Valence: -0.734
Arousal: 0.340

Text 4: I enjoyed both The Martian and Artemis so I preordered this one and started it immediately It did no...
Most similar word: goodtimes (similarity: 0.8404)
Valence: 0.820
A