In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Fetch the NLTK resources this notebook relies on (no re-download if already up to date).
for resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Single shared lemmatizer instance, used by lemmatize_words below.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /Users/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/alex/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
def get_wordnet_pos(tag):
    """Convert an NLTK part-of-speech tag into the matching WordNet POS constant.

    Only the leading letter of ``tag`` is inspected: ``J`` maps to adjective,
    ``V`` to verb and ``R`` to adverb. Every other tag (including noun tags)
    falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Nouns and anything unrecognised: default to noun.
    return wordnet.NOUN

def lemmatize_words(words):
    """Return the set of lemmas for an iterable of words.

    Each word is lower-cased, POS-tagged on its own (as a one-token sentence)
    and lemmatized with the WordNet POS derived from that tag.

    Args:
        words: Iterable of strings.

    Returns:
        set[str]: The distinct lemmas. Empty input yields an empty set.
    """
    # Lower-case and de-duplicate first so every distinct word is tagged once.
    unique_words = list(dict.fromkeys(word.lower() for word in words))
    if not unique_words:
        return set()

    # nltk.pos_tag builds a new tagger on every call; pos_tag_sents builds it
    # once for the whole batch. Passing one-token "sentences" keeps each word
    # tagged in isolation, exactly as a per-word pos_tag([word]) call would.
    tagged_sentences = nltk.pos_tag_sents([[word] for word in unique_words])

    return {
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
        for sentence in tagged_sentences
        for token, tag in sentence
    }

def load_sentiment_words(file_path):
    """Read the sentiment lexicon workbook and return ``(positive_words, negative_words)``.

    The workbook is read positionally: the second column (index 1) supplies
    the positive words and the first column (index 0) the negative words.
    Empty cells are dropped and every remaining value is converted to ``str``.
    """
    lexicon = pd.read_excel(file_path)

    def column_as_word_set(position):
        # Drop blanks, then force strings so numeric-looking cells stay comparable.
        return set(lexicon.iloc[:, position].dropna().astype(str))

    positive = column_as_word_set(1)
    negative = column_as_word_set(0)
    return positive, negative

def determine_sentiment(sentence, positive_words, negative_words):
    """Label a sentence 'Negative', 'Positive' or 'Neutral' by lexicon lookup.

    The sentence is lower-cased and tokenized; a negative hit takes precedence
    over any positive hit.

    Args:
        sentence: Text to classify. Non-string values (e.g. the float NaN that
            pandas yields for a blank spreadsheet cell) are labelled 'Neutral'
            instead of raising.
        positive_words: Container of positive lexicon entries.
        negative_words: Container of negative lexicon entries.

    Returns:
        str: 'Negative' if any token is in ``negative_words``; otherwise
        'Positive' if any token is in ``positive_words``; otherwise 'Neutral'.
    """
    # NOTE(review): tokens are matched as-is (lower-cased, not lemmatized),
    # while this notebook lemmatizes the lexicon before passing it in, so
    # inflected forms in a sentence may not match — confirm this is intended.
    if not isinstance(sentence, str):
        # A missing/blank cell carries no words, so it cannot match either lexicon.
        return 'Neutral'

    tokens = word_tokenize(sentence.lower())

    # Negative wins outright, so the positive scan is only needed when no
    # negative token was found.
    if any(token in negative_words for token in tokens):
        return 'Negative'
    if any(token in positive_words for token in tokens):
        return 'Positive'
    return 'Neutral'

# Load the sentiment lexicon (change LEXICON_PATH to your own file location).
LEXICON_PATH = '/Users/alex/Desktop/Assay code/content analysis/LM 字詞庫/LM字典情緒詞庫.xlsx'
raw_positive_words, raw_negative_words = load_sentiment_words(LEXICON_PATH)

# Lemmatize both word lists; these are the lexicons used from here on.
positive_words = lemmatize_words(raw_positive_words)
negative_words = lemmatize_words(raw_negative_words)

# Smoke test on a single sentence.
test_sentence = "By applying voice intelligence to all external and internal communications, Dialpad is enabling organizations to sell more effectively, conduct more efficient meetings, personalize the customer experience, and make smarter business decisions automatically, in real-time, without installing new software."
sentiment = determine_sentiment(test_sentence, positive_words, negative_words)
print(f"{sentiment}")


Positive


In [3]:
def analyze_excel(input_file, output_file, sentence_col='sentence', label_col='LM label',
                  positive=None, negative=None):
    """Label every sentence in a workbook and write the result to a new workbook.

    Reads ``input_file``, applies ``determine_sentiment`` to each value of
    ``sentence_col``, stores the labels in ``label_col`` and writes the frame
    to ``output_file`` without the index.

    Args:
        input_file: Path of the workbook to read.
        output_file: Path of the workbook to write.
        sentence_col: Name of the column holding the text (default 'sentence').
        label_col: Name of the column that receives the labels (default 'LM label').
        positive: Positive lexicon; defaults to the notebook-level ``positive_words``.
        negative: Negative lexicon; defaults to the notebook-level ``negative_words``.

    Raises:
        KeyError: If ``sentence_col`` is not a column of the input workbook.
    """
    # Fall back to the notebook-level lexicons only when none were passed in,
    # so existing two-argument calls behave exactly as before.
    pos_lexicon = positive_words if positive is None else positive
    neg_lexicon = negative_words if negative is None else negative

    df = pd.read_excel(input_file)
    if sentence_col not in df.columns:
        raise KeyError(
            f"Column {sentence_col!r} not found in {input_file!r}; "
            f"available columns: {list(df.columns)}"
        )

    df[label_col] = df[sentence_col].apply(
        lambda text: determine_sentiment(text, pos_lexicon, neg_lexicon)
    )
    df.to_excel(output_file, index=False)


# Label the full sample workbook and save the labelled copy.
labelled_input_path = '/Users/alex/Desktop/0511assay data測試/124 sample/情緒分析變數/Total_data_v4.xlsx'
labelled_output_path = '/Users/alex/Desktop/0511assay data測試/124 sample/model label result.xlsx'
analyze_excel(labelled_input_path, labelled_output_path)

In [9]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# NOTE(review): this workbook sits under '情緒分析變數/', whereas the analyze_excel
# call earlier in the notebook writes 'model label result.xlsx' one directory
# higher — confirm the file evaluated here is the one that was just generated.
file_path = '/Users/alex/Desktop/0511assay data測試/124 sample/情緒分析變數/model label result.xlsx'
data = pd.read_excel(file_path)

# Reference labels and lexicon-based predictions.
y_true = data['final label']
y_pred = data['LM label']

# Rows where the prediction disagrees with the reference label.
mismatches = y_true != y_pred
mismatch_count = mismatches.sum()
# Rows that actually carry a reference label (missing values are not counted).
total_data = y_true.notna()
data_count = total_data.sum()

# Accuracy plus macro-averaged precision, recall and F1.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')

print(f"Number of data: {data_count}")
print(f"Number of mismatches: {mismatch_count}")
print(f"Accuracy of the model predictions: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Number of data: 2512
Number of mismatches: 1467
Accuracy of the model predictions: 0.42
Precision: 0.44
Recall: 0.47
F1 Score: 0.38
