In [None]:
%pip install emoji
%pip install pandas
%pip install openpyxl

In [None]:
import pandas as pd

# Load the scraped reviews; later cells read its 'content' and 'score' columns.
data = pd.read_excel('scrapped_data.xlsx')
# Preview the first rows only, like the other cells do, instead of rendering the whole frame.
data.head()

In [None]:
import emoji
import re

def split_text_and_symbols(text):
    """Separate a piece of text into its words and the distinct emoji it contains.

    Args:
        text: The raw text. Any value that is not a ``str`` is treated as empty.

    Returns:
        A ``(sentence, emoticon_list)`` tuple. ``sentence`` is the word tokens of
        ``text`` joined by single spaces (punctuation and emoji removed).
        ``emoticon_list`` holds every distinct character of ``text`` that is a
        key of ``emoji.EMOJI_DATA``, once each, in order of first appearance.
        Non-string input yields ``('', [])``.
    """
    if not isinstance(text, str):
        return '', []

    sentence = ' '.join(re.findall(r'\b\w+\b', text))

    # dict.fromkeys removes duplicates while keeping first-appearance order, so the
    # result is reproducible; de-duplicating through a set gives an arbitrary order
    # that can differ from one kernel run to the next.
    emoticon_list = list(dict.fromkeys(char for char in text if char in emoji.EMOJI_DATA))

    # NOTE(review): matching is done one character at a time, so emoji made of
    # several code points are never matched as a whole; only those of their
    # characters that are themselves EMOJI_DATA keys are kept.
    return sentence, emoticon_list

In [None]:
# Split every review into (text, emoticons); each element of `parsed` is a 2-tuple.
parsed = data['content'].apply(split_text_and_symbols)

# Build the frame from the list of tuples in one call. This avoids creating a
# pandas Series per row (`.apply(pd.Series)`) and guarantees that both columns
# exist even when `data` has no rows. Reusing data's index keeps the rows
# aligned with `data['score']` below.
dataset = pd.DataFrame(parsed.tolist(), index=data.index, columns=['text', 'emoticon'])
dataset['rating'] = data['score']
dataset.head()

In [None]:
import pandas as pd

def _to_int_polarity(value):
    """Return ``value`` as an ``int`` when it is an integral number, else ``None``."""
    if isinstance(value, str):
        return None
    try:
        as_int = int(value)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, infinities and other non-numeric cells end up here.
        return None
    # Rejects fractional values such as 0.5 while accepting 1, 1.0 and numpy integers.
    return as_int if as_int == value else None


def load_lexicon_from_excel(file_path):
    """Load an emoticon-to-polarity lexicon from an Excel sheet.

    The sheet is read with ``pandas.read_excel`` and must contain an
    ``emoticon`` column and a ``polarity`` column. Rows are skipped when the
    emoticon is not a string or the polarity is not an integral number; a blank
    polarity cell therefore only drops its own row. (Pandas stores a column with
    blanks as floats, so whole-number floats such as ``1.0`` must be accepted.)

    Args:
        file_path: Path of the Excel file to read.

    Returns:
        A dict mapping each cleaned emoticon (surrounding whitespace and
        apostrophes removed) to its ``int`` polarity, or ``None`` when the file
        is missing or any other error occurs; a message is printed in that case.
        When an emoticon appears more than once, the last row wins.
    """
    try:
        df = pd.read_excel(file_path)
        lexicon = {}
        for emoticon, polarity in zip(df['emoticon'], df['polarity']):
            score = _to_int_polarity(polarity)
            if score is None or not isinstance(emoticon, str):
                continue
            # Second strip removes spaces that were enclosed by the apostrophes.
            lexicon[emoticon.strip().replace("'", "").strip()] = score
        return lexicon
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
        return None
    except Exception as e:
        # Kept broad on purpose: callers only check for None.
        print("Error:", e)
        return None

def calculate_sentiment(symbols, lexicon):
    """Score a collection of symbols against a polarity lexicon.

    Args:
        symbols: Iterable of symbols to score; a falsy value (``None``, empty
            list, ...) means there is nothing to score.
        lexicon: Mapping from symbol to its numeric polarity.

    Returns:
        A ``(sentiment_score, unprocessable_emoticons)`` tuple: the sum of the
        polarities of the symbols found in ``lexicon`` (symbols that are not
        found count as 0), and the list of symbols that are not in ``lexicon``,
        in input order. ``(0, [])`` when ``symbols`` is falsy.
    """
    if not symbols:
        return 0, []

    total = 0
    for symbol in symbols:
        total += lexicon.get(symbol, 0)

    unknown = []
    for symbol in symbols:
        if symbol not in lexicon:
            unknown.append(symbol)

    return total, unknown

nama_file_lexicon = 'dataset/emoticon.xlsx'

lexicon = load_lexicon_from_excel(nama_file_lexicon)

if lexicon is not None:
    # One (score, unprocessable_emoticons) tuple per row of `dataset`.
    sentiment_data = dataset['emoticon'].apply(lambda x: calculate_sentiment(x, lexicon))

    # Keep the computed score for every row. Emoticons missing from the lexicon
    # already contribute 0 inside calculate_sentiment, so no extra gating is
    # needed; gating on "has unprocessable emoticons" zeroed exactly the rows
    # whose emoticons were all recognised.
    dataset['polarity_emotion'] = [score for score, _ in sentiment_data]
    dataset['unprocessable_emoticons'] = [unknown for _, unknown in sentiment_data]

    # Count emoticons, not rows, so the numbers match their labels and rows
    # without any emoticon no longer inflate the success rate.
    total_emoticons = sum(len(symbols) for symbols in dataset['emoticon'])
    emoticons_failed = sum(len(unknown) for unknown in dataset['unprocessable_emoticons'])
    emoticons_processed = total_emoticons - emoticons_failed

    # Guard against division by zero when there is not a single emoticon.
    if total_emoticons:
        success_percentage = (emoticons_processed / total_emoticons) * 100
        failure_percentage = 100 - success_percentage
    else:
        success_percentage = 0.0
        failure_percentage = 0.0

    print(f"Total emoticons: {total_emoticons}")
    print(f"Emoticons processed: {emoticons_processed} ({success_percentage:.2f}%)")
    print(f"Emoticons failed to process: {total_emoticons - emoticons_processed} ({failure_percentage:.2f}%)")
else:
    print("Failed to load lexicon. Cannot continue computation.")

dataset.head()

In [None]:
from pathlib import Path

# Collect every emoticon that was missing from the lexicon, across all rows.
unique_emoticons = set()

for value in dataset['unprocessable_emoticons']:
    if isinstance(value, list) and value:
        unique_emoticons.update(value)

# Sort so the exported sheet has the same row order on every run (set order is arbitrary).
filtered_df = pd.DataFrame(sorted(unique_emoticons), columns=['emoticons'])

unprocessed_path = Path('gagal_proses/unique_emoticons.xlsx')
dataset_path = Path('hasil/dataset.xlsx')
# to_excel does not create missing folders, so make sure both output folders exist.
for output_path in (unprocessed_path, dataset_path):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# Save the emoticons that could not be classified.
filtered_df.to_excel(unprocessed_path, index=False)

# Drop the unprocessable_emoticons column; it is not needed in the final data.
dataset = dataset.drop(columns='unprocessable_emoticons')
# The final result is saved as the dataset.
dataset.to_excel(dataset_path, index=False)

dataset.head()