In [1]:
!pip install -q contractions scikit-learn Sastrawi googletrans==4.0.0-rc1 langdetect pandas matplotlib yfinance tensorflow xgboost

# Import library
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import unicodedata
import yfinance as yf
import nltk
import re
import contractions
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from bs4 import BeautifulSoup
from sklearn.preprocessing import StandardScaler
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from wordcloud import WordCloud
from collections import Counter
from langdetect import detect
from googletrans import Translator
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow import keras
from math import sqrt

In [2]:
# Load  dataset
url = 'https://raw.githubusercontent.com/22bayusetia/PyCuan/main/Sentiment%20Analysis/data_finance.csv'
df = pd.read_csv(url, delimiter=',', encoding='latin-1', header=None)
df = df.drop(0)
df.columns = ['label', 'en_text', 'id_text']
# df.info()

In [3]:
nltk.download('stopwords')
nltk.download('punkt')

indonesian_stopwords = set(nltk.corpus.stopwords.words('indonesian'))
factory1 = StopWordRemoverFactory()
stopword_sastrawi = factory1.create_stop_word_remover()
factory2 = StemmerFactory()
stemmer_sastrawi = factory2.create_stemmer()

# Fungsi preprocessing data
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def preprocess_text_sastrawi(text):
    # Melakukan preprocessing menggunakan Sastrawi
    tokens = nltk.word_tokenize(text)
    tokens = [stopword_sastrawi.remove(token) for token in tokens]
    return " ".join(tokens)

def pre_process_text(text, language):
    text = text.lower()
    text = strip_html_tags(text)
    text = text.translate(text.maketrans("\n\t\r", "   "))
    text = remove_accented_chars(text)
    text = contractions.fix(text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text, re.I | re.A)
    text = re.sub(' +', ' ', text)
    text = preprocess_text_sastrawi(text) if language == 'indonesian' else text
    return text

# Melakukan preprocessing pada data
df['en_text'] = df['en_text'].apply(lambda x: pre_process_text(x, 'english'))
df['id_text'] = df['id_text'].apply(lambda x: pre_process_text(x, 'indonesian'))
df = df.drop_duplicates(subset=['en_text', 'id_text'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  soup = BeautifulSoup(text, "html.parser")


In [4]:
def perform_sentiment_analysis(df, narrative=None):
    # Upsample data
    data_majority = df[df['label'] == "positive"]
    data_minority = df[df['label'] == "negative"]

    data_minority_upsampled = resample(data_minority,
                                       replace=True,
                                       n_samples=data_majority.shape[0],
                                       random_state=123)

    df_balance_upsampled = pd.concat([data_majority, data_minority_upsampled])
    df_balanced_upsampled = df.drop_duplicates(subset=['en_text', 'id_text'])

    X_train, X_temp, y_train, y_temp = train_test_split(df_balance_upsampled.en_text,
                                                        df_balance_upsampled.label,
                                                        test_size=0.2,
                                                        random_state=42)

    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # Konversi teks ke vektor fitur menggunakan TF-IDF
    tfidf_vectorizer = TfidfVectorizer(max_features=3000)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_val_tfidf = tfidf_vectorizer.transform(X_val)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    rf_classifier.fit(X_train_tfidf, y_train)
    y_pred = rf_classifier.predict(X_test_tfidf)

    train_acc = rf_classifier.score(X_train_tfidf, y_train)
    val_acc = rf_classifier.score(X_val_tfidf, y_val)
    test_acc = rf_classifier.score(X_test_tfidf, y_test)

    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    predicted_sentiments = rf_classifier.predict(X_test_tfidf)
    positive_percentage = (predicted_sentiments == 'positive').sum() / len(predicted_sentiments) * 100
    negative_percentage = 100 - positive_percentage

    # Deteksi bahasa narasi
    if narrative:
        translator = Translator()
        language = detect(narrative)
        if language != 'en':
            narrative_translated = translator.translate(narrative, dest='en').text
            narrative_tfidf = tfidf_vectorizer.transform([narrative_translated])
        else:
            narrative_tfidf = tfidf_vectorizer.transform([narrative])

        predicted_sentiment = rf_classifier.predict(narrative_tfidf)
        sentiment_probability = rf_classifier.predict_proba(narrative_tfidf)[0, 1]
        threshold = 0.5  # Tresholdnya diatur sesuai kebutuhan
        sentiment = "Positive" if sentiment_probability > threshold else "Negative"

        print("Text:", narrative)
        print("Sentiment Probability:", sentiment_probability)

    return train_acc, val_acc, test_acc, precision, recall, f1, positive_percentage, negative_percentage, sentiment_probability

In [5]:
# Fungsi untuk melakukan prediksi time series
def perform_time_series_forecasting(df, stock_symbol, start_date, end_date, seq_length=30, forecast_days=5):
    df_stock = yf.download(stock_symbol, start=start_date, end=end_date)
    ts = df_stock['Open'].values

    scaler = StandardScaler()
    ts_scaled = scaler.fit_transform(np.array(ts).reshape(-1, 1))

    X_train, y_train = [], []

    for i in range(len(ts_scaled) - seq_length):
        X_train.append(ts_scaled[i:i + seq_length])
        y_train.append(ts_scaled[i + seq_length])

    X_train, y_train = np.array(X_train), np.array(y_train)

    train_size = int(len(X_train) * 0.8)
    X_train, X_test = X_train[:train_size], X_train[train_size:]
    y_train, y_test = y_train[:train_size], y_train[train_size:]

    # model LSTM
    model = keras.Sequential([
        LSTM(128, activation='relu', return_sequences=True, input_shape=(seq_length, 1)),
        Dropout(0.2),
        LSTM(128, activation='relu', return_sequences=True),
        Dropout(0.2),
        LSTM(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1)
    ])

    model.compile(optimizer='adam', loss='mean_squared_error')
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stopping], verbose=0)

    # Generate forecast for the next 'forecast_days'
    X_forecast = np.copy(X_test[-1])
    forecasted_values = []

    for _ in range(forecast_days):
        forecasted_value = model.predict(X_forecast.reshape(1, seq_length, 1))
        forecasted_values.append(forecasted_value[0, 0])

        X_forecast = np.roll(X_forecast, -1)
        X_forecast[-1] = forecasted_value

    forecasted_values = scaler.inverse_transform(np.array(forecasted_values).reshape(-1, 1))

    # Evaluasi hasil prediksi
    rmse_test = sqrt(mean_squared_error(y_test, model.predict(X_test)))
    mae_test = mean_absolute_error(y_test, model.predict(X_test))

    weighted_metric = (rmse_test + mae_test) / 2
    last_date = df_stock.index[-1]
    forecast_dates = pd.date_range(last_date, periods=forecast_days + 1)[1:]

    last_actual_opening_price = df_stock['Open'][-1]
    first_forecast_opening_price = forecasted_values[0][0]
    price_difference = first_forecast_opening_price - last_actual_opening_price
    percentage_change = price_difference / last_actual_opening_price
    adjusted_percentage_change = (percentage_change + 1) / 2

    return weighted_metric, forecast_dates, forecasted_values, adjusted_percentage_change


In [6]:
def translate_to_english(narrative):
    translator = Translator()
    english_narrative = translator.translate(narrative, dest='en').text
    return english_narrative

# Input data testing
narrative = "Revisi Kebijakan Pupuk Bersubsidi, Kini Petani Dapat Tebus Pakai KTP"  # lebih bagus pakai b. inggris

stock_symbol = 'ETH-USD' # tambahkan .JK untuk bursa efek indonesia | -USD untuk global
start_date = '2022-12-05'
end_date = '2023-12-05'

narrative_english = translate_to_english(narrative)
time_series_results = perform_time_series_forecasting(df, stock_symbol, start_date, end_date)
time_series_weight = time_series_results[-1]

[*********************100%%**********************]  1 of 1 completed


In [9]:
sentiment_results = perform_sentiment_analysis(df, narrative=narrative_english)
train_acc, val_acc, test_acc, precision, recall, f1, positive_percentage, negative_percentage, sentiment_probability = sentiment_results
print("\nStock:", stock_symbol)
print("Time Series Probability:", time_series_weight)

Text: Revision of subsidized fertilizer policy, now farmers can redeem their KTP
Sentiment Probability: 0.67

Stock: ETH-USD
Time Series Probability: 0.4357646897775186


In [8]:
# Fungsi untuk mengkombinasikan bobot
def combine_weights(sentiment_probability, time_series_weight, sentiment_ratio=0.65):
    time_series_ratio = 1 - sentiment_ratio

    combined_weight = (sentiment_ratio * sentiment_probability + time_series_ratio * time_series_weight)
    return combined_weight

final_weight = combine_weights(sentiment_probability, time_series_weight)
final_sentiment = "Positive 🚀" if final_weight > 0.5 else "Negative 🌂"

print("Bobot:", final_weight)
print("Sentiment:", final_sentiment)

Bobot: 0.5880176414221315
Sentiment: Positive 🚀
