In [None]:
# !pip install numpy==1.26.4
# !pip install openai==1.63.2
# !pip install tenacity==8.2.3
# !pip install tiktoken==0.6.0
# !pip install transformers==4.34.1
# !pip install pandas==2.2.0
# !pip install scikit-learn==1.4.0
# !pip install torch
# !pip install bitsandbytes==0.42.0
# !pip install datasets==2.14.7
# !pip install sentencepiece
# !pip install peft==0.6.2
# !pip install evaluate==0.4.1
# !pip install trl==0.7.1
# !pip install protobuf==4.25.2
# !pip install python-dotenv
# !pip install pandas_ta
# !pip install ollama


In [None]:
# !pip uninstall torch torchvision torchaudio -y
# !pip install torch==2.2.0+cu121 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [3]:
import os
import pandas as pd
import pandas_ta as ta  # Sử dụng pandas_ta vì dễ cài đặt và tích hợp với pandas
import numpy as np

In [16]:

def format_technical_indicators(row):
    """
    Render one DataFrame row as text, one "name: value" line per non-missing column.

    Numeric values are written with thousands separators (for example
    396412000.0 becomes "396,412,000.0"). Values that do not support the ","
    format spec, such as strings, are written with str() instead of raising.

    Args:
        row (pd.Series): One row of a DataFrame holding technical indicators.

    Returns:
        str: The formatted lines joined by a newline character; an empty
            string when every value in the row is missing.
    """
    lines = []

    for col, value in row.items():
        if pd.isna(value):  # missing values are left out of the text entirely
            continue
        try:
            text = f"{value:,}"  # thousands separator for numeric values
        except (TypeError, ValueError):
            # The "," format spec is only valid for numbers; fall back to the plain
            # string form so a stray text column does not abort the whole row.
            text = str(value)
        lines.append(f"{col}: {text}")

    return "\n".join(lines)


# Read the sample CSV into a DataFrame. The same file is overwritten by the
# to_csv call at the end of this cell, so this read picks up whatever the
# previous run of the cell saved.
data = pd.read_csv("data_sample2.csv")


def add_technical_indicator(data, technical_indicator_dir="data/sample_price/technical_indicator/"):
    """
    Attach a text summary of technical indicators to every (ticker, date) row.

    For each distinct ticker in ``data`` the file
    ``<technical_indicator_dir>/<ticker>.csv`` is read, each of its rows (minus
    the ``Date`` column) is rendered with ``format_technical_indicators``, and the
    result is inner-joined onto ``data`` by ``ticker`` and ``date``.

    Args:
        data (pd.DataFrame): Must contain ``ticker`` and ``date`` columns. The
            frame passed in is not modified.
        technical_indicator_dir (str): Directory holding one ``<ticker>.csv`` per
            ticker, each with a ``Date`` column.

    Returns:
        pd.DataFrame: The rows of ``data`` that have a matching indicator row,
            with an added ``technical_indicator`` column, grouped by ticker in
            order of first appearance. An empty DataFrame when ``data``
            contains no tickers.

    Raises:
        FileNotFoundError: If a ticker has no CSV in ``technical_indicator_dir``
            (raised by ``pd.read_csv``).
    """
    # If the input already carries a technical_indicator column (for example the
    # output of a previous call fed back in), drop it first; otherwise the merge
    # below would produce technical_indicator_x / technical_indicator_y instead.
    base = data.drop(columns=["technical_indicator"], errors="ignore")

    merged_frames = []
    # Rows without a ticker can never match an indicator file, so skip them.
    for ticker in base["ticker"].dropna().unique():
        indicator_path = os.path.join(technical_indicator_dir, f"{ticker}.csv")
        indicators = pd.read_csv(indicator_path)

        # Render every indicator column except 'Date' into one text cell per row.
        indicators["technical_indicator"] = indicators.drop(columns=["Date"]).apply(
            format_technical_indicators, axis=1
        )
        indicators["ticker"] = ticker

        # Keep only the join keys and the rendered text; align the key name with `data`.
        indicators = indicators[["ticker", "Date", "technical_indicator"]].rename(columns={"Date": "date"})

        merged_frames.append(pd.merge(base, indicators, on=["ticker", "date"], how="inner"))

    if not merged_frames:
        return pd.DataFrame()

    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(merged_frames, ignore_index=True)

    

# Enrich the sample rows with their per-ticker indicator text, then save the
# result over the CSV that was read at the top of this cell.
data = add_technical_indicator(data)
data.to_csv("data_sample2.csv", index=False)


In [25]:
## Tính Chỉ Báo Kỹ Thuật


def _indicator_column(result, column):
    """Return ``result[column]``, or NaN when the indicator call produced no result."""
    # NOTE(review): pandas_ta appears to return None instead of a frame when the
    # series is too short for the requested window - confirm against the
    # installed version.
    if result is None:
        return float("nan")
    return result[column]


def calculate_technical_indicators(data):
    """
    Compute a fixed set of technical indicators for one price history.

    The caller's frame is left untouched; the indicators are added to a copy.

    Args:
        data (pandas.DataFrame): Price data with 'High', 'Low', 'Close' and
            'Volume' columns. 'Open' and 'Adj Close' are dropped from the result
            when present but are not required.

    Returns:
        pandas.DataFrame: The non-price columns of ``data`` plus SMA_5, EMA_5,
            MACD, MACD_SIGNAL, MACD_HIST, RSI, BB_UPPER, BB_LOWER, BB_MIDDLE,
            OBV, ADX, DMP and DMN, with numeric values rounded to 2 decimals.
            The raw price/volume columns are removed.
    """
    # Work on a copy so the indicator columns do not leak into the caller's frame.
    data = data.copy()
    close = data['Close']

    # 1. Moving averages
    data['SMA_5'] = ta.sma(close, length=5)
    data['EMA_5'] = ta.ema(close, length=5)

    # 2. MACD: line, signal line and histogram
    macd = ta.macd(close, fast=12, slow=26, signal=9)
    data['MACD'] = _indicator_column(macd, 'MACD_12_26_9')
    data['MACD_SIGNAL'] = _indicator_column(macd, 'MACDs_12_26_9')
    data['MACD_HIST'] = _indicator_column(macd, 'MACDh_12_26_9')

    # 3. RSI
    data['RSI'] = ta.rsi(close, length=5)

    # 4. Bollinger Bands
    bbands = ta.bbands(close, length=20, std=2)
    data['BB_UPPER'] = _indicator_column(bbands, 'BBU_20_2.0')
    data['BB_LOWER'] = _indicator_column(bbands, 'BBL_20_2.0')
    data['BB_MIDDLE'] = _indicator_column(bbands, 'BBM_20_2.0')

    # 5. Volume indicator (OBV)
    data['OBV'] = ta.obv(close, data['Volume'])

    # 6. ADX with positive / negative directional movement
    adx = ta.adx(data['High'], data['Low'], close, length=14)
    data['ADX'] = _indicator_column(adx, 'ADX_14')
    data['DMP'] = _indicator_column(adx, 'DMP_14')
    data['DMN'] = _indicator_column(adx, 'DMN_14')

    # Only the derived indicators are kept; errors="ignore" tolerates inputs that
    # lack optional columns such as 'Adj Close'.
    data = data.drop(columns=["Open", "High", "Low", "Close", "Adj Close", "Volume"], errors="ignore")
    return data.round(2)


folder_path = 'data/sample_price/raw/'
technical_indicator_folder = 'data/sample_price/technical_indicator/'  # where the processed files are written
# Create the output directory up front if it is missing
os.makedirs(technical_indicator_folder, exist_ok=True)


if not os.path.exists(folder_path):
    # Nothing to process without the raw price directory
    print("Thư mục không tồn tại.")
else:
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)
        # The output keeps the input's file name, just in the other directory
        technical_indicator_path = os.path.join(technical_indicator_folder, file)

        # Only CSV files are price histories; ignore everything else
        if not file.endswith('.csv'):
            continue

        print(f"Đang mở: {file}")

        # Load the raw prices and replace them with the computed indicators
        df = calculate_technical_indicators(pd.read_csv(file_path))

        df.to_csv(technical_indicator_path, index=False)
        print(f"Đã lưu: {technical_indicator_path}")

Đang mở: AAPL.csv
Đã lưu: data/sample_price/technical_indicator/AAPL.csv
Đang mở: ABBV.csv
Đã lưu: data/sample_price/technical_indicator/ABBV.csv
Đang mở: AEP.csv
Đã lưu: data/sample_price/technical_indicator/AEP.csv
Đang mở: AMT.csv
Đã lưu: data/sample_price/technical_indicator/AMT.csv
Đang mở: AMZN.csv
Đã lưu: data/sample_price/technical_indicator/AMZN.csv
Đang mở: APD.csv
Đã lưu: data/sample_price/technical_indicator/APD.csv
Đang mở: AVGO.csv
Đã lưu: data/sample_price/technical_indicator/AVGO.csv
Đang mở: BABA.csv
Đã lưu: data/sample_price/technical_indicator/BABA.csv
Đang mở: BAC.csv
Đã lưu: data/sample_price/technical_indicator/BAC.csv
Đang mở: BHP.csv
Đã lưu: data/sample_price/technical_indicator/BHP.csv
Đang mở: BRK-A.csv
Đã lưu: data/sample_price/technical_indicator/BRK-A.csv
Đang mở: CAT.csv
Đã lưu: data/sample_price/technical_indicator/CAT.csv
Đang mở: CCI.csv
Đã lưu: data/sample_price/technical_indicator/CCI.csv
Đang mở: CMCSA.csv
Đã lưu: data/sample_price/technical_indicato

In [1]:
%run testmain.py

[REDACTED: testmain.py printed an OpenAI API key here. Rotate that key, stop printing it in the script, and clear this cell output before sharing the notebook.]
Args in experiment:
Loading Train Agents...
AMZN = ticker, tweet_data =  ['RT @IBDinvestors: $AMZN "With this... we had to back away yesterday bc it broke our expectations by falling below the 50-day. Don\'t get dis…', 'RT @DeItaOne: $AMZN BANS N95, SURGICAL MASK SALES TO GENERAL PUBLIC: RECODE', "And that's bullish, GTFOH $ES $ES_F $SPX $SPY $QQQ $IWM $VXX $VIX $AAPL $AMZN $MSFT https://t.co/TnRBO8RfWm", 'RT @DeItaOne: AMAZON SAYS IT HAS FILLED 80,000 OF THE 100,000 JOBS IT ANNOUNCED LAST MONTH\n$AMZN', 'BOOM 💰💰 Come Join US \n$fb $aapl $amzn $nflx $googl $bidu $roku $spy $amd $nvda $tsla $ba $baba $shop For  Daily Powerful Watchlist, Swing &amp; Day Option Trading Alerts  Paypal monthly link in bio, $149.99 DM for biweekly link $84.99 #trading #optionstrader #Money https://t.co/eULM8jNbkw', 'RT @arnabch01: @FriseSally @Ma

In [2]:
import pandas as pd

# Load the sample file saved by the indicator-merging cell
df_loaded = pd.read_csv("data_sample2.csv")

print("Nội dung tệp CSV:")

# Column lookup; raises KeyError right here if 'summary' is missing
df_loaded['summary']

# Keep just the first two rows for a quick trial and show them
df_loaded = df_loaded.head(2)
df_loaded

Nội dung tệp CSV:


Unnamed: 0,ticker,summary,target,date,technical_indicator
0,AMZN,2020-04-02\nAmazon (AMZN) banned N95 and surgi...,Positive,2020-04-07,SMA_5: 97.42\nEMA_5: 98.33\nMACD: 0.73\nMACD_S...
1,AMZN,2020-04-15\nThe Pentagon awarded a $10 billion...,Negative,2020-04-17,SMA_5: 115.43\nEMA_5: 115.08\nMACD: 5.54\nMACD...


In [3]:
from explain_module.util import summarize_trial, remove_reflections, save_results#, save_agents
from explain_module.agents import PredictReflectAgent

import os, json

agent_cls = PredictReflectAgent
# One agent per sample row: ticker, news summary, target label and indicator text
agents = [
    agent_cls(row['ticker'], row['summary'], row['target'], row['technical_indicator'])
    for _, row in df_loaded.iterrows()
]
print("Loaded Train Agents.")

# Instruction-style records built from the agents that predicted correctly.
# Previously each record was built and then discarded on the next iteration.
# NOTE(review): the instruction/input/output keys match the fields of
# data/merge_sample.json loaded further down - presumably that file is where
# these records are meant to end up; confirm before persisting them.
samples = []

for agent in agents:
    agent.run()

    if agent.is_correct():
        prompt = agent._build_agent_prompt()
        # Keep only the text after the last 'Price Movement: ' marker in the scratchpad
        response = agent.scratchpad.split('Price Movement: ')[-1]
        sample = {"instruction": prompt, "input": "", "output": response}
        samples.append(sample)

correct, incorrect = summarize_trial(agents)
print(f'Finished Trial 0, Correct: {len(correct)}, Incorrect: {len(incorrect)}')
        

Loaded Train Agents.
self.target, self.prediction Positive  ///////// 
self.is_finished() =  False       not self.is_correct() =  True
Facts:
2020-04-02
Amazon (AMZN) banned N95 and surgical mask sales to the general public due to supply shortages caused by the COVID-19 pandemic. Additionally, Amazon announced that it has filled 80,000 out of the 100,000 jobs it announced last month.

2020-04-05
The tweets are not relevant or specific to the AMZN (Amazon) stock.

Technical Indicators:
SMA_5: 97.42
EMA_5: 98.33
MACD: 0.73
MACD_SIGNAL: -0.1
MACD_HIST: 0.83
RSI: 70.53
BB_UPPER: 102.76
BB_LOWER: 85.13
BB_MIDDLE: 93.94
OBV: 396,412,000.0
ADX: 15.59
DMP: 27.43
DMN: 17.77

Price Movement: Positive  
Explanation: Amazon reported significant supply chain challenges affecting N95 and surgical mask availability due to COVID-19, which could impact demand but was offset by news of job fillings. The strong upward trend indicated by the technical indicators—MACD above the signal line with positive hi

## test_explain

In [3]:
%run test_explain.py

Args in experiment:
Nội dung tệp CSV:
Loaded Train Agents.
Facts:
2020-04-02
Amazon (AMZN) recently banned the sale of N95 and surgical masks to the general public. 
Amazon has already filled 80,000 of the 100,000 jobs it announced last month.

Price Movement: 

RetryError: RetryError[<Future at 0x2370dceeaf0 state=finished raised OpenAIError>]

In [3]:
import json

# The file is read as JSON Lines: one JSON document per line.
with open("./data/merge_sample.json", "r", encoding="utf-8") as file:
    # Skip blank lines (such as an empty last line), which json.loads would
    # reject with a JSONDecodeError.
    data = [json.loads(line) for line in file if line.strip()]

# Show the prompt text of the fifth record
print(data[4]['instruction'])



Given a list of facts, estimate their overall impact on the price movement of AMZN stock. Give your response in this format:
(1) Price Movement, which should be either Positive or Negative.
(2) Explanation, which should be in a single, short paragraph.
Here are some examples:
Facts:
2016-07-26
Apple reported Q3 2016 earnings: Revenue of $42.4 billion, beating expectations. They sold 40.4 million iPhones, 9.9 million iPads, and 4.2 million Macs during that quarter.
Apple's earnings beat expectations, causing the stock to rise by almost 5% in after-hours trading.
Apple had $231.5 billion in cash reserves, enough to potentially acquire companies like Uber, Tesla, Twitter, Airbnb, Netflix, Snapchat, and SpaceX and still have billions left.
Apple's China sales were down around 29% sequentially and 33% YoY.
Despite declining unit sales, Apple's revenue was boosted by more expensive iPad Pro models.
Apple Pay accounted for 3/4 of contactless payments in the US.
Apple's services business (App 

In [10]:
import json

# The file is read as JSON Lines: one JSON document per line.
with open("./datasets/comparison_data.json", "r", encoding="utf-8") as file:
    # Skip blank lines (such as an empty last line), which json.loads would
    # reject with a JSONDecodeError.
    datasets = [json.loads(line) for line in file if line.strip()]

datasets

[[{'user_input': "Given a list of facts, estimate their overall impact on the price movement of AMZN stock. Give your response in this format:\n(1) Price Movement, which should be either Positive or Negative.\n(2) Explanation, which should be in a single, short paragraph.\nHere are some examples:\nFacts:\n2016-07-26\nApple reported Q3 2016 earnings: Revenue of $42.4 billion, beating expectations. They sold 40.4 million iPhones, 9.9 million iPads, and 4.2 million Macs during that quarter.\nApple's earnings beat expectations, causing the stock to rise by almost 5% in after-hours trading.\nApple had $231.5 billion in cash reserves, enough to potentially acquire companies like Uber, Tesla, Twitter, Airbnb, Netflix, Snapchat, and SpaceX and still have billions left.\nApple's China sales were down around 29% sequentially and 33% YoY.\nDespite declining unit sales, Apple's revenue was boosted by more expensive iPad Pro models.\nApple Pay accounted for 3/4 of contactless payments in the US.\nA

In [2]:
from datasets import load_dataset
# Location of the JSON file with instruction / input / output records
DATA_PATH ='./data/merge_sample.json'
# Load the file through the "json" loader; without a split argument the result
# is a DatasetDict keyed by split name (here only "train", as the output shows).
data = load_dataset("json", data_files=DATA_PATH)
data

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 7
    })
})

In [3]:
# Despite its name, dataset_name holds a local directory path
dataset_name = "./datasets/"
# Passing split="train" yields a single Dataset instead of a DatasetDict
# (compare the output of this cell with the previous one).
train_dataset = load_dataset(dataset_name, split="train")
train_dataset

Generating train split: 1 examples [00:00, 36.22 examples/s]


Dataset({
    features: ['user_input', 'completion_a', 'completion_b'],
    num_rows: 1
})

## test_predict

In [None]:
%run test_predict.py

Args in experiment:


  from .autonotebook import tqdm as notebook_tqdm
  warn(
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


Loading model from lmsys/vicuna-7b-v1.5-16k on CPU...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading shards:   0%|          | 0/2 [03:08<?, ?it/s]


KeyboardInterrupt: 

In [1]:
import ollama

class DeepSeekLLM:
    """Callable wrapper that sends a single-turn prompt to an Ollama chat model."""

    def __init__(self, model="deepseek-r1"):
        # Name of the Ollama model every call is sent to
        self.model = model

    def __call__(self, prompt):
        """
        Send ``prompt`` as one user message and return the reply text.

        When the reply contains a ``</think>`` tag, everything up to and
        including the first tag is dropped and the remainder is stripped of
        surrounding whitespace. A reply without the tag is returned unchanged.
        """
        user_message = {"role": "user", "content": prompt}
        reply = ollama.chat(model=self.model, messages=[user_message])
        text = reply["message"]["content"]

        # partition() yields an empty separator when the tag is absent
        _, closing_tag, answer = text.partition("</think>")
        return answer.strip() if closing_tag else text

# Example usage: ask one question with the default model and print the answer
if __name__ == "__main__":
    deepseek = DeepSeekLLM()
    question = "What is the capital of France?"
    answer = deepseek(question)
    print(answer)


The capital of France is Paris.


In [1]:
import ollama

# Ollama model to query
desiredModel = 'deepseek-r1'
# Alternative prompt kept for reference:
# questionToAsk = 'How to solve a quadratic equation x^2+5*x+6=0'
questionToAsk = ''' How many tokens can deepseek r1 7b receive each time?'''


# Single-turn chat: one user message, no system prompt or history
response = ollama.chat(model=desiredModel, messages=[
    {
        'role': 'user',
        'content': questionToAsk,
    },
])

# The reply text is used as-is; nothing here removes a <think>...</think>
# block, so it appears in the printed output.
OllamaResponse = response['message']['content']

print(OllamaResponse)

# Optional: persist the reply to a text file
# with open("OutputOllama.txt", "w", encoding="utf-8") as text_file:
#     text_file.write(OllamaResponse)

<think>

</think>

DeepSeek-R1-7B is a large language model that processes text in real-time, but the number of tokens it can generate or process in one go is limited only by the available system resources (such as memory and computational power). In practice, models like DeepSeek-R1-7B typically generate outputs in fixed-length chunks, often around 500-1000 tokens, depending on the specific configuration.

If you're using a framework like the DeepSeek Engine or another platform that hosts DeepSeek-R1-7B, we recommend checking their documentation for specific details about token limits and usage policies. Always ensure you adhere to their guidelines to avoid any issues with your usage.
