## Part 1. Data Preprocessing

In [1]:
# Import packages and functions for part 1
import pandas as pd
from ckiptagger import WS, POS, NER
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from scipy import sparse

# Load the CKIP Tagger models once up front; the preprocessing cells below reuse them.
# ws  -> word segmentation
# pos -> part-of-speech tagging
# ner -> named-entity recognition
ws, pos, ner = (driver_class("./data_ckip") for driver_class in (WS, POS, NER))

  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)
2024-04-21 22:44:00.893601: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled
  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)
  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)


使用中研院 [CKIP Tagger](https://github.com/ckiplab/ckiptagger) 進行斷詞、詞性標記，篩選出文章中的**普通名詞**、**動詞**與**形容詞**。

In [None]:
def clean(sentence_ws, sentence_pos):
    '''
    Keep only the content words of one segmented document.

    - sentence_ws: the tokens of the document after word segmentation
    - sentence_pos: the POS tag of every token, aligned with sentence_ws
    - returns: the kept tokens joined by single spaces

    A token is kept when its tag starts with "Na", "V" or "A" (described in the
    notebook as common nouns, verbs and adjectives) and it is longer than one
    character.
    '''
    kept_tag_prefixes = ("Na", "V", "A")
    return " ".join(
        token
        for token, tag in zip(sentence_ws, sentence_pos)
        if tag.startswith(kept_tag_prefixes) and len(token) > 1
    )

# Word segmentation
def word_segmentation(contents, ws_driver, pos_driver, ner_driver):
    '''
    Segment, POS-tag and filter a batch of documents.

    - contents: a list; each element is the full text of one article
    - ws_driver: the word-segmentation model, called once on the whole list
    - pos_driver: the POS-tagging model, called once on the segmentation output
    - ner_driver: accepted for interface compatibility; not used here
    - returns: a list with one cleaned, space-joined string per article
    '''
    segmented = ws_driver(contents)
    tagged = pos_driver(segmented)
    return [
        clean(tokens, tags)
        for _, tokens, tags in zip(contents, segmented, tagged)
    ]

In [None]:
# Clean the news articles and store the result as a 'content_cleaned' column
# next to the raw text.
data_news = pd.read_csv('../data/news_filtered_merged.csv')
contents = data_news['content'].tolist()
contents_cleaned = word_segmentation(contents, ws, pos, ner)
data_news = data_news.assign(content_cleaned=contents_cleaned)
data_news.to_csv('../data/news_cleaned.csv', index=False)

In [None]:
# function to generate cleaned data and save
def generate_cleaned_data(name, data, ws, pos, ner):
    '''
    Add a 'content_cleaned' column to `data` and write the frame to disk.

    - name: prefix of the output file, written to ../data/{name}_cleaned.csv
    - data: a pandas dataframe with a 'content' column; modified in place
    - ws, pos, ner: the CKIP drivers forwarded to word_segmentation
    '''
    raw_texts = data['content'].tolist()
    data['content_cleaned'] = word_segmentation(raw_texts, ws, pos, ner)
    output_path = f'../data/{name}_cleaned.csv'
    data.to_csv(output_path, index=False)

In [None]:
# generate cleaned data for ptt and dcard
# Each labelled frame is cleaned and written to ../data/{name}_cleaned.csv.
data_ptt = pd.read_csv('../data/ptt_filtered_labeled.csv')
data_dcard = pd.read_csv('../data/dcard_filtered_labeled.csv')
data_to_clean = dict(ptt=data_ptt, dcard=data_dcard)
for name, data in data_to_clean.items():
    generate_cleaned_data(name=name, data=data, ws=ws, pos=pos, ner=ner)

## Part 2. Feature Engineering

In [2]:
# import packages 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from scipy import sparse

# parameters
# number of features to keep after chi-square selection, per data source
n_features = dict.fromkeys(('ptt', 'dcard', 'news', 'all'), 500)

- 使用 `TfidfVectorizer` 建構詞彙的向量空間。
  - 考慮 1-gram 至 3-gram 的詞彙組合。
  - 篩選掉 document frequency 大於文件數 95% 的詞彙，以及出現在少於 2 篇文件中的詞彙。
- 對上述向量空間，我們使用 chi-square 選出前 k 名的最佳特徵作為模型的輸入。

In [3]:
def feature_extraction(source, data, n_features, days_ahead, chip = False):
    '''
    Turn cleaned text into a TF-IDF matrix reduced by chi-square feature selection.

    - source: the source of the data; used as the key into n_features
    - data: a pandas dataframe with a column named 'content_cleaned' and a
      label column named 'label_day{days_ahead}' (plus the three *_surplus
      columns when chip is True)
    - n_features: dict mapping source -> the number of features to select
    - days_ahead: the number of days ahead to predict; selects the label column
    - chip: when True, append the chip (institutional trading) columns to the
      selected text features
    - returns: (X, y) where X is a scipy sparse matrix and y is the label column
    '''
    # get the target variable once; it drives both feature selection and the return value
    y = data['label_day'+str(days_ahead)]
    # TF-IDF over 1- to 3-grams
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(1, 3))
    X = vectorizer.fit_transform(data['content_cleaned'])
    # Select the requested number of features, staying 30 below the vocabulary
    # size. Clamp to at least 1 so a vocabulary of 30 terms or fewer does not
    # produce a zero or negative k.
    k = max(1, min(n_features[source], X.shape[1] - 30))
    ch2 = SelectKBest(chi2, k=k)
    X = ch2.fit_transform(X, y)
    if chip:
        # append 'foreign_investor_surplus', 'investment_trust_surplus' and
        # 'dealer_surplus' as three extra columns of X
        chip_data = data[['foreign_investor_surplus', 'investment_trust_surplus', 'dealer_surplus']].values
        X = sparse.hstack((X, chip_data))
    return X, y

## Part 3. Model Training for All

In [4]:
# import packages
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from scipy.stats import randint
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import tqdm

# Hyper-parameter grids for the grid search, keyed by the model names used in train().
params = {
    'Naive Bayes': dict(),
    'Random Forest': dict(
        n_estimators=[50, 100, 150],
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    # Only the rbf kernel is searched; 'linear' and 'poly' were tried in an
    # earlier version of this grid and are currently disabled.
    'SVM': dict(
        C=[0.1],
        kernel=['rbf'],
        gamma=['scale'],
    ),
    'XGBoost': dict(
        learning_rate=[0.01, 0.1, 0.2],
        n_estimators=[100, 200, 300],
        max_depth=[3, 4, 5],
        subsample=[0.8, 0.9, 1.0],
    ),
}

採用模型：Naive Bayes, SVM, Random Forest, XGBoost 與 stacking model

- 將資料隨機切分成訓練資料集 (80%) 與測試資料集 (20%)，並對訓練資料集進行 5-fold cross validation
- 用 Grid Search 找到 Naive Bayes, SVM, Random Forest, XGBoost 的最佳參數
- 再分別執行最佳模型與四者的 stacking model，紀錄結果

In [5]:
def _evaluate(model, X_test, y_test):
    '''
    Score one fitted model on the held-out set.

    - returns: a dict with the 'accuracy', the 'confusion' matrix and the raw
      'prediction' array
    '''
    y_pred = model.predict(X_test)
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'confusion': confusion_matrix(y_test, y_pred),
        'prediction': y_pred,
    }


def train(source, data, n_features, days_ahead, backtest=False, starting_year_month=None, training_months=None, chip = False):
    '''
    Fit the four base models and a stacking model, then score them on held-out data.

    - source: the source of the data
    - data: a pandas dataframe with a column named 'content_cleaned'
      (and a datetime 'date' column when backtest is True)
    - n_features: the number of features to select
    - days_ahead: the number of days ahead to predict
    - backtest: when False, split randomly 80/20; when True, train on
      [starting_year_month, starting_year_month + training_months) and test on
      the month that follows
    - starting_year_month: timestamp of the first training month (backtest only)
    - training_months: length of the training window in months (backtest only)
    - chip: forwarded to feature_extraction to append the chip columns
    - returns: a dict keyed by model name ('Naive Bayes', 'Random Forest', 'SVM',
      'XGBoost', 'stack'), each holding 'accuracy', 'confusion' and 'prediction'
    - raises: ValueError when the training or the testing set is empty
    '''
    # Drop the rows where label = -1
    data = data[data['label_day'+str(days_ahead)] != -1]
    # NOTE(review): feature_extraction fits TF-IDF and the chi-square selector on
    # every row passed in, i.e. before the train/test split below, so the rows
    # later used for testing (and their labels) influence the selected features.
    X, y = feature_extraction(source, data, n_features, days_ahead, chip = chip)
    # densify the sparse matrix
    X = X.toarray()
    # split the data into training and testing sets
    if backtest:
        train_start = starting_year_month
        train_end = train_start + pd.DateOffset(months=training_months)
        test_end = train_start + pd.DateOffset(months=training_months+1)
        print("Training on the data from", train_start, "to", train_end)
        train_range = (data['date'] >= train_start) & (data['date'] < train_end)
        print("Testing on the data from", train_end, "to", test_end)
        test_range = (data['date'] >= train_end) & (data['date'] < test_end)
        # X is positional (rows follow `data`), y is still indexed like `data`
        X_train, X_test = X[train_range.to_numpy()], X[test_range.to_numpy()]
        y_train, y_test = y[train_range], y[test_range]
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print('Training set:', X_train.shape, y_train.shape)
    print('Testing set:', X_test.shape, y_test.shape)
    # Fail before the (slow) grid search rather than after it
    if X_train.shape[0] == 0 or X_test.shape[0] == 0:
        raise ValueError('The training or the testing set is empty; check the date window against the data.')
    # kfold
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    models = {
        'Naive Bayes': GaussianNB(),
        # seeded like the split and the folds so that repeated runs agree
        'Random Forest': RandomForestClassifier(random_state=42),
        'SVM': SVC(),
        'XGBoost': XGBClassifier()
    }
    # search for the optimal parameters for each model by grid search
    best_models = {}
    for model_name, model in models.items():
        grid_search = GridSearchCV(model, params[model_name], cv=kfold, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_models[model_name] = grid_search.best_estimator_
    # stack the tuned models under a logistic-regression meta-learner
    stack = StackingClassifier(estimators=list(best_models.items()), final_estimator=LogisticRegression())
    stack.fit(X_train, y_train)
    # score every tuned model and the stack on the test set
    result = {name: _evaluate(model, X_test, y_test) for name, model in best_models.items()}
    result['stack'] = _evaluate(stack, X_test, y_test)
    return result

- 任務一：針對不同內容種類 (ptt, dcard, 新聞) 選取不同的 d (天數)和 k (特徵數) 進行模型訓練，並比較結果。
- 任務二：把所有資料 (ptt, dcard, 新聞) 串接起來，選取 d = 3 和不同的 k (特徵數) 進行模型訓練，並比較結果。

In [6]:
# Load the cleaned data of every source (the train/test split happens inside train())
datas = {
    source: pd.read_csv(f'../data/{source}_cleaned.csv')
    for source in ('all', 'ptt', 'dcard', 'news')
}

In [7]:
# Train the models for each source media and get the results
# The progress bar is imported under its own alias: later cells run
# `from tqdm import tqdm`, which rebinds the name `tqdm` from the module to the
# class and would make `tqdm.tqdm(...)` fail if this cell is re-run afterwards.
from tqdm import tqdm as progress_bar

results = {}
for name, data in datas.items():
    # drop nan for content_cleaned
    data = data.dropna(subset=['content_cleaned'])
    print(f'Training models for {name}')
    # filled one horizon at a time so partial results survive an interrupted run
    results[name] = {}
    for days_ahead in progress_bar(range(1, 6)):
        results[name][days_ahead] = train(name, data, n_features, days_ahead)

Training models for all


  0%|          | 0/5 [00:00<?, ?it/s]

Training set: (7374, 500) (7374,)
Testing set: (1844, 500) (1844,)


 20%|██        | 1/5 [06:13<24:52, 373.24s/it]

Training set: (7428, 500) (7428,)
Testing set: (1858, 500) (1858,)


 20%|██        | 1/5 [08:17<33:08, 497.25s/it]


KeyboardInterrupt: 

In [None]:
# Experiment with data['all'] for days ahead = 3 and number of features
# Candidate values for the number of selected features.
n_features_all = [10, 50, 100, 500, 800, 1000, 2000, 4000]
# Maps each candidate value to the result dict returned by train().
results_all = {}
# Overwrites the shared datas['all'] entry with its NaN-free version.
datas['all'] = datas['all'].dropna(subset=['content_cleaned'])
for num in n_features_all:
    # Mutates the shared n_features dict; after the loop n_features['all'] is
    # left at the last candidate value.
    n_features['all'] = num
    print(f'Training models for all with {num} features')
    results_all[num] = train('all', datas['all'], n_features, 3)

In [None]:
# create a dataframe to store the results
def create_results_df(results, days_ahead=3):
    '''
    Flatten nested training results into one dataframe row per (feature count, model).

    - results: dict mapping number of features -> {model name -> result dict};
      each result dict holds 'accuracy', 'confusion' and 'prediction'
    - days_ahead: value written to the 'days_ahead' column (default 3, the
      horizon used by the feature-count experiment)
    - returns: a dataframe with the columns 'number of features', 'days_ahead',
      'model', 'accuracy', 'confusion' and 'prediction'; the last two are
      stored as strings
    '''
    columns = ['number of features', 'days_ahead', 'model', 'accuracy', 'confusion', 'prediction']
    # Collect all rows first and build the frame once: appending with pd.concat
    # inside the loop copies the frame on every row and leaves object dtypes.
    records = [
        {
            'number of features': num,
            'days_ahead': days_ahead,
            'model': model,
            'accuracy': res['accuracy'],
            'confusion': str(res['confusion']),
            'prediction': str(res['prediction']),
        }
        for num, result in results.items()
        for model, res in result.items()
    ]
    return pd.DataFrame(records, columns=columns)

# Best-scoring rows first, then persist the table.
results_df = create_results_df(results_all).sort_values(by='accuracy', ascending=False)
results_df.to_csv('../data/results/results_all.csv', index=False)

In [None]:
# flatten the results: one entry per (source, days ahead, model),
# keyed as '{source}_{days_ahead}_{model}'
results_flat = {
    f'{name}_{days_ahead}_{model_name}': model_result
    for name, result in results.items()
    for days_ahead, models in result.items()
    for model_name, model_result in models.items()
}

# save the flattened results, best accuracy first
results_df = pd.DataFrame(results_flat).T.sort_values(by='accuracy', ascending=False)
# Double quotes inside the f-string: reusing the outer quote character is a
# SyntaxError before Python 3.12.
# NOTE(review): the playground cells read prob2_sorted_f*.csv from
# ../data/results/ while this cell writes to ../data/ - confirm which
# location is intended.
results_df.to_csv(f'../data/prob2_sorted_f{n_features["news"]}.csv')

## Part 4. Backtesting

In [None]:
# import packages
from tqdm import tqdm

### 移動回測
對每一組四個月的資料，我們將前三個月視為訓練資料集、第四個月視為測試資料集，選取 d = 3 和不同的 k (特徵數) 進行模型訓練

In [None]:
# Starting months of the rolling windows: '2022-03', ..., '2023-11'.
# Built with period_range because the 'M' alias is deprecated for date_range
# in recent pandas; the resulting list is the same as before.
months = pd.period_range(start='2022-03', end='2023-11', freq='M').strftime('%Y-%m').tolist()

# num_features
n_features['all'] = 500

# load the cleaned data
datas = {
    'all': pd.read_csv('../data/all_cleaned.csv'),
}

# every window trains on 3 months and tests on the month that follows
training_months = 3

# test each month by training on the data of the previous three months
results = {}
for name, data in datas.items():
    print(f'Training models for {name}')
    data = data.dropna(subset=['content_cleaned'])
    # assign() returns a new frame instead of writing into the filtered one
    data = data.assign(date=pd.to_datetime(data['date']))
    results[name] = {}
    for month in tqdm(months):
        starting_month = pd.to_datetime(month)
        results[name][month] = train(name, data, n_features, 3, backtest=True, starting_year_month=starting_month, training_months=training_months)

對於每一個月，選擇表現最佳的模型並儲存其預測結果與模型準確率

In [None]:
# flatten the results: one row per (starting month, model); the arrays are
# stored as strings so they survive the CSV round trip
backtest_columns = ['starting month', 'model', 'accuracy', 'confusion', 'prediction']
backtest_records = [
    {
        'starting month': month,
        'model': model_name,
        'accuracy': model_result['accuracy'],
        'confusion': str(model_result['confusion']),
        'prediction': str(model_result['prediction']),
    }
    for name, result in results.items()
    for month, models in result.items()
    for model_name, model_result in models.items()
]
# build the frame once instead of concatenating row by row
df_backtest_result = pd.DataFrame(backtest_records, columns=backtest_columns)

# select the best model for each month (the first one wins on ties)
df_backtest_result['accuracy'] = df_backtest_result['accuracy'].astype(float)
# idxmax per group + .loc keeps every column, including 'starting month',
# without relying on groupby.apply passing the grouping column to the callback
best_row_labels = df_backtest_result.groupby('starting month')['accuracy'].idxmax()
best_models = df_backtest_result.loc[best_row_labels].reset_index(drop=True)
best_models.to_csv(f'../data/results/backtest_best_models_f{n_features["all"]}.csv', index=False)

## Part 5. Make prediction and compute the final result

In [None]:
# import packages
import pandas as pd
import numpy as np
# Disable pandas' chained-assignment (SettingWithCopy) warning for the rest of the session.
pd.options.mode.chained_assignment = None

In [None]:
# load the prediction data: the best model per month for each feature setting
prediction_datas = {
    setting: pd.read_csv(f'../data/results/backtest_best_models_f{setting}.csv')
    for setting in (500, 800, 1000, '500_chip')
}

days_ahead = 3

# prepare the data: same row filters as train() applies (non-empty cleaned
# text, label != -1), with 'date' parsed to datetime
data_all = (
    pd.read_csv('../data/all_cleaned.csv')
    .dropna(subset=['content_cleaned'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
data_all = data_all[data_all['label_day'+str(days_ahead)] != -1]

### 對每天的文章用上一部分的最佳模型進行預測，並將預測結果輸出成 csv 檔

In [None]:
# Starting months of the rolling windows: '2022-03', ..., '2023-11'.
# Built with period_range because the 'M' alias is deprecated for date_range
# in recent pandas; the resulting list is the same as before.
months = pd.period_range(start='2022-03', end='2023-11', freq='M').strftime('%Y-%m').tolist()

training_months = 3  # length of the training window in months

df_prediction_results = {}

for num, data in prediction_datas.items():
    monthly_counts = []
    for month in months:
        predictions_str = data.loc[data['starting month'] == month, 'prediction'].iloc[0]
        # The predictions were saved as str(numpy array), e.g. "[0 1 1\n 0 1]".
        # numpy abbreviates long arrays with '...', which cannot be recovered.
        if '...' in predictions_str:
            raise ValueError(f'The saved predictions for {month} were truncated by numpy and cannot be restored.')
        # split() handles the line breaks and any run of spaces in the printed array
        predictions = np.array(predictions_str.strip('[]').replace('.', '').split(), dtype=int)
        month_to_predict = pd.to_datetime(month) + pd.DateOffset(months=training_months)
        in_test_month = (data_all['date'] >= month_to_predict) & (data_all['date'] < month_to_predict + pd.DateOffset(months=1))
        # the predictions are matched to the articles by position only
        if len(predictions) != in_test_month.sum():
            raise ValueError(f'{month}: {len(predictions)} predictions for {in_test_month.sum()} articles.')
        # assign() builds a new frame instead of writing into a slice of data_all
        data_to_predict = data_all[in_test_month].assign(prediction=predictions)
        # number of 0 / 1 predictions for each day
        prediction_results = (
            data_to_predict.groupby('date')['prediction']
            .value_counts()
            .unstack()
            .fillna(0)
            .reset_index()
        )
        monthly_counts.append(prediction_results)
    # one concat per setting; a class that is absent in some month becomes 0
    df_prediction_results[num] = pd.concat(monthly_counts, axis=0).fillna(0).reset_index(drop=True)

### 將預測結果與真實結果比較，計算出最後的出手率、準確率

In [None]:
# Real label per date, used to score the daily predictions:
# one row per unique date (the first occurrence is kept)
df_label = (
    data_all[['date', 'label_day'+str(days_ahead)]]
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .drop_duplicates(subset=['date'])
)

In [None]:
# merge the label with the prediction results
for num, df in df_prediction_results.items():
    df = df.merge(df_label, on='date', how='left')
    # columns 0 and 1 hold the number of articles predicted 0 / 1 on that date
    zeros_lead = df[0] > df[1]
    ones_lead = df[1] > df[0]
    # majority vote: more 0s -> 0; more 1s -> 1; tie -> -1 (no decision)
    df['final_prediction'] = np.select([zeros_lead, ones_lead], [0, 1], default=-1)
    df['make_decision'] = (zeros_lead | ones_lead).astype(int)
    df['correct'] = (df['final_prediction'] == df['label_day'+str(days_ahead)]).astype(int)
    df['month'] = df['date'].dt.to_period('M')
    df.to_csv(f'../data/results/predictions_by_date_f{num}.csv', index=False)

In [None]:
# Compute the decision rate and the accuracy for each month
prediction_results = {
    setting: pd.read_csv(f'../data/results/predictions_by_date_f{setting}.csv')
    for setting in (500, 800, 1000, '500_chip')
}

for num, df in prediction_results.items():
    df['date'] = pd.to_datetime(df['date'])
    # decision rate: share of days with a non-tied vote, over all days
    decision_rate = df.groupby('month')[['make_decision']].mean()
    # accuracy: only over the days on which a decision was made; tied days
    # (final_prediction == -1) can never match a label and would otherwise be
    # counted as wrong
    decided = df[df['final_prediction'] != -1]
    accuracy = decided.groupby('month')['correct'].mean()
    # joined on the month index, so a month without any decision gets NaN accuracy
    df_grouped = decision_rate.join(accuracy)[['correct', 'make_decision']].reset_index()
    df_grouped.to_csv(f'../data/results/final_rate_f{num}.csv', index=False)

## Part 6. 加入籌碼數據進行實驗

In [None]:
# import packages
from tqdm import tqdm

### 將籌碼數據加入特徵中，重新進行第三部分的模型訓練

In [None]:
# Train once on all sources with the chip columns appended (d = 3, k = 1000).
n_features['all'] = 1000
data_all_with_chip = (
    pd.read_csv('../data/all_cleaned_chip.csv')
    .dropna(subset=['content_cleaned'])
)
results_with_chip = {}
results_all = train('all', data_all_with_chip, n_features, 3, chip=True)

把上面的結果轉成 dataframe，並存成 csv 檔

In [None]:
# One row per model; the arrays are stored as strings for the CSV.
# The frame is built once from a list of records instead of concatenating
# row by row onto an empty frame.
chip_result_columns = ['number of features', 'days_ahead', 'model', 'accuracy', 'confusion', 'prediction']
chip_result_records = [
    {
        'number of features': 1000,
        'days_ahead': 3,
        'model': model_name,
        'accuracy': model_result['accuracy'],
        'confusion': str(model_result['confusion']),
        'prediction': str(model_result['prediction']),
    }
    for model_name, model_result in results_all.items()
]
results_df = pd.DataFrame(chip_result_records, columns=chip_result_columns)

results_df = results_df.sort_values(by='accuracy', ascending=False)
results_df.to_csv('../data/results/results_all_with_chip.csv', index=False)

### 將籌碼數據加入特徵中，重新進行第四部分的移動回測

In [None]:
# Starting months of the rolling windows: '2022-03', ..., '2023-11'.
# Built with period_range because the 'M' alias is deprecated for date_range
# in recent pandas; the resulting list is the same as before.
months = pd.period_range(start='2022-03', end='2023-11', freq='M').strftime('%Y-%m').tolist()

# num_features
n_features['all_chip'] = 500

# load the cleaned data
datas = {
    'all_chip': pd.read_csv('../data/all_cleaned_chip.csv'),
}

# every window trains on 3 months and tests on the month that follows
training_months = 3

# test each month by training on the data of the previous three months
results = {}
for name, data in datas.items():
    print(f'Training models for {name}')
    data = data.dropna(subset=['content_cleaned'])
    # assign() returns a new frame instead of writing into the filtered one
    data = data.assign(date=pd.to_datetime(data['date']))
    results[name] = {}
    for month in tqdm(months):
        starting_month = pd.to_datetime(month)
        results[name][month] = train(name, data, n_features, 3, backtest=True, starting_year_month=starting_month, training_months=training_months, chip=True)

In [None]:
# flatten the results: one row per (starting month, model); the arrays are
# stored as strings so they survive the CSV round trip
backtest_columns = ['starting month', 'model', 'accuracy', 'confusion', 'prediction']
backtest_records = [
    {
        'starting month': month,
        'model': model_name,
        'accuracy': model_result['accuracy'],
        'confusion': str(model_result['confusion']),
        'prediction': str(model_result['prediction']),
    }
    for name, result in results.items()
    for month, models in result.items()
    for model_name, model_result in models.items()
]
# build the frame once instead of concatenating row by row
df_backtest_result = pd.DataFrame(backtest_records, columns=backtest_columns)

# select the best model for each month (the first one wins on ties)
df_backtest_result['accuracy'] = df_backtest_result['accuracy'].astype(float)
# idxmax per group + .loc keeps every column, including 'starting month',
# without relying on groupby.apply passing the grouping column to the callback
best_row_labels = df_backtest_result.groupby('starting month')['accuracy'].idxmax()
best_models = df_backtest_result.loc[best_row_labels].reset_index(drop=True)
best_models.to_csv(f'../data/results/backtest_best_models_f{n_features["all_chip"]}_chip.csv', index=False)

## Playground

In [None]:
# Smoke test of the preprocessing: clean only the first 100 news articles
# and print the first cleaned document.
data = pd.read_csv('../data/news_filtered_merged.csv')
contents = data['content'].tolist()
contents_cleaned = word_segmentation(contents[:100], ws, pos, ner)
print(contents_cleaned[0])

In [None]:
# Create all_cleaned.csv from the per-source frames.
# `datas` is redefined by several cells and can hold the combined 'all' /
# 'all_chip' frames; those are skipped so the combined file is never folded
# back into itself. With no per-source frame loaded, pd.concat raises instead
# of overwriting the file with an empty table.
source_frames = [data for name, data in datas.items() if not name.startswith('all')]
data_all = pd.concat(source_frames)

data_all.to_csv('../data/all_cleaned.csv', index=False)

In [None]:
# Attach the per-date chip (institutional trading) data to every article.
df_all = pd.read_csv('../data/all_cleaned.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
df_chip = pd.read_csv('../data/籌碼數據-2年_by_date_standardized.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# left merge on date: every article is kept, dates without chip data get NaN
df_all_chip = df_all.merge(df_chip, on='date', how='left')

# save the merged data
df_all_chip.to_csv('../data/all_cleaned_chip.csv', index=False)

In [None]:
# Rewrite all_cleaned_chip.csv without the rows that lack cleaned content
# and with every remaining NaN replaced by 0.
df_all_chip = (
    pd.read_csv('../data/all_cleaned_chip.csv')
    .dropna(subset=['content_cleaned'])
    .fillna(0)
)
df_all_chip.to_csv('../data/all_cleaned_chip.csv', index=False)


In [None]:
def _load_prob2_results(num_features):
    '''Read the sorted results saved for one feature count and tag them with it.'''
    frame = pd.read_csv(f'../data/results/prob2_sorted_f{num_features}.csv')
    return frame.assign(num_features=num_features)


df_p2_100 = _load_prob2_results(100)
df_p2_200 = _load_prob2_results(200)
df_p2_300 = _load_prob2_results(300)
df_p2_400 = _load_prob2_results(400)
df_p2_500 = _load_prob2_results(500)
df_p2_600 = _load_prob2_results(600)
df_p2_700 = _load_prob2_results(700)

In [None]:
# stack the per-feature-count result tables into one frame with a fresh index
frames_by_feature_count = [df_p2_100, df_p2_200, df_p2_300, df_p2_400, df_p2_500, df_p2_600, df_p2_700]
df_p2 = pd.concat(frames_by_feature_count, ignore_index=True)

In [None]:
# best accuracy first, then save
df_p2 = df_p2.sort_values(by='accuracy', ascending=False)
df_p2.to_csv('../data/results/prob2_sorted.csv', index=False)

In [None]:
import pandas as pd
import numpy as np

df_predictions_by_date = pd.read_csv('../data/results/predictions_by_date_f500_chip.csv')
# confusion table: rows are the daily final_prediction, columns the real label_day3
confusion = pd.crosstab(
    index=df_predictions_by_date['final_prediction'],
    columns=df_predictions_by_date['label_day3'],
)
print(confusion)


In [None]:
# share of days on which the vote was not tied
make_decision_rate = df_predictions_by_date['make_decision'].mean()
print(make_decision_rate)

In [None]:
# keep only the days with a decision (final_prediction != -1)
has_decision = df_predictions_by_date['final_prediction'].ne(-1)
df_predictions_by_date = df_predictions_by_date[has_decision]

# accuracy over the remaining days
accuracy = df_predictions_by_date['correct'].mean()
print(accuracy)



In [None]:
# plot the accuracy and make_decision rate for each month

import matplotlib.pyplot as plt
import seaborn as sns

df_predictions_by_date = pd.read_csv('../data/results/predictions_by_date_f500_chip.csv')
df_predictions_by_date['month'] = pd.to_datetime(df_predictions_by_date['month'])
# we shouldn't consider the case where final_prediction = -1 when calculating the accuracy
df_predictions_by_date_2 = df_predictions_by_date[df_predictions_by_date['final_prediction'] != -1]
# Group by month and join the two rates on the month itself. Attaching them by
# row position would shift the decision rate whenever a month has no decided
# day and is therefore missing from the accuracy series.
df_grouped = (
    df_predictions_by_date.groupby('month')[['make_decision']].mean()
    .join(df_predictions_by_date_2.groupby('month')['correct'].mean())
    .reset_index()
)
# plot the accuracy and make_decision rate
plt.figure(figsize=(10, 6))
plt.plot(df_grouped['month'], df_grouped['correct'], label='Accuracy')
plt.plot(df_grouped['month'], df_grouped['make_decision'], label='Make Decision Rate')
plt.xlabel('Month')
plt.ylabel('Rate')
plt.title('Accuracy and Make Decision Rate by Month')
plt.legend()
# make the tick labels bigger and tilt the month labels
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)
plt.show()


