### Gradient Boosting

In [1]:
import os
from pathlib import Path

import pandas as pd

# Directory that holds the pickled feature matrix and labels.
# The original machine-specific location stays the default; set the
# EARNINGS_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = Path(os.environ.get('EARNINGS_DATA_DIR', '/Volumes/MacAirSSD/Earnings/data'))

# NOTE: unpickling executes arbitrary code -- only load files you trust.
X = pd.read_pickle(DATA_DIR / 'X.pkl')
y = pd.read_pickle(DATA_DIR / 'y.pkl')

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

In [5]:
# Train/test split: hold out 20% of the rows for evaluation.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:

# Baseline Gradient Boosting model: 100 boosting stages, learning rate 0.1, fixed seed.
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Fit on the training split only.
gb_classifier.fit(X_train, y_train)

# Predict the held-out test split.
y_pred = gb_classifier.predict(X_test)

# Per-class precision / recall / F1 on the test split.
gb_report = classification_report(y_test, y_pred)
# The report is a multi-line, column-aligned table; start it on its own line
# so the header row is not pushed out of alignment by the label text.
print(f'Classification GB report: \n{gb_report}')


Classification GB report:               precision    recall  f1-score   support

           0       0.93      0.80      0.86     12333
           1       0.85      0.95      0.90     14972

    accuracy                           0.88     27305
   macro avg       0.89      0.88      0.88     27305
weighted avg       0.89      0.88      0.88     27305



In [8]:
import random

# Three distinct model seeds in [100, 200], returned in ascending order.
# A dedicated, seeded generator keeps the list identical across kernel
# restarts; sample() (unlike repeated randint()) cannot return duplicates,
# so every seed yields a separate experiment.
random_state_list = sorted(random.Random(42).sample(range(100, 201), 3))


In [10]:
for random_state in random_state_list:
    # Same hyper-parameters on every pass; only the estimator seed changes.
    gb_classifier = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        random_state=random_state,
    )

    # Train on the training split.
    gb_classifier.fit(X_train, y_train)

    # Score the held-out test split.
    y_pred = gb_classifier.predict(X_test)

    # Report for this seed; the seed is printed after the table.
    gb_report = classification_report(y_test, y_pred)
    print(f'Classification GB report: \n{gb_report} for random state: {random_state}')

Classification GB report: 
              precision    recall  f1-score   support

           0       0.93      0.80      0.86     12333
           1       0.85      0.95      0.90     14972

    accuracy                           0.88     27305
   macro avg       0.89      0.88      0.88     27305
weighted avg       0.89      0.88      0.88     27305
 for random state: 144
Classification GB report: 
              precision    recall  f1-score   support

           0       0.93      0.80      0.86     12333
           1       0.85      0.95      0.90     14972

    accuracy                           0.88     27305
   macro avg       0.89      0.88      0.88     27305
weighted avg       0.89      0.88      0.88     27305
 for random state: 188
Classification GB report: 
              precision    recall  f1-score   support

           0       0.93      0.80      0.86     12333
           1       0.85      0.95      0.90     14972

    accuracy                           0.88     27305
   

In [11]:
from sklearn.model_selection import cross_val_score, KFold

# Unfitted estimator; cross_val_score fits a fresh clone on each fold.
gb_classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)

# Number of cross-validation folds.
k = 5

# Shuffled, seeded splitter so the fold assignment is repeatable.
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Runs on the full X / y (not the X_train split). No `scoring` is given, so
# the estimator's default score is used.
scores = cross_val_score(estimator=gb_classifier, X=X, y=y, cv=kf)

# One line per fold, numbered from 1.
for fold, score in enumerate(scores, 1):
    print(f"Fold {fold}: Accuracy = {score:.2f}")

# Average over all folds.
mean_accuracy = scores.mean()
print(f"Mean Accuracy: {mean_accuracy:.2f}")

Fold 1: Accuracy = 0.88
Fold 2: Accuracy = 0.89
Fold 3: Accuracy = 0.88
Fold 4: Accuracy = 0.88
Fold 5: Accuracy = 0.88
Mean Accuracy: 0.88


In [96]:
# Load the earnings table, keep the columns from position 6 onward, and add a
# binary `label` column: 1 where sma50_close is greater than sma200_close, else 0.
unseen = (
    pd.read_pickle('../../data/earnings_all.pkl')
    .iloc[:, 6:]
    .assign(label=lambda frame: (frame.sma50_close > frame.sma200_close).astype(int))
)
unseen

Unnamed: 0,close,volume,sma50_close,sma200_close,sma50_vol,sma200_vol,Topic_1,Topic_2,Topic_3,Topic_4,...,Topic_8,Topic_9,Topic_10,LM_Positive,LM_Negative,LM_Uncertainty,neg,neu,pos,label
0,1.010000,269000.0,1.01784,1.048025,148050.0,268928.0,0.030310,0.000254,0.000254,0.047771,...,0.000254,0.000254,0.000254,15,19,9,0.048,0.787,0.165,0
1,56.540001,1492700.0,51.31960,63.367150,257242.0,275629.0,0.000110,0.186299,0.040406,0.000110,...,0.000301,0.010848,0.000110,60,36,9,0.041,0.801,0.159,0
2,3.900000,83400.0,3.97932,3.639174,18044.0,21573.0,0.012019,0.000269,0.985829,0.000269,...,0.000269,0.000269,0.000269,19,10,4,0.066,0.735,0.199,1
3,0.756000,2400.0,0.90028,1.239935,8786.0,10768.5,0.000071,0.121839,0.080557,0.000071,...,0.255873,0.000071,0.029196,110,166,38,0.074,0.737,0.189,0
4,3.300000,0.0,2.92980,2.804179,12442.0,6383.5,0.000164,0.000164,0.084116,0.000172,...,0.914567,0.000164,0.000164,46,71,8,0.115,0.664,0.220,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,38.500000,747400.0,34.42660,30.863250,389876.0,366808.5,0.000188,0.091141,0.715725,0.000188,...,0.000188,0.000188,0.000188,28,24,12,0.047,0.670,0.283,1
3996,6.340000,1444200.0,6.39980,6.517150,1356254.0,1575219.0,0.000112,0.097691,0.874183,0.000112,...,0.000112,0.000112,0.000112,42,62,31,0.032,0.809,0.159,0
3997,66.180000,7128600.0,66.63600,72.481550,3406618.0,3557121.5,0.000095,0.000095,0.115883,0.000095,...,0.000095,0.000095,0.000095,57,53,43,0.029,0.856,0.115,0
3998,11.590000,4440100.0,9.27560,9.460650,745220.0,653612.5,0.000466,0.000466,0.046306,0.000466,...,0.000466,0.000466,0.949969,9,20,4,0.037,0.758,0.205,0


In [98]:
# Remove every row containing at least one missing value, then split off the
# feature columns by dropping `label`.
unseen = unseen.dropna(axis=0, how='any')
X1 = unseen.drop(columns='label')
X1

Unnamed: 0,close,volume,sma50_close,sma200_close,sma50_vol,sma200_vol,Topic_1,Topic_2,Topic_3,Topic_4,...,Topic_7,Topic_8,Topic_9,Topic_10,LM_Positive,LM_Negative,LM_Uncertainty,neg,neu,pos
0,1.010000,269000.0,1.01784,1.048025,148050.0,268928.0,0.030310,0.000254,0.000254,0.047771,...,0.000254,0.000254,0.000254,0.000254,15,19,9,0.048,0.787,0.165
1,56.540001,1492700.0,51.31960,63.367150,257242.0,275629.0,0.000110,0.186299,0.040406,0.000110,...,0.000110,0.000301,0.010848,0.000110,60,36,9,0.041,0.801,0.159
2,3.900000,83400.0,3.97932,3.639174,18044.0,21573.0,0.012019,0.000269,0.985829,0.000269,...,0.000269,0.000269,0.000269,0.000269,19,10,4,0.066,0.735,0.199
3,0.756000,2400.0,0.90028,1.239935,8786.0,10768.5,0.000071,0.121839,0.080557,0.000071,...,0.000071,0.255873,0.000071,0.029196,110,166,38,0.074,0.737,0.189
4,3.300000,0.0,2.92980,2.804179,12442.0,6383.5,0.000164,0.000164,0.084116,0.000172,...,0.000164,0.914567,0.000164,0.000164,46,71,8,0.115,0.664,0.220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,38.500000,747400.0,34.42660,30.863250,389876.0,366808.5,0.000188,0.091141,0.715725,0.000188,...,0.000188,0.000188,0.000188,0.000188,28,24,12,0.047,0.670,0.283
3996,6.340000,1444200.0,6.39980,6.517150,1356254.0,1575219.0,0.000112,0.097691,0.874183,0.000112,...,0.000112,0.000112,0.000112,0.000112,42,62,31,0.032,0.809,0.159
3997,66.180000,7128600.0,66.63600,72.481550,3406618.0,3557121.5,0.000095,0.000095,0.115883,0.000095,...,0.878591,0.000095,0.000095,0.000095,57,53,43,0.029,0.856,0.115
3998,11.590000,4440100.0,9.27560,9.460650,745220.0,653612.5,0.000466,0.000466,0.046306,0.000466,...,0.000466,0.000466,0.000466,0.949969,9,20,4,0.037,0.758,0.205


In [34]:
# Rebuild the baseline estimator with the same hyper-parameters and seed.
gb_classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)

# Train on the training split; later cells call predict on this fitted model.
gb_classifier.fit(X_train, y_train)


In [35]:
# Predict the held-out test split with the fitted classifier.
y_pred = gb_classifier.predict(X_test)

# Per-class precision / recall / F1 on the test split.
gb_report = classification_report(y_test, y_pred)
# The report is a multi-line, column-aligned table; start it on its own line
# so the header row is not pushed out of alignment by the label text.
print(f'Classification GB report: \n{gb_report}')

Classification GB report:               precision    recall  f1-score   support

           0       0.93      0.80      0.86     12333
           1       0.85      0.95      0.90     14972

    accuracy                           0.88     27305
   macro avg       0.89      0.88      0.88     27305
weighted avg       0.89      0.88      0.88     27305



In [99]:
# Show the column names of the training feature matrix X.
X.columns


Index(['close', 'volume', 'sma50_close', 'sma200_close', 'sma50_vol',
       'sma200_vol', 'Topic_1', 'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5',
       'Topic_6', 'Topic_7', 'Topic_8', 'Topic_9', 'Topic_10', 'LM_Positive',
       'LM_Negative', 'LM_Uncertainty', 'neg', 'neu', 'pos'],
      dtype='object')

In [100]:
# Show the column names of the unseen-data feature frame X1.
X1.columns

Index(['close', 'volume', 'sma50_close', 'sma200_close', 'sma50_vol',
       'sma200_vol', 'Topic_1', 'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5',
       'Topic_6', 'Topic_7', 'Topic_8', 'Topic_9', 'Topic_10', 'LM_Positive',
       'LM_Negative', 'LM_Uncertainty', 'neg', 'neu', 'pos'],
      dtype='object')

In [103]:
# True labels for the unseen rows, read from the `label` column of `unseen`.
y_unseen = unseen['label']


In [105]:
# Predict labels for the unseen rows; relies on gb_classifier having been fitted in an earlier cell.
unseen_pred = gb_classifier.predict(X1)  # X1 is `unseen` without its `label` column

# `unseen_pred` holds the predicted labels and can be compared against the true labels.

In [107]:
# Compare the true labels of the unseen rows with the model's predictions.
gb_report_on_unseen = classification_report(
    y_true=y_unseen,
    y_pred=unseen_pred,
)
print(f'Classification report on unseen data: \n{gb_report_on_unseen}')

Classification report on unseen data: 
              precision    recall  f1-score   support

           0       0.89      0.85      0.87      1927
           1       0.86      0.90      0.88      2016

    accuracy                           0.88      3943
   macro avg       0.88      0.87      0.88      3943
weighted avg       0.88      0.88      0.88      3943

