# Predicting Apple Stock trend with Random Forest Classifier Model

A Random Forest classifier is used to identify the daily trend (rise or fall) in Apple stock, using the data in the `AAPL.csv` file.

In [2]:
# Initial imports

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier


## Loading and Preprocessing Data

Load the `AAPL.csv` file into a pandas DataFrame called `df`.

In [3]:
# Read the AAPL CSV (path is relative to the working directory) into `df`
file_path = Path("AAPL.csv")
df = pd.read_csv(filepath_or_buffer=file_path)
# Preview the first rows
df.head()


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,ts_polarity,twitter_volume
0,2016-01-04,25.65,26.34,25.5,26.34,24.44,270597600,0.070389,1133.0
1,2016-01-05,26.44,26.46,25.6,25.68,23.83,223164000,0.133635,1430.0
2,2016-01-06,25.14,25.59,24.97,25.17,23.36,273829600,0.072042,1949.0
3,2016-01-07,24.67,25.03,24.11,24.11,22.38,324377600,0.074369,2289.0
4,2016-01-08,24.64,24.78,24.19,24.24,22.5,283192000,0.051595,2235.0


In [4]:
# Keep only the columns used downstream: Date, Adj Close, Volume,
# ts_polarity and twitter_volume
selected_columns = ["Date", "Adj Close", "Volume", "ts_polarity", "twitter_volume"]
appl_df = df[selected_columns]
appl_df.head()

Unnamed: 0,Date,Adj Close,Volume,ts_polarity,twitter_volume
0,2016-01-04,24.44,270597600,0.070389,1133.0
1,2016-01-05,23.83,223164000,0.133635,1430.0
2,2016-01-06,23.36,273829600,0.072042,1949.0
3,2016-01-07,22.38,324377600,0.074369,2289.0
4,2016-01-08,22.5,283192000,0.051595,2235.0


In [5]:
# Drop rows containing any missing value, then index the frame by its "Date" column
appl_df = appl_df.dropna().set_index("Date")
appl_df.tail()

Unnamed: 0_level_0,Adj Close,Volume,ts_polarity,twitter_volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-08-26,51.12,104174400,0.07234,888.0
2019-08-27,50.54,103493200,0.117541,962.0
2019-08-28,50.88,63755200,0.061477,895.0
2019-08-29,51.74,83962000,0.05646,1083.0
2019-08-30,51.67,84573600,0.106096,1005.0


In [6]:
# Label each row's ts_polarity score as Positive, Negative or Neutral sentiment:
#   score >= 0.05   -> "Positive"
#   score <= -0.05  -> "Negative"
#   anything else   -> "Neutral"
polarity = appl_df["ts_polarity"]

# Vectorised if / elif / else: conditions are evaluated in order and rows that
# match none of them receive the default label.
sentiment = np.select(
    [polarity >= 0.05, polarity <= -0.05],
    ["Positive", "Negative"],
    default="Neutral",
)

# assign() returns a new frame, so no column is written into a derived frame in place
appl_df = appl_df.assign(Sentiment=sentiment)
appl_df.head()

Unnamed: 0_level_0,Adj Close,Volume,ts_polarity,twitter_volume,Sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-04,24.44,270597600,0.070389,1133.0,Positive
2016-01-05,23.83,223164000,0.133635,1430.0,Positive
2016-01-06,23.36,273829600,0.072042,1949.0,Positive
2016-01-07,22.38,324377600,0.074369,2289.0,Positive
2016-01-08,22.5,283192000,0.051595,2235.0,Positive


In [7]:
# Number of rows in each sentiment class, most frequent first
sentiment_counts = appl_df["Sentiment"].value_counts()
sentiment_counts

Positive    785
Neutral     134
Negative      3
Name: Sentiment, dtype: int64

In [8]:
# Stock trend from the difference between each row's adjusted close and the
# previous row's: 1 marks a rise, 0 marks a fall or an unchanged price.
# NOTE(review): Trend describes the same row as the features it sits beside,
# not the following day - confirm this is the intended prediction target.
daily_change = appl_df["Adj Close"].diff()
appl_df = (
    appl_df
    .assign(**{"Price Diff": daily_change})
    .dropna()  # the first row has no predecessor, so its diff is NaN
    .assign(Trend=lambda frame: np.where(frame["Price Diff"] > 0, 1, 0))
)

appl_df.head()

Unnamed: 0_level_0,Adj Close,Volume,ts_polarity,twitter_volume,Sentiment,Price Diff,Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-05,23.83,223164000,0.133635,1430.0,Positive,-0.61,0
2016-01-06,23.36,273829600,0.072042,1949.0,Positive,-0.47,0
2016-01-07,22.38,324377600,0.074369,2289.0,Positive,-0.98,0
2016-01-08,22.5,283192000,0.051595,2235.0,Positive,0.12,1
2016-01-11,22.86,198957600,0.019443,1222.0,Neutral,0.36,1


In [9]:
# Select the model inputs plus the Trend label, then one-hot encode Sentiment
# into Sentiment_<label> indicator columns.
appl_trend = appl_df[["Adj Close", "Volume", 'twitter_volume', "Sentiment", "Trend"]]
# dtype=int pins the indicator columns to 0/1 integers; without it, newer pandas
# releases produce True/False columns instead.
appl_trend = pd.get_dummies(appl_trend, columns=["Sentiment"], dtype=int)
appl_trend.head()

Unnamed: 0_level_0,Adj Close,Volume,twitter_volume,Trend,Sentiment_Negative,Sentiment_Neutral,Sentiment_Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-05,23.83,223164000,1430.0,0,0,0,1
2016-01-06,23.36,273829600,1949.0,0,0,0,1
2016-01-07,22.38,324377600,2289.0,0,0,0,1
2016-01-08,22.5,283192000,2235.0,1,0,0,1
2016-01-11,22.86,198957600,1222.0,1,0,1,0


In [10]:
# Feature set: every column of appl_trend except the Trend label
X = appl_trend.drop(columns="Trend")
X.head()


Unnamed: 0_level_0,Adj Close,Volume,twitter_volume,Sentiment_Negative,Sentiment_Neutral,Sentiment_Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-05,23.83,223164000,1430.0,0,0,1
2016-01-06,23.36,273829600,1949.0,0,0,1
2016-01-07,22.38,324377600,2289.0,0,0,1
2016-01-08,22.5,283192000,2235.0,0,0,1
2016-01-11,22.86,198957600,1222.0,0,1,0


In [11]:
# Target vector: the Trend label reshaped into a single-column 2-D array
trend_labels = appl_trend["Trend"].to_numpy()
y = trend_labels.reshape(-1, 1)
y[:5]


array([[0],
       [0],
       [0],
       [1],
       [1]])

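Split the data into training and testing sets, keeping the row order: the first 70% of rows are used for training and the remaining 30% for testing.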

In [12]:
# Ordered (unshuffled) 70/30 split: the first 70% of rows form the training
# set and the remaining rows form the test set
split = int(0.7 * len(X))

X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y[:split], y[split:]

In [13]:
# Scale the features with StandardScaler: the scaler is fitted on the training
# rows only and then applied to both the training and the test rows
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

 ## Create a Random Forest Classifier Model


In [14]:
# Build a 500-tree random forest with a fixed random_state and fit it on the
# scaled training features; ravel() flattens the single-column labels to 1-D
rf_model = RandomForestClassifier(n_estimators=500, random_state=78).fit(
    X_train_scaled, y_train.ravel()
)


## Making Predictions Using the Random Forest Classifier Model

In [15]:
# Make predictions for the test rows
predictions = rf_model.predict(X_test_scaled)

# Show the first 20 predicted labels next to the actual ones. display() is
# required here: a bare expression is only rendered when it is the last
# statement of a cell, so the table was previously built and thrown away.
results_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test.ravel()})
display(results_df.head(20))

# Generate accuracy score for predictions using y_test
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.5306859205776173


## Model Evaluation

Evaluate the model's results, using `sklearn` to calculate the confusion matrix and to generate the classification report.

In [16]:
# Generating the confusion matrix.
# labels=[0, 1] fixes the matrix at 2x2 in that class order, so the hard-coded
# row/column names below stay valid even if one class is absent from the test
# labels or the predictions.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
cm_df = pd.DataFrame(
    cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,37,90
Actual 1,40,110


In [17]:
# Print a heading followed by the classification report for the test predictions
report_text = classification_report(y_test, predictions)
print("Classification Report", report_text, sep="\n")


Classification Report
              precision    recall  f1-score   support

           0       0.48      0.29      0.36       127
           1       0.55      0.73      0.63       150

    accuracy                           0.53       277
   macro avg       0.52      0.51      0.50       277
weighted avg       0.52      0.53      0.51       277

