In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [5]:
# Read the combined stock/sentiment CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='combined_stock_sentiment_data.csv')

In [6]:
# Preview the top five rows to sanity-check the loaded columns
print(data.head(n=5))

         Date        Open        High         Low       Close   Adj Close  \
0  2024-09-18  217.550003  222.710007  217.539993  220.690002  220.690002   
1  2024-09-19  224.990005  229.820007  224.630005  228.869995  228.869995   
2  2024-09-20  229.970001  233.089996  227.619995  228.199997  228.199997   
3  2024-09-23  227.339996  229.449997  225.809998  226.470001  226.470001   
4  2024-09-24  228.649994  229.350006  225.729996  227.369995  227.369995   

      Volume title  sentiment  
0   59894900   NaN     0.3818  
1   66781300   NaN     0.3818  
2  318679900   NaN     0.3818  
3   54146000   NaN     0.3818  
4   43556100   NaN     0.3818  


In [7]:
# Target (y): the closing price column
y = data['Close']
# Features (X): price, volume and sentiment columns, with missing entries replaced by 0
X = data.loc[:, ['Open', 'High', 'Low', 'Volume', 'sentiment']].fillna(value=0)

In [8]:
# Standardize the feature columns with a StandardScaler.
# NOTE(review): the scaler is fitted on every row of X here, i.e. before any
# train/test partitioning happens further down.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [9]:
# Split data into training and testing sets.
# The split is performed on the unscaled features so that the scaler can be
# fitted on the training rows only; fitting it on every row would let the
# held-out rows' mean/variance leak into the training data.
# The row partition depends only on the number of samples and random_state,
# so the same rows land in each partition as when splitting scaled features.
# NOTE(review): the rows look like consecutive trading days and
# train_test_split shuffles by default, so test days end up interleaved with
# training days - confirm that is intended, or pass shuffle=False for a
# chronological hold-out.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a fresh scaler on the training partition, then apply it to both partitions.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [10]:
# Random forest regressor with 100 trees; the fixed seed keeps runs repeatable
model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)

In [11]:
# Fit the forest on the training partition
model.fit(X=X_train, y=y_train)

In [12]:
# 5-fold cross-validation on the training partition, scored by R^2
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=5,
)
# Report the per-fold scores, then their mean
print(f"Cross-Validation R^2 Scores: {cv_scores}")
print(f"Mean Cross-Validation R^2: {cv_scores.mean()}")

Cross-Validation R^2 Scores: [0.94097918 0.96731019 0.77233728 0.98857974 0.91314919]
Mean Cross-Validation R^2: 0.9164711172755284


In [13]:
# Predict the target for the held-out test partition
y_pred = model.predict(X=X_test)

In [14]:
# Score the test predictions: R^2 and mean squared error
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [15]:
# Report both test-set metrics, one per line
print(
    f'Mean Squared Error: {mse}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Squared Error: 0.4641472491061078
R^2 Score: 0.9602400594928939


In [16]:
# Display labels for the model inputs, listed in the same order as the feature columns
feature_names = [
    'Open',
    'High',
    'Low',
    'Volume',
    'Sentiment',
]
# Per-feature importance scores exposed by the fitted forest
importances = model.feature_importances_

In [17]:
# Display feature importances, most important first.
# The table is built and sorted in a single chained expression instead of
# sorting with inplace=True, so the name is bound once to its final value and
# re-running the cell never acts on a half-updated frame.
feature_importances = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
print("\nFeature Importances:\n", feature_importances)


Feature Importances:
      Feature    Importance
2        Low  3.999187e-01
0       Open  3.704356e-01
1       High  2.170998e-01
3     Volume  1.254585e-02
4  Sentiment  1.636477e-12
