# Commodity Price Movement Detection Based on News and Events
This notebook demonstrates how to predict commodity price movements using news articles and events. The following steps are performed:
- Text scraping (or loading dataset)
- Text preprocessing (NLP)
- Sentiment analysis
- Classification using machine learning models like Logistic Regression, Random Forest, and XGBoost
- Model explainability using SHAP (global feature importance) and LIME (explanation of an individual prediction)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import shap
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


### Load News Dataset and Preprocess

In [None]:
# Load the dataset (a CSV with 'news_text' and 'price_movement' columns)
df = pd.read_csv('commodity_news.csv')

# Fail fast with a clear message if the expected columns are absent, instead of
# surfacing a bare KeyError in a later cell.
missing_cols = {'news_text', 'price_movement'} - set(df.columns)
if missing_cols:
    raise ValueError(
        f"commodity_news.csv is missing required column(s): {sorted(missing_cols)}"
    )

# Preview the data
df.head()

### Text Preprocessing

In [None]:
# Target variable (price movement: 1 for up, 0 for down)
y = df['price_movement']

# Choose the train/test rows BEFORE fitting TF-IDF so that the vocabulary and
# IDF weights are learned from training articles only (no test-set leakage).
# Splitting row positions with the same test_size / random_state yields the
# same partition as splitting the feature matrix directly.
row_positions = np.arange(len(df))
train_rows, test_rows = train_test_split(row_positions, test_size=0.3, random_state=42)

# Preprocessing text using TF-IDF, fitted on the training articles only
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf.fit(df['news_text'].iloc[train_rows])

# Transform the whole corpus with the train-fitted vectorizer; X keeps one row
# per article, in the same order as df.
X = tfidf.transform(df['news_text'])

# Split the data into training and testing sets
X_train, X_test = X[train_rows], X[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

### Sentiment Analysis

In [None]:
# Sentiment Analysis using VADER
analyzer = SentimentIntensityAnalyzer()

# Compound VADER score for each news article
df['sentiment'] = df['news_text'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Give every TF-IDF column a string name. Mixing integer and string column
# names makes scikit-learn reject the frame at fit time. The "tfidf__" prefix
# also keeps a vocabulary term such as "sentiment" from colliding with the
# sentiment column (duplicate feature names break XGBoost).
tfidf_columns = [f"tfidf__{term}" for term in tfidf.get_feature_names_out()]

# Add sentiment as the last feature. Stacking raw arrays combines the two
# sources by row position, so the result does not depend on df's index.
X_sentiment = pd.DataFrame(
    np.column_stack([X.toarray(), df['sentiment'].to_numpy()]),
    columns=tfidf_columns + ['sentiment'],
)

# Split sentiment-enhanced data
X_train_sent, X_test_sent, y_train, y_test = train_test_split(X_sentiment, y, test_size=0.3, random_state=42)

### Logistic Regression

In [None]:
# Logistic Regression
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_sent, y_train)
y_pred_log = log_reg.predict(X_test_sent)
# ROC-AUC is a ranking metric, so it needs continuous scores. Column 1 of
# predict_proba is the probability of the positive (greater-label) class.
y_proba_log = log_reg.predict_proba(X_test_sent)[:, 1]
print("Logistic Regression Performance:")
print(confusion_matrix(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba_log))

### Random Forest

In [None]:
# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_sent, y_train)
y_pred_rf = rf.predict(X_test_sent)
# ROC-AUC is a ranking metric, so it needs continuous scores. Column 1 of
# predict_proba is the probability of the positive (greater-label) class.
y_proba_rf = rf.predict_proba(X_test_sent)[:, 1]
print("\nRandom Forest Performance:")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba_rf))

### XGBoost

In [None]:
# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train_sent, y_train)
y_pred_xgb = xgb.predict(X_test_sent)
# ROC-AUC is a ranking metric, so it needs continuous scores. Column 1 of
# predict_proba is the probability of the positive (greater-label) class.
y_proba_xgb = xgb.predict_proba(X_test_sent)[:, 1]
print("\nXGBoost Performance:")
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba_xgb))

### SHAP Analysis for Feature Importance

In [None]:
# SHAP Analysis for XGBoost
print("\nRunning SHAP Analysis for XGBoost...")
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X_test_sent)

# X_test_sent holds the TF-IDF columns followed by the sentiment column, so the
# label list needs one name per column: the vocabulary terms plus 'sentiment'.
shap_feature_names = list(tfidf.get_feature_names_out()) + ['sentiment']

# SHAP summary plot
shap.summary_plot(shap_values, X_test_sent, feature_names=shap_feature_names)

### LIME Explanation

In [None]:
# LIME Explanations
print("\nRunning LIME Explanations...")

# One display name per column: the TF-IDF vocabulary terms, then 'sentiment'.
lime_feature_names = list(tfidf.get_feature_names_out()) + ['sentiment']

# LIME explains a prediction by sampling perturbed copies of the instance;
# fixing random_state makes the explanation reproducible across runs, in line
# with the seed used for the splits and models.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train_sent.values,
    feature_names=lime_feature_names,
    class_names=['Down', 'Up'],
    discretize_continuous=True,
    random_state=42,
)

# Explain the first test article using the XGBoost class probabilities.
lime_exp = lime_explainer.explain_instance(X_test_sent.values[0], xgb.predict_proba)
lime_exp.show_in_notebook(show_table=True)