<a href="https://colab.research.google.com/github/YasiruMM/Medicine-Prediction-Grp-22/blob/Model_Training_for_Demand_Prediction/XGBooster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from google.colab import drive

In [None]:
# Mount Google Drive
drive.mount('/content/drive')

# Load the dataset
file_path = '/content/drive/My Drive/DSGP/MediTrackData.csv'
df = pd.read_csv(file_path)

Mounted at /content/drive


In [None]:
# Prepare feature (X) and target (Y) variables
X = df[['Disease Category', 'Drug Category', 'Drug Name', 'Dosage',
        'Retail Price', 'Purchase Price', 'Sales', 'Date',
        'Mean Sales', 'CV', 'Buffer Percentage', 'Buffer Stock']]
Y = df[['Sales']]

In [None]:
# Save 'Drug Name'& Disease Category before encoding
drug_names = X[['Drug Name']]
disease_categories = X[['Disease Category']]

In [None]:
# Convert 'Date' feature to numerical month values
X['Date'] = X['Date'].astype(float)

# Convert categorical variables into numerical labels (ONE-HOT-Encoding)
X = pd.get_dummies(X, columns=['Disease Category', 'Drug Category', 'Drug Name', 'Dosage'])

In [None]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [None]:
# Convert to DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


In [None]:
# Set the model parameters
params = {
    'objective': 'reg:squarederror',  # Regression task
    'eval_metric': 'rmse',            # Evaluation metric: root mean square error
    'max_depth': 6,                   # Maximum tree depth
    'learning_rate': 0.1,             # Learning rate
    'colsample_bytree': 0.8,          # Column sampling rate
    'subsample': 0.8                  # Subsample rate
}

In [None]:
# Train the model by getting involved with cross validation
num_boost_round = 100  # Number of boosting rounds
cv_results = xgb.cv(
    params=params,
    dtrain=xgb.DMatrix(X, label=Y),  # Use the entire dataset for CV
    num_boost_round=num_boost_round,
    nfold=5,  # Number of folds in cross-validation
    early_stopping_rounds=10,  # Stop early if the performance doesn't improve
    as_pandas=True,  # Returns the results as a pandas DataFrame
    verbose_eval=True  # Print progress
)
# Display cross-validation results
print(cv_results)


[0]	train-rmse:3136.89725+27.73960	test-rmse:3135.70188+112.46873
[1]	train-rmse:2828.10582+25.36101	test-rmse:2827.68646+101.27066
[2]	train-rmse:2549.84270+23.20631	test-rmse:2549.52701+91.79752
[3]	train-rmse:2298.80239+20.93807	test-rmse:2298.19217+83.94017
[4]	train-rmse:2072.73752+18.58261	test-rmse:2072.64427+76.71684
[5]	train-rmse:1869.39812+16.79673	test-rmse:1869.70132+69.80292
[6]	train-rmse:1685.79820+15.04311	test-rmse:1685.60407+63.61097
[7]	train-rmse:1520.31248+13.24631	test-rmse:1520.09773+58.28670
[8]	train-rmse:1371.11654+11.90280	test-rmse:1371.68162+53.31157
[9]	train-rmse:1236.70731+10.62813	test-rmse:1237.15908+49.08387
[10]	train-rmse:1115.72936+9.63982	test-rmse:1116.07666+44.32935
[11]	train-rmse:1013.38907+8.31440	test-rmse:1014.53767+40.28104
[12]	train-rmse:914.61465+7.59996	test-rmse:915.86753+36.94494
[13]	train-rmse:863.44551+5.78047	test-rmse:866.41888+34.24561
[14]	train-rmse:788.04024+5.05751	test-rmse:792.10858+30.52029
[15]	train-rmse:718.42634+3.9

In [None]:
#getting the best boosting round based on the evaluation metric
best_round = cv_results['test-rmse-mean'].idxmin()  # finding the best round with the minimum RMSE
print(f"Best boosting round: {best_round}")


Best boosting round: 85


In [None]:
# training the final model by using the best boosting round
model = xgb.train(
    params=params,
    dtrain=xgb.DMatrix(X, label=Y),
    num_boost_round=best_round  # Train with the optimal number of boosting rounds
)


In [None]:
# saving test dataset for future predictions
X_test_future=X_test.copy()




In [None]:
import numpy as np
# loop XGBooster for 6 months predictions
future_results=pd.DataFrame()
for month in range(1,7):
  # extract Months & Years seperately
  X_test_future['Month']=X_test_future['Date'].astype(int)
  X_test_future['Year']=(X_test_future['Date']-X_test_future['Month'])*100

  # Handle Month and Year updation
  X_test_future['Month']=X_test_future['Month']+1
  X_test_future['Year']=X_test_future['Year']+ (X_test_future['Month']//12) # handle Year overflow situations
  X_test_future['Month']=X_test_future['Month']%12 # Reset Months for exceeding 12 situations

  X_test_future['Date']=X_test_future['Month'] + X_test_future['Year']/100.0    # Reconstruction of Date Feature.
  X_test_future=X_test_future.drop(columns=['Month','Year']) # Drop Month and Year from Original dataset.
  dtest_future = xgb.DMatrix(X_test_future)
  predictions_future = np.round(model.predict(dtest_future)) # prediction for next month
  future_Month_Results=pd.DataFrame({
          'Disease Category': disease_categories.iloc[X_test_future.index]['Disease Category'],
          'Drug Name': drug_names.iloc[X_test_future.index]['Drug Name'],
          'Month': [month] * len(predictions_future),
          'Predicted Sales': predictions_future

      })

     # Concatenate instead of append
  future_results = pd.concat([future_results, future_Month_Results], ignore_index=True)




In [None]:
print(future_results.head())

  Disease Category         Drug Name  Month  Predicted Sales
0      Cholesterol     GLIDABET 80MG      1             92.0
1      Cholesterol    GLUCOZIDE 80MG      1           7869.0
2      Cholesterol  DIAZIDE TAB 80MG      1             30.0
3      Cholesterol       GLIVIC 40MG      1            122.0
4   Cardiovascular      LOWPRES 50MG      1            962.0


# R^2 and Coefficient of Variance Determination

In [None]:
from sklearn.metrics import r2_score

# Calculate R² score
r2 = r2_score(y_test, predictions_future)
print(f"R²: {r2}")

R²: 0.9990503787994385


In [None]:
import numpy as np

# Calculate Coefficient of Variation (CV)
cv =( np.std(predictions_future) / np.mean(predictions_future) )* 100
print(f"Coefficient of Variation (CV): {cv}%")

Coefficient of Variation (CV): 199.5741844177246%


In [None]:
# # Make predictions
# predictions = model.predict(dtest)

# # Create a DataFrame with the predictions and drug names
# results = pd.DataFrame({
#     'Drug Name': drug_names.iloc[X_test.index]['Drug Name'],  # Map back to the original Drug Name
#     'Disease Category': disease_categories.iloc[X_test.index]['Disease Category'],  # Map back to the original Disease Category
#     'Predicted Sales': predictions
# })

In [None]:
# Pivot the DataFrame so each drug has predictions in a single row
pivot_results = future_results.pivot_table(
    index=['Disease Category', 'Drug Name'],
    columns='Month',
    values='Predicted Sales'
).reset_index()

#Enhancing readability by renaming for all Predictions
pivot_results.columns = ['Disease Category', 'Drug Name', 'Prediction 1', 'Prediction 2', 'Prediction 3', 'Prediction 4', 'Prediction 5', 'Prediction 6']


In [None]:
from google.colab import files
pivot_results.to_csv('Medicine predictions XGBooster.csv', index=False)
files.download('Medicine predictions XGBooster.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>