<a href="https://colab.research.google.com/github/YasiruMM/Medicine-Prediction-Grp-22/blob/Model_Training_for_Demand_Prediction/XGBooster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from google.colab import drive

In [2]:
# Mount Google Drive
drive.mount('/content/drive')

# Load the dataset
file_path = '/content/drive/My Drive/DSGP/MediTrackData.csv'
df = pd.read_csv(file_path)

Mounted at /content/drive


In [3]:
# Prepare feature (X) and target (Y) variables
X = df[['Disease Category', 'Drug Category', 'Drug Name', 'Dosage',
        'Retail Price', 'Purchase Price', 'Sales', 'Date',
        'Mean Sales', 'CV', 'Buffer Percentage', 'Buffer Stock']]
Y = df[['Sales']]

In [4]:
# Save 'Drug Name'& Disease Category before encoding
drug_names = X[['Drug Name']]
disease_categories = X[['Disease Category']]

In [5]:
# Convert 'Date' feature to numerical month values
X['Date'] = X['Date'].astype(float)

# Convert categorical variables into numerical labels (ONE-HOT-Encoding)
X = pd.get_dummies(X, columns=['Disease Category', 'Drug Category', 'Drug Name', 'Dosage'])

In [6]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [7]:
# Convert to DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


In [8]:
# Set the model parameters
params = {
    'objective': 'reg:squarederror',  # Regression task
    'eval_metric': 'rmse',            # Evaluation metric: root mean square error
    'max_depth': 6,                   # Maximum tree depth
    'learning_rate': 0.1,             # Learning rate
    'colsample_bytree': 0.8,          # Column sampling rate
    'subsample': 0.8                  # Subsample rate
}

In [9]:
# Train the model
num_boost_round = 100  # Number of boosting rounds
model = xgb.train(params, dtrain, num_boost_round=num_boost_round)

In [10]:
# saving test dataset for future predictions
X_test_future=X_test.copy()




In [14]:
import numpy as np
# loop XGBooster for 6 months predictions
future_results=pd.DataFrame()
for month in range(1,7):
  # extract Months & Years seperately
  X_test_future['Month']=X_test_future['Date'].astype(int)
  X_test_future['Year']=(X_test_future['Date']-X_test_future['Month'])*100

  # Handle Month and Year updation
  X_test_future['Month']=X_test_future['Month']+1
  X_test_future['Year']=X_test_future['Year']+ (X_test_future['Month']//12) # handle Year overflow situations
  X_test_future['Month']=X_test_future['Month']%12 # Reset Months for exceeding 12 situations

  X_test_future['Date']=X_test_future['Month'] + X_test_future['Year']/100.0    # Reconstruction of Date Feature.
  X_test_future=X_test_future.drop(columns=['Month','Year']) # Drop Month and Year from Original dataset.
  dtest_future = xgb.DMatrix(X_test_future)
  predictions_future = np.round(model.predict(dtest_future)) # prediction for next month
  future_Month_Results=pd.DataFrame({
          'Disease Category': disease_categories.iloc[X_test_future.index]['Disease Category'],
          'Drug Name': drug_names.iloc[X_test_future.index]['Drug Name'],
          'Month': [month] * len(predictions_future),
          'Predicted Sales': predictions_future

      })

     # Concatenate instead of append
  future_results = pd.concat([future_results, future_Month_Results], ignore_index=True)




In [15]:
print(future_results.head())

  Disease Category         Drug Name  Month  Predicted Sales
0      Cholesterol     GLIDABET 80MG      1             86.0
1      Cholesterol    GLUCOZIDE 80MG      1           7877.0
2      Cholesterol  DIAZIDE TAB 80MG      1             35.0
3      Cholesterol       GLIVIC 40MG      1            116.0
4   Cardiovascular      LOWPRES 50MG      1            948.0


In [None]:
# # Make predictions
# predictions = model.predict(dtest)

# # Create a DataFrame with the predictions and drug names
# results = pd.DataFrame({
#     'Drug Name': drug_names.iloc[X_test.index]['Drug Name'],  # Map back to the original Drug Name
#     'Disease Category': disease_categories.iloc[X_test.index]['Disease Category'],  # Map back to the original Disease Category
#     'Predicted Sales': predictions
# })

In [16]:
# Pivot the DataFrame so each drug has predictions in a single row
pivot_results = future_results.pivot_table(
    index=['Disease Category', 'Drug Name'],
    columns='Month',
    values='Predicted Sales'
).reset_index()

#Enhancing readability by renaming for all Predictions
pivot_results.columns = ['Disease Category', 'Drug Name', 'Prediction 1', 'Prediction 2', 'Prediction 3', 'Prediction 4', 'Prediction 5', 'Prediction 6']


In [17]:
from google.colab import files
pivot_results.to_csv('Medicine predictions XGBooster.csv', index=False)
files.download('Medicine predictions XGBooster.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>