<a href="https://colab.research.google.com/github/YasiruMM/Medicine-Prediction-Grp-22/blob/Diabetics_-Data_-Cleaning/Diabetics_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Imports**#

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the structured diabetics workbook; the relative path is resolved
# against the notebook's current working directory.
file_path = 'Structured_Diabetics.xlsx'
df = pd.read_excel(file_path)

# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,Disease Category,Drug Category,Drug Name,Dosage,Retail Price,Purchase Price,Sales,Date
0,Diabetics,ALPHA GLUCO,DIABOSE 50MG,50MG,2020603.2,1737412.6,3427.0,2.24
1,Diabetics,ALPHA GLUCO,GLUCOBAY 100MG,100MG,954.0,1514.0,1.0,2.24
2,Diabetics,ALPHA GLUCO,GLUCOBAY 100MG,100MG,421702.5,360710.5,753.0,2.24
3,Diabetics,ALPHA GLUCO,GLUCOBAY 50MG,50MG,187.6,327.6,0.0,2.24
4,Diabetics,ALPHA GLUCO,GLUCOBAY 50MG,50MG,13511.0,21434.4,52.0,2.24


#**Extract Unique pairs of drug names and categories**#

In [3]:
# Map every drug category to the array of distinct drug names recorded under it.
pairs = {
    category: names.unique()
    for category, names in df.groupby('Drug Category')['Drug Name']
}
print(pairs)

{'ALPHA GLUCO': array(['DIABOSE 50MG', 'GLUCOBAY 100MG', 'GLUCOBAY 50MG'], dtype=object), 'SULFON': array(['AMARYL 1MG', 'AMARYL 2MG', 'AMARYL 3MG', 'AZUKON MR 30MG',
       'CONSUCON 80MG', 'DAONIL 5MG', 'DIAMICRON 80MG',
       'DIAMICRON MR 30MG', 'DIAMICRON MR 60MG', 'DIANORM 80MG',
       'DIANORM OD 30', 'DIANORM OD 60', 'DIAPRIDE 1MG', 'DIAPRIDE 2MG',
       'DIATICA 80MG', 'DIAZIDE TAB 40MG', 'DIAZIDE TAB 80MG',
       'EUGLIM 2MG', 'EUGLIM 4MG', 'EUGLIZIP 5MG', 'GD-CARE 5MG',
       'GETRIL 1MG', 'GETRIL 2MG', 'GETRIL 3MG', 'GETRIL 4MG',
       'GLEMAZ 2MG', 'GLEMAZ 4MG', 'GLIBENCOMIDE 5MG (INGA)',
       'GLICAD 40MG', 'GLICLAZIDE 40MG', 'GLICLAZIDE 40MG (IPCA)',
       'GLICLAZIDE 80MG', 'GLICLAZIDE 80MG (SPMC)',
       'GLICLAZIDE 80MG SPMC', 'GLICLAZIDE MR 30MG',
       'GLICLAZIDE MR 60MG (IPCA)', 'GLIDABET 80MG', 'GLIMEPRIDE 2MG',
       'GLIMEPRIDE 2MG (USV)', 'GLIMIPRIDE 4MG', 'GLIPIZ  5MG',
       'GLIPIZIDE 5MG (USV)', 'GLITROL CR 30MG', 'GLITROL CR 60MG',
       'GL

##**Identifying the most frequent drug category and drug name**##

In [4]:
# Global fallbacks: the modal value of each label column (first one on ties,
# since Series.mode() returns the modes in sorted order).
most_frequent_category, most_frequent_drug_name = (
    df[column].mode()[0] for column in ('Drug Category', 'Drug Name')
)

##**Filling the missing/invalid drug name or drug category**##

In [5]:
def missing_values(row, category_to_names, most_frequent_category, most_frequent_drug_name):
    """Fill a missing or invalid 'Drug Category' / 'Drug Name' on a single row.

    Parameters
    ----------
    row : pandas.Series
        One record containing at least 'Drug Category' and 'Drug Name'.
    category_to_names : dict
        Known categories as keys; a category not present here is invalid.
    most_frequent_category : str
        Replacement used when the category is null or unknown.
    most_frequent_drug_name : str
        Replacement used when the drug name is null, blank or numeric.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with the two fields repaired; the input is untouched.
    """
    # Work on a copy: mutating the object handed in by DataFrame.apply is
    # unsupported by pandas and can leak changes back into the caller's data.
    row = row.copy()

    category = row['Drug Category']
    drug_name = row['Drug Name']

    # Handle missing or unknown 'Drug Category'
    if pd.isnull(category) or category not in category_to_names:
        row['Drug Category'] = most_frequent_category

    # A drug name must be non-empty text. NumPy scalars are checked explicitly
    # because np.int64 is not a subclass of the built-in int.
    name_is_numeric = isinstance(drug_name, (int, float, np.number))
    name_is_blank = isinstance(drug_name, str) and not drug_name.strip()
    if pd.isnull(drug_name) or name_is_numeric or name_is_blank:
        row['Drug Name'] = most_frequent_drug_name

    return row

##**Finding the most frequent dosage for each drug category**##

In [6]:
# Modal dosage per drug category, as a plain {category: dosage} dict.
# NOTE: .mode() ignores nulls, so a category with no non-null dosage yields
# an empty result and the [0] lookup raises.
most_frequent_dosages = (
    df.groupby('Drug Category')['Dosage']
    .agg(lambda dosages: dosages.mode()[0])
    .to_dict()
)
print(most_frequent_dosages)

{'ALPHA GLUCO': '50MG', 'SULFON': '80MG', 'THIAZOL': '15MG'}


##**Function to fill missing & invalid 'Dosage' values**##

In [7]:
def fill_invalid_dosage(row, dosage_map):
    """Fill a missing or blank 'Dosage' with the usual dosage of the row's category.

    Parameters
    ----------
    row : pandas.Series
        One record containing at least 'Dosage' and 'Drug Category'.
    dosage_map : dict
        ``{drug category: most frequent dosage}``.

    Returns
    -------
    pandas.Series
        A copy of ``row``. 'Dosage' is replaced when it is null, empty or
        whitespace-only; it becomes ``None`` if the category is not in
        ``dosage_map``.
    """
    # Work on a copy so the caller's row is never modified in place.
    row = row.copy()

    dosage = row['Dosage']
    category = row['Drug Category']

    # Whitespace-only text carries no dosage information, so treat it like ''.
    is_blank = isinstance(dosage, str) and not dosage.strip()
    if pd.isnull(dosage) or is_blank:
        # Use the most frequent dosage of the row's drug category.
        row['Dosage'] = dosage_map.get(category, None)

    return row

##**Handling the numerical fields**##

In [8]:
# Per-category medians of the numeric columns, shaped as
# {column: {drug category: median}}.
median_values = (
    df.groupby('Drug Category')
    .agg(dict.fromkeys(['Retail Price', 'Purchase Price', 'Sales'], 'median'))
    .to_dict()
)
print(median_values)

{'Retail Price': {'ALPHA GLUCO': 13082.5, 'SULFON': 9580.4, 'THIAZOL': 2094.2}, 'Purchase Price': {'ALPHA GLUCO': 18509.4, 'SULFON': 8204.4, 'THIAZOL': 2268.0}, 'Sales': {'ALPHA GLUCO': 41.5, 'SULFON': 67.0, 'THIAZOL': 39.5}}


In [9]:
# Replace missing or invalid numeric fields with a per-category median.
def fill_numeric_values(row, median_dict):
    """Replace missing or invalid price/sales figures with a category median.

    For each of 'Retail Price', 'Purchase Price' and 'Sales' the value is kept
    only if it is a number (Python or NumPy int/float), is not NaN and is
    strictly greater than zero. Anything else - null, text, zero, negative -
    is replaced.

    Parameters
    ----------
    row : pandas.Series
        One record containing 'Drug Category' and the three numeric columns.
    median_dict : dict
        ``{column: {drug category: median}}``.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with invalid values replaced; the input is untouched.

    Notes
    -----
    When the row's category has no entry in ``median_dict`` the replacement is
    the overall column median of the notebook-level ``df``. That global is
    only read in this fallback case.
    """
    category = row['Drug Category']
    # Work on a copy so the caller's row is never modified in place.
    row = row.copy()

    for column in ('Retail Price', 'Purchase Price', 'Sales'):
        value = row[column]

        # Type check first: comparing text with 0 would raise TypeError.
        # NumPy scalar types are listed because np.int64 is not a built-in int.
        is_number = isinstance(value, (int, float, np.integer, np.floating))
        if is_number and not pd.isnull(value) and value > 0:
            continue

        category_medians = median_dict[column]
        if category in category_medians:
            row[column] = category_medians[category]
        else:
            # Unknown category: fall back to the overall median of the column.
            row[column] = df[column].median()

    return row