<a href="https://colab.research.google.com/github/YasiruMM/Medicine-Prediction-Grp-22/blob/Diabetics_-Data_-Cleaning/Diabetics_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Preprocessing**

### **Imports**

In [34]:
import pandas as pd
import os
import glob
import re

## Creating a basic structure for the ALPHA GLUCO medicine

### Creating the basic structure for the Excel sheets that contain an item code

In [29]:
# Column names applied to every workbook in this group (these sheets include an item code).
# With read_excel's default header=0, the first row of each sheet is consumed as a
# header and replaced by these names.
column_names = ['Item Code', 'Item Name', 'P Price', 'R Price', 'PurchPrice', 'RetailPrice', 'Total']

# Workbooks to load, resolved relative to the current working directory.
file_paths = ['03.24.xlsx', '06.24.xlsx', '07.24.xlsx', '08.24.xlsx', '09.24.xlsx', '10.24.xlsx']

# One DataFrame per workbook that was read successfully.
dfs = []

for file_path in file_paths:
    try:
        df = pd.read_excel(file_path, names=column_names)
        dfs.append(df)
    except FileNotFoundError:
        # Best effort: a missing workbook is reported and skipped.
        print(f"File not found: {file_path}")
    except Exception as e:
        # Any other read problem is also reported and skipped, so the
        # combined frame may silently lack that workbook's rows.
        print(f"An error occurred while reading {file_path}: {e}")

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the files instead.
if not dfs:
    raise ValueError(f"None of the input workbooks could be read: {file_paths}")

# Stack all workbooks into a single frame with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)

print(combined_df.head())


   Item Code       Item Name  P Price  R Price  PurchPrice  RetailPrice  Total
0      11052    DIABOSE 50MG     44.7     52.8   1745803.2    2062156.8   2934
1      11052    DIABOSE 50MG     45.4     52.8   1737412.6    2020603.2   3185
2      11257  GLUCOBAY 100MG     75.7     47.7      1514.0        954.0      8
3      11257  GLUCOBAY 100MG     75.7     88.5    360710.5     421702.5    684
4      11153   GLUCOBAY 50MG     46.8     26.8       327.6        187.6      0


In [30]:
# Discard the 'Item Code' and 'Total' columns before exporting.
combined_df = combined_df.drop(['Item Code', 'Total'], axis='columns')

# Persist the trimmed frame (header row kept, index omitted).
combined_df.to_excel('modified_data.xlsx', header=True, index=False)

print(combined_df.head(5))


        Item Name  P Price  R Price  PurchPrice  RetailPrice
0    DIABOSE 50MG     44.7     52.8   1745803.2    2062156.8
1    DIABOSE 50MG     45.4     52.8   1737412.6    2020603.2
2  GLUCOBAY 100MG     75.7     47.7      1514.0        954.0
3  GLUCOBAY 100MG     75.7     88.5    360710.5     421702.5
4   GLUCOBAY 50MG     46.8     26.8       327.6        187.6


### Creating the basic structure for the Excel sheets that do not contain an item code

In [31]:
# Column names applied to every workbook in this group (these sheets have no item code column).
# With read_excel's default header=0, the first row of each sheet is consumed as a
# header and replaced by these names.
column_names = ['Item Name', 'P Price', 'R Price', 'PurchPrice', 'RetailPrice', 'Total']

# Workbooks to load, resolved relative to the current working directory.
file_paths = ['02.24.xlsx', '04.24.xlsx', '05.24.xlsx']

# One DataFrame per workbook that was read successfully.
dfs = []

for file_path in file_paths:
    try:
        df = pd.read_excel(file_path, names=column_names)
        dfs.append(df)
    except FileNotFoundError:
        # Best effort: a missing workbook is reported and skipped.
        print(f"File not found: {file_path}")
    except Exception as e:
        # Any other read problem is also reported and skipped, so the
        # combined frame may silently lack that workbook's rows.
        print(f"An error occurred while reading {file_path}: {e}")

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the files instead.
if not dfs:
    raise ValueError(f"None of the input workbooks could be read: {file_paths}")

# Stack all workbooks into a single frame with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)

print(combined_df.head())


        Item Name  P Price  R Price  PurchPrice  RetailPrice  Total
0    DIABOSE 50MG     44.7     52.8   1745803.2    2062156.8   4243
1    DIABOSE 50MG     45.4     52.8   1737412.6    2020603.2   3427
2  GLUCOBAY 100MG     75.7     47.7      1514.0        954.0      1
3  GLUCOBAY 100MG     75.7     88.5    360710.5     421702.5    753
4   GLUCOBAY 50MG     46.8     26.8       327.6        187.6      0


In [32]:
# Discard the 'Total' column before exporting.
combined_df = combined_df.drop('Total', axis='columns')

# Persist the trimmed frame (header row kept, index omitted).
combined_df.to_excel('modified_data_2.xlsx', header=True, index=False)

print(combined_df.head(5))

        Item Name  P Price  R Price  PurchPrice  RetailPrice
0    DIABOSE 50MG     44.7     52.8   1745803.2    2062156.8
1    DIABOSE 50MG     45.4     52.8   1737412.6    2020603.2
2  GLUCOBAY 100MG     75.7     47.7      1514.0        954.0
3  GLUCOBAY 100MG     75.7     88.5    360710.5     421702.5
4   GLUCOBAY 50MG     46.8     26.8       327.6        187.6


### Combining the two datasets

In [33]:
# Read back the two intermediate workbooks from disk.
modified_data, modified_data_2 = (
    pd.read_excel(workbook)
    for workbook in ('modified_data.xlsx', 'modified_data_2.xlsx')
)

# Stack them row-wise, renumbering the index from zero.
combined_dataset = pd.concat(
    [modified_data, modified_data_2],
    ignore_index=True,
)

print(combined_dataset.head(5))

# Save the merged table without the index column.
combined_dataset.to_excel('combined_dataset.xlsx', index=False)

        Item Name  P Price  R Price  PurchPrice  RetailPrice
0    DIABOSE 50MG     44.7     52.8   1745803.2    2062156.8
1    DIABOSE 50MG     45.4     52.8   1737412.6    2020603.2
2  GLUCOBAY 100MG     75.7     47.7      1514.0        954.0
3  GLUCOBAY 100MG     75.7     88.5    360710.5     421702.5
4   GLUCOBAY 50MG     46.8     26.8       327.6        187.6


### Finalized dataset for ALPHA GLUCO

In [35]:
def extract_dosage(item_name):
    """Return the milligram dosage embedded in an item name.

    The first occurrence of a number immediately followed by ``MG`` (matched
    case-insensitively) is returned with an upper-case ``MG`` suffix, e.g.
    ``'GLUCOBAY 100MG'`` -> ``'100MG'``. A decimal part is kept, so
    ``'X 2.5MG'`` yields ``'2.5MG'`` rather than ``'5MG'``.

    Args:
        item_name: The item name to inspect. Non-string values (for example
            NaN or None) are treated as having no dosage.

    Returns:
        The dosage string, or None when no dosage is found.
    """
    # re.search raises TypeError on non-strings such as NaN or None.
    if not isinstance(item_name, str):
        return None

    # (?:\.\d+)? lets the captured number carry a decimal part; \w* is kept
    # from the original pattern so previously matched names give the same result.
    match = re.search(r'(\d+(?:\.\d+)?\w*)MG', item_name, re.IGNORECASE)
    if match:
        return match.group(1) + "MG"
    return None

# Add a 'Dosage' column computed from each row's 'Item Name'.
combined_dataset['Dosage'] = combined_dataset['Item Name'].map(extract_dosage)
print(combined_dataset.head(5))

# Export the table without the index column.
combined_dataset.to_excel('ALPHA GLUCO.xlsx', index=False)


        Item Name  P Price  R Price  PurchPrice  RetailPrice Dosage
0    DIABOSE 50MG     44.7     52.8   1745803.2    2062156.8   50MG
1    DIABOSE 50MG     45.4     52.8   1737412.6    2020603.2   50MG
2  GLUCOBAY 100MG     75.7     47.7      1514.0        954.0  100MG
3  GLUCOBAY 100MG     75.7     88.5    360710.5     421702.5  100MG
4   GLUCOBAY 50MG     46.8     26.8       327.6        187.6   50MG


## Creating a basic structure for the SULFON medicine
