In [1]:
import os
import pandas as pd
from typing import List

Data directories

In [2]:
# Resolve the project layout relative to the notebook's working directory:
# the data folder is expected to sit next to the directory the kernel runs in.
working_dir = os.getcwd()
# os.path handles the platform's separator; splitting on "/" by hand yields
# an empty parent on Windows-style paths.
parent_dir = os.path.dirname(working_dir)
data_dir = os.path.join(parent_dir, "data")
# os.listdir returns entries in arbitrary order; sort so every run walks the
# files in the same sequence.
data_list = sorted(os.listdir(data_dir))

print(f"[Info:] Parent Directory: {parent_dir}")
print(f"[Info:] Working Directory: {working_dir}")
# Note: this line prints the *contents* of the data directory, not its path.
print(f"[Info:] Data Directory: {data_list}")

[Info:] Parent Directory: /home/arfan/Documents/Others/Omegga/project
[Info:] Working Directory: /home/arfan/Documents/Others/Omegga/project/dev
[Info:] Data Directory: ['final_data.xlsx', 'raw_data.xlsx']


Dataset generation blocks

In [3]:
def generate_dataset(
    df: pd.DataFrame,
    sample_size: int,
    ordering_col: str,
    columns_to_drop: List[str],
    dropping_axis: int = 1,
    randomness: int = 42,
    replacement: bool = True,
    reset_index: bool = True) -> pd.DataFrame:
    """Build a sampled, trimmed and sorted frame from ``df``.

    Rows are drawn at random from ``df``, the requested labels are dropped,
    and the result is sorted by ``ordering_col``. The frame passed in is
    left untouched; a new frame is returned.

    Note: Original dataset is not provided for privacy

    Args:
        df (pd.DataFrame): input pandas tabular dataframe
        sample_size (int): number of rows to draw from ``df``
        ordering_col (str): column whose values define the final row order
        columns_to_drop (List[str]): labels to remove after sampling
        dropping_axis (int, optional): axis the labels in ``columns_to_drop``
                                        are looked up on; 1 removes columns,
                                        0 removes rows by index label.
                                        Defaults to 1.
        randomness (int, optional): seed forwarded to the sampler as
                                        ``random_state``. Defaults to 42.
        replacement (bool, optional): whether rows are drawn with
                                        replacement. Defaults to True.
        reset_index (bool, optional): when True, the index is reset after
                                        sorting; the previous row labels are
                                        kept as a new leading column rather
                                        than discarded. Defaults to True.

    Returns:
        pd.DataFrame: the sampled, trimmed and sorted dataframe
    """
    sampled = (
        df.sample(n=sample_size, replace=replacement, random_state=randomness)
        .drop(columns_to_drop, axis=dropping_axis)
        .sort_values(ordering_col)
    )
    # reset_index() is called without drop=True on purpose: the old row
    # labels stay available as a regular column.
    return sampled.reset_index() if reset_index else sampled

In [4]:
# Read every workbook in the data directory: the raw export becomes `df`,
# anything else is kept separately as `other_df`.
for file in data_list:
    if "raw_data" in file:
        df = pd.read_excel(f"{data_dir}/{file}")
    else:
        # Bind only `other_df` here. Also assigning to `df` would overwrite
        # the raw data whenever a non-raw file is listed after raw_data, and
        # directory listing order is not guaranteed.
        other_df = pd.read_excel(f"{data_dir}/{file}")

In [5]:
# (rows, columns) of `df` as bound by the loading cell.
df.shape

(6047, 260)

In [6]:
# Draw 1000 rows from `df` with replacement (seed 42), drop the "AnalyzerID"
# column, sort by "order" and reset the index.
sampled_df = generate_dataset(
    df=df,
    sample_size=1000,
    ordering_col="order",
    columns_to_drop=["AnalyzerID"],
    dropping_axis=1,
    randomness=42,
    replacement=True,
    reset_index=True
)

In [7]:
# (rows, columns) of the sampled frame returned by generate_dataset.
sampled_df.shape

(1000, 260)

In [8]:
# Column labels from position 2 onward -- the same selection the export cell writes out.
# NOTE(review): the two skipped leading columns are presumably the `index`
# column added by reset_index and `order` -- confirm before relying on it.
sampled_df.columns[2:]

Index(['CA (HPLC)',     1350.16,     1352.64,     1355.14,     1357.64,
           1360.16,     1362.68,     1365.21,     1367.75,      1370.3,
       ...
           2472.75,      2481.1,     2489.51,     2497.97,      2506.5,
           2515.08,     2523.72,     2532.42,     2541.18,        2550],
      dtype='object', length=258)

In [9]:
# Persist the sample as final_data.xlsx inside the data directory. The row
# index is not written, and the first two columns are left out of the export.
sampled_df.to_excel(
    f"{data_dir}/final_data.xlsx",
    columns=sampled_df.columns[2:],
    index=False,
)