In [0]:
import pandas as pd




In [0]:
def load_xlsx_file(
    input_path:str,
    header:bool=None,
    skiprows:int=1,
    sheet_name:str='Operations Forecast_new',
)->pd.DataFrame:
    """
    Read one sheet of an Excel workbook through pd.read_excel.

    Arguments
    input_path: location of the xlsx workbook
    header: forwarded unchanged as the `header` option of pd.read_excel
    skiprows: forwarded unchanged as the `skiprows` option of pd.read_excel
    sheet_name: sheet to read, as the workbook may contain several sheets
    Returns
    pd.DataFrame: the object pd.read_excel returns for the requested sheet
    """
    # NOTE(review): `header` is annotated as bool but defaults to None and is
    # handed straight to pd.read_excel - confirm the intended type.
    # The openpyxl engine is always requested; every other option is the caller's.
    read_options = {
        "engine": "openpyxl",
        "header": header,
        "skiprows": skiprows,
        "sheet_name": sheet_name,
    }
    return pd.read_excel(input_path, **read_options)

In [0]:
def remove_unwanted_cols(input_xls_df:pd.DataFrame,columns:list=[0,1], axis:int=1 )->pd.DataFrame:
    """
    Remove unwanted labels from the input dataframe.

    Arguments
    input_xls_df: dataframe to trim; it is not modified
    columns: list of labels to drop (default [0, 1])
    axis: axis the labels are looked up on, 1 = columns (default), 0 = rows
    Returns
    a new, simplified dataframe without the given labels
    Raises
    KeyError when a label is not present on the chosen axis
    """
    # DataFrame.drop ignores `axis` whenever `columns=` is supplied, so the
    # labels are passed through `labels=` to make the `axis` argument effective.
    # With the default axis=1 this drops columns exactly as before.
    simplified_df = input_xls_df.drop(labels=columns, axis=axis)
    return simplified_df

In [0]:
def transpose_df(simplified_df:pd.DataFrame)->pd.DataFrame:
    """
    Swap the rows and the columns of a dataframe.

    Arguments
    simplified_df: dataframe to flip
    Returns
    the transposed dataframe (former column labels become the index)
    """
    return simplified_df.T

In [0]:
#set the first row as column name
def set_row1_as_columns(df_transposed:pd.DataFrame)->pd.DataFrame:
    """
    Promote the first row of the dataframe to column names.

    Arguments
    df_transposed: dataframe whose first row holds the column names
    Returns
    a copy of the remaining rows, labelled with the values of that first row
    and without a name on the column axis
    """
    header_row = df_transposed.iloc[0]
    return (
        df_transposed[1:]
        .copy()
        # the Series carries the first row's index label as its name ...
        .set_axis(header_row, axis=1)
        # ... so the column axis name is cleared again
        .rename_axis(None, axis=1)
    )


In [0]:
def rename_col1(simplified_new_col:pd.DataFrame, col_name:str='Dates')->pd.DataFrame:
    """
    Rename the first column of the dataframe, and only that column.

    Arguments
    simplified_new_col: dataframe whose first column is relabelled; it is not modified
    col_name: new name of the first column as str (default 'Dates')
    Returns
    a new dataframe whose first column is called col_name
    Raises
    IndexError when the dataframe has no columns
    """
    # The label is replaced by position: a label-based rename would also rename
    # every other column that happens to share the first column's label.
    labels = simplified_new_col.columns.tolist()
    labels[0] = col_name
    new_columns = pd.Index(labels, name=simplified_new_col.columns.name)
    date_renamed_df = simplified_new_col.set_axis(new_columns, axis=1)
    return date_renamed_df


In [0]:
def select_target_col(
    date_renamed_df:pd.DataFrame,
    target_cols:list=['Dates','Dry Fc','Dry Actuals','Fresh Fc','Fresh','Frozen Fc','Frozen', 'Ultrafresh Fc', 'Ultrafresh'],
    axis=1,
)->pd.DataFrame:
    """
    Keep only the columns of interest.

    Arguments
    date_renamed_df: dataframe to select from
    target_cols: list of column names to keep, in the order they should appear
    axis: accepted but not used by this function
    Returns
    dataframe restricted to the targeted columns
    """
    return date_renamed_df[target_cols]


In [0]:
# Now that we have the target dataframe, we can proceed by cleaning the NaN values and resetting the index; the function applies to the whole dataset
def drop_nan(target_df:pd.DataFrame, axis:int=1)->pd.DataFrame:
    """
    Discard incomplete rows and renumber the remaining ones.

    Arguments
    target_df: dataframe to clean
    axis: accepted but not used by this function
    Returns
    dataframe holding only the rows without any nan value, with a fresh
    index that replaces the old one
    """
    rows_without_nan = target_df.dropna()
    # drop=True: the previous index is thrown away rather than kept as a column
    return rows_without_nan.reset_index(drop=True)


In [0]:
def remove_duplicated_rows(clean_target_df, axis:int=1)->pd.DataFrame:
    """
    Drop rows that repeat an earlier row.

    Arguments
    clean_target_df: dataframe to deduplicate
    axis: accepted but not used by this function
    Returns
    dataframe with unique rows; the surviving rows keep their original
    index labels (the index is not reset here)
    """
    return clean_target_df.drop_duplicates()


In [0]:

def create_master_df(input_path:str, header:bool=None, skiprows:int=1, sheet_name:str='Operations Forecast_new',cast_cols=True, cols_to_cast:list = ['Dry Fc', 'Dry Actuals', 'Fresh Fc', 'Fresh', 'Frozen Fc','Frozen', 'Ultrafresh Fc', 'Ultrafresh']):
    """
    Master function: load the workbook and run every preparation step in order.

    Arguments
    input_path: path to the xlsx file
    header, skiprows, sheet_name: passed on to load_xlsx_file
    cast_cols: when truthy, the columns in cols_to_cast are cast to float
    cols_to_cast: names of the columns to cast
    Returns
    the prepared dataframe
    """
    # Each step receives the previous step's result and runs with its own defaults.
    master_df = (
        load_xlsx_file(input_path=input_path, header=header, skiprows=skiprows, sheet_name=sheet_name)
        .pipe(remove_unwanted_cols)
        .pipe(transpose_df)
        .pipe(set_row1_as_columns)
        .pipe(rename_col1)
        .pipe(select_target_col)
        .pipe(drop_nan)
        .pipe(remove_duplicated_rows)
    )
    if not cast_cols:
        return master_df
    return master_df.astype(dict.fromkeys(cols_to_cast, 'float'))

In [0]:
#xlsx_dir = '/dbfs/mnt/dataplatform/acc/DataScience/sandbox/ebiz/Inbound_FC/data/input/Ecom-Ops-status_2022.xlsx'

In [0]:
#df_test_master = create_master_df(xlsx_dir)

In [0]:
# df_test_master