# General functions for handling Excel and CSV files

## CSV files with an encoding other than UTF-8

In [None]:
!pip install chardet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd

In [None]:
# Guess the file's text encoding from its raw bytes with chardet.
import chardet

file = "/content/drive/MyDrive/data/Oscars-demographics-DFE.csv"

# Binary mode: the bytes must reach chardet undecoded.
with open(file, mode='rb') as raw_handle:
    raw_bytes = raw_handle.read()

# The detection result is indexed with the 'encoding' key when the file is read later.
encoding = chardet.detect(raw_bytes)
print(encoding)

{'encoding': 'ISO-8859-1', 'confidence': 0.7289274470020289, 'language': ''}


In [None]:
# Demonstration: without an explicit encoding pandas decodes the file as UTF-8,
# which fails for this file. The error is caught and shown so that
# "Restart & Run All" can continue past this cell.
try:
    df_utf8 = pd.read_csv(file)
except UnicodeDecodeError as decode_error:
    print(f"UnicodeDecodeError: {decode_error}")

UnicodeDecodeError: ignored

In [None]:
# Read the same file again, this time passing the encoding detected by chardet
df_w_encoding = pd.read_csv(file, encoding=encoding['encoding'])
# Preview the frame that was just loaded
df_w_encoding.head()

Unnamed: 0,Field,Field Name,Data element,Checktable,Datatype,Length,Decimals,SheetName,Present
0,MANDT,Client,MANDT,T000,CLNT,3,0,BKPF,1
1,BUKRS,Company Code,BUKRS,T001,CHAR,4,0,BKPF,1
2,BELNR,Accounting Document Number,BELNR_D,,CHAR,10,0,BKPF,1
3,GJAHR,Fiscal Year,GJAHR,,NUMC,4,0,BKPF,1
4,BLART,Document Type,BLART,T003,CHAR,2,0,BKPF,1


## Excel files with multiple sheets of data

- Using a sample Data Dictionary as an example, where each sheet represents an SAP table and contains the data definition of the fields within that table
- When the `sheet_name=None` parameter is passed in, pandas reads the data in every sheet into a DataFrame and returns a dictionary with the sheet name as the key and the DataFrame as the value

In [None]:
# Reading in the data
# sheet_name=None asks pandas for every sheet of the workbook; the result is
# used below as a dictionary keyed by sheet name.
sample_SAP_data_dict = "/content/drive/MyDrive/data/SAP Data Dictionary Selected Tables.xlsx"
data_sheet_dict = pd.read_excel(sample_SAP_data_dict, sheet_name=None)

In [None]:
# Show how many sheets were loaded and what they are called.
# Iterating over the dictionary yields its keys, i.e. the sheet names.
list_sheets = [sheet for sheet in data_sheet_dict]
sheet_count = len(list_sheets)
print(f"There are {sheet_count} sheets in the excel file")
print(f"The sheets are {list_sheets}")

There are 11 sheets in the excel file
The sheets are ['BKPF', 'BSAK', 'BSEG', 'EKPO', 'EKKO', 'EBAN', 'EKBE', 'EKKN', 'ESLL', 'ESLH', 'MAKT']


In [None]:
# Each individual df is stored within the corresponding value
# (looked up by sheet name; head() shows only the first rows)
data_sheet_dict['BSEG'].head()

Unnamed: 0,Field,Field Name,Data element,Checktable,Datatype,Length,Decimals
0,MANDT,Client,MANDT,T000,CLNT,3,0
1,BUKRS,Company Code,BUKRS,T001,CHAR,4,0
2,BELNR,Accounting Document Number,BELNR_D,,CHAR,10,0
3,GJAHR,Fiscal Year,GJAHR,,NUMC,4,0
4,BUZEI,Number of Line Item Within Accounting Document,BUZEI,,NUMC,3,0


One application of this information is to build a table showing which fields the SAP tables have in common: each SAP table is represented as a row and each field as a column.

In [None]:
# Work with the BSEG sheet. Plain assignment: df refers to the same object
# that is stored in data_sheet_dict, it is not a copy.
df = data_sheet_dict['BSEG']

In [None]:
# Tag every row with a presence flag and the name of the SAP table it belongs to.
# assign() returns a new frame, so the sheet stored in data_sheet_dict is left
# unmodified and re-running this cell gives the same result.
df = df.assign(Present=1, SAP_Table='BSEG')

In [None]:
# Keep only the three columns the pivot needs.
bseg_fields = df[['SAP_Table', 'Field', 'Present']]

# One row per SAP table, one column per field; each cell aggregates 'Present'
# with pivot_table's default aggregation function.
bseg_fields.pivot_table(index='SAP_Table', columns='Field', values='Present')

Field,ABPER,ABSBT,AGZEI,ALTKT,ANBWA,ANFAE,ANFBJ,ANFBN,ANFBU,ANLN1,...,ZEKKN,ZFBDT,ZINKZ,ZLSCH,ZLSPR,ZOLLD,ZOLLT,ZTERM,ZUMSK,ZUONR
SAP_Table,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BSEG,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [None]:
def extract_columns_info(df, sheet_name, list_column_field):
    """Build a one-row presence table for a single sheet.

    Parameters
    ----------
    df : pandas.DataFrame
        Data of one sheet. It must contain the columns named in
        ``list_column_field``. The frame is not modified.
    sheet_name : str
        Label written to the ``SAP_Table`` column; it becomes the row index
        of the result.
    list_column_field : sequence of str
        Column(s) of ``df`` whose values become the columns of the result.

    Returns
    -------
    pandas.DataFrame
        Pivot table indexed by ``SAP_Table`` with one column per value found
        in ``list_column_field``; each cell aggregates the constant
        ``Present`` flag (1) with ``pivot_table``'s default aggregation.

    Raises
    ------
    KeyError
        If a name in ``list_column_field`` is not a column of ``df``.
    """
    # Accept any sequence of column names, not only a list.
    field_columns = list(list_column_field)

    # Selecting the field columns first yields a new frame, so only the needed
    # columns are copied and the caller's frame is never touched.
    tagged = df[field_columns].assign(SAP_Table=sheet_name, Present=1)

    return tagged.pivot_table(columns=field_columns,
                              values='Present',
                              index=['SAP_Table'])

In [None]:
list_column_field = ['Field']

# One single-row presence table per sheet, in the dictionary's iteration order.
# The comprehension keeps its loop variables local, so the notebook-level `df`
# is not overwritten, and no full sheets are dumped into the cell output.
list_df = [
    extract_columns_info(sheet_df, sheet_name, list_column_field)
    for sheet_name, sheet_df in data_sheet_dict.items()
]

        Field                          Field Name Data element Checktable  \
0       MANDT                              Client        MANDT       T000   
1       BUKRS                        Company Code        BUKRS       T001   
2       BELNR          Accounting Document Number      BELNR_D        NaN   
3       GJAHR                         Fiscal Year        GJAHR        NaN   
4       BLART                       Document Type        BLART       T003   
..        ...                                 ...          ...        ...   
110     KNUMV    Number of the document condition        KNUMV        NaN   
111  OINETNUM  Exchange - netting document number   OIA_NETNUM        NaN   
112   OINJAHR                         Fiscal Year        GJAHR        NaN   
113    OININD    Movement-based netting indicator      OIA_NET        NaN   
114     RECHN             Indicator: post invoice        XRECH        NaN   

    Datatype  Length  Decimals  
0       CLNT       3         0  
1       C

In [None]:
# Stack the per-sheet rows into one table; a field that a sheet does not have
# comes out of the concat as a missing value, which is shown as an empty string.
df_consolidated = pd.concat(list_df).fillna("")

In [None]:
# Fields to compare across the SAP tables
columns = [
    'BELNR', 'BUZEI', 'GJAHR',
    'EBELN', 'EBELP',
    'PACKNO', 'INTROW', 'MATNR',
]
# Show only those fields, keeping every SAP table row
df_consolidated[columns]

Field,BELNR,BUZEI,GJAHR,EBELN,EBELP,PACKNO,INTROW,MATNR
SAP_Table,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BKPF,1.0,,1.0,,,,,
BSAK,1.0,1.0,1.0,1.0,1.0,,,
BSEG,1.0,1.0,1.0,1.0,1.0,,,1.0
EKPO,,,,1.0,1.0,1.0,,1.0
EKKO,,,,1.0,,,,
EBAN,,,,1.0,1.0,1.0,,1.0
EKBE,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
EKKN,,,,1.0,1.0,,,
ESLL,1.0,,,,,1.0,1.0,
ESLH,,,,1.0,1.0,1.0,,
