In [None]:
"""
Code for data cleaning. 
Main goal is to find and create ONE 'one_name' for one company but with different variants of:

1. Company ID (sold_to_customer), 
2. Name spellings (sold_to_customer_n), 

each 'one_name' has to have eventually one definition for channel

1. Type (Direct or Indirect)
2. Type_with_channel (Direct - Direct or Indirect - Other or Indirect- Channel)
3. Company_type according to our specification (End user, Distributor, VAR, OEM, EPC etc.)
4. OPCO


"""

In [130]:
"""
importing data from two files which were prepared accordingly from SAP system


order's columns:

['company_code_n', 'year_month', 'Date', 'FY', 'Sales group_case',
       'customer_group_code', 'Column1', 'BU Group OAC', 'BU Group_case',
       'bu2', 'bu_n', 'sales_person_n', 'sales_order_so', 'sold_to_customer',
       'sold_to_customer_n', 'sold_to_region_n', 'eu_industry_n',
       'ec_eu_customer_n', 'eu_region_n', 'ec_eu_industry_n',
       'order_intake_amount_eur', 'OPCO', 'Type', 'Type with Channel',
       'Company time']
       
customer's columns

['Customer', 'Tier', 'Accounts', 'Company', 'Customer Name', 'Code',
       'Customer Group Name', 'Oscar i/d', 'Oscar Type', 'Indirect/Direct',
       'Channel3', 'New Type4', 'Correction_Indirect/Direct',
       'Correction_Channel', 'Correction_ Type', 'T&M', 'Partner page',
       'Tableau', 'Comments', 'Link']


"""

import pandas as pd
import numpy as np

#order = pd.read_excel('data_files/full_order_data.xlsx',sheet_name='Data')
#customer = pd.read_excel('data_files/full_customer_data.xlsx',sheet_name='data')

# import all sheets from an initial file
xlsx = pd.ExcelFile('data_files/initial_companies.xlsx')
dfs = {}
for sheet_name in xlsx.sheet_names:
    dfs[sheet_name] = xlsx.parse(sheet_name)

In [244]:
# names of sheets in file 'data' with customer data and 'legal' with abbreviations
data_df = dfs['data']
legal_forms_df = dfs['legal']

In [245]:
# keeping necessary columns only
customers_df = data_df.loc[:,['Customer', 'Customer Name']]
customers_df.rename(columns = {'Customer' : 'ID', 'Customer Name': 'Initial_Name' }, inplace = True)
# minor modification of Initial Name cleaned from not necessary signs
customers_df['upper'] = customers_df['Initial_Name'].str.upper().str.strip()
customers_df['clean'] = customers_df['Initial_Name'].str.replace('\,|\"|\(|\)' , " ", regex = True).str.strip()
customers_df['upper_clean'] = customers_df['upper'].str.replace('\,|\"|\(|\)' , " ", regex = True).str.strip()
customers_df['list_name'] = customers_df['upper_clean'].str.split()

# column with the last part of the company name (it is legal form often)
customers_df['last_part'] = customers_df['Initial_Name'].apply(lambda x: x.split()[-1])
customers_df['upper_last_part'] = customers_df['last_part'].str.upper().str.strip()

# creating list of all legal abbreviations as it is and in upper case
abbr_legal = list(set(legal_forms_df['Abbr'].tolist()))
upper_abbr_legal = [x.upper() for x in abbr_legal]
upper_abbr_legal = list(set(upper_abbr_legal))

In [206]:
customers_df = customers_df.loc[:,['ID','Initial_Name', 'upper_clean','last_part', 'upper_last_part']]

In [235]:
ls = ['AJ','AO', 'OO']
'O' in ls

False

In [207]:
import numpy as np

In [241]:
def stw(x, abbr):
    result = []
    for a in abbr:
        if x.startswith(a):
            result.append(a)
    if len(result) > 0:
        return result
    else:
        return ''
    


def naming(x, abbr):
    result = 0
    last = x.split()[-1]
    ab_list = []
    for a in abbr:
        if a in x:
            ab_list.append(a)
    if len(ab_list) > 0:
        max_elem = max(ab_list, key=len)
        for ab in ab_list:
            if ab in x.split():   
                if ab == last:
                    result = x.split(ab)[0].strip()
                    return result
                else:
                    if len(ab) > 2:
                        result = x.split(ab)[0].strip()
                    else:
                        if not x.startswith(ab):
                            result = x.split(ab)[0].strip()
                        else:
                            result = x
            else:
                if x.endswith(ab):
                    result = x.split(ab)[0].strip()
                    return result
                else:
                    result = x
    else:
        result = x
    return result

In [242]:
customers_df['TEST_1'] = customers_df['Initial_Name'].apply(lambda x: naming(x,abbr_legal))
customers_df['TEST_2'] = customers_df['upper_clean'].apply(lambda x: naming(x,upper_abbr_legal))
customers_df['TEST_3'] = customers_df['Initial_Name'].apply(lambda x: stw(x,abbr_legal))
customers_df['TEST_4'] = customers_df['upper_clean'].apply(lambda x: stw(x,upper_abbr_legal))

In [246]:
# max length abbreviation found
customers_df['abbr'] = customers_df['Initial_Name'].apply(lambda x: max([abbr for abbr in abbr_legal if abbr in x], key=len) if [abbr for abbr in abbr_legal if abbr in x] else None)
# list of all abbreviation found
customers_df['abbr_list'] = customers_df['Initial_Name'].apply(lambda x: [abbr for abbr in abbr_legal if abbr in x] if [abbr for abbr in abbr_legal if abbr in x] else None)


# max length abbreviation found - clean names
customers_df['abbr_clean'] = customers_df['clean'].apply(lambda x: max([abbr for abbr in abbr_legal if abbr in x], key=len) if [abbr for abbr in abbr_legal if abbr in x] else None)
# list of all abbreviation found - clean names
customers_df['abbr_list_clean'] = customers_df['clean'].apply(lambda x: [abbr for abbr in abbr_legal if abbr in x] if [abbr for abbr in abbr_legal if abbr in x] else None)


# max length abbreviation transformed in Upper found
customers_df['upper_abbr'] = customers_df['upper'].apply(lambda x: max([abbr for abbr in upper_abbr_legal if abbr in x], key=len) if [abbr for abbr in upper_abbr_legal if abbr in x] else None)
# list of all abbreviation transformed in Upper found
customers_df['upper_abbr_list'] = customers_df['upper'].apply(lambda x: [abbr for abbr in upper_abbr_legal if abbr in x] if [abbr for abbr in upper_abbr_legal if abbr in x] else None)

# max length abbreviation transformed in Upper found - clean names
customers_df['upper_abbr'] = customers_df['upper_clean'].apply(lambda x: max([abbr for abbr in upper_abbr_legal if abbr in x], key=len) if [abbr for abbr in upper_abbr_legal if abbr in x] else None)
# list of all abbreviation transformed in Upper found - clean names
customers_df['upper_abbr_list'] = customers_df['upper_clean'].apply(lambda x: [abbr for abbr in upper_abbr_legal if abbr in x] if [abbr for abbr in upper_abbr_legal if abbr in x] else None)

customers_df['LAST'] = customers_df['upper_last_part'].apply(lambda x: max([abbr for abbr in upper_abbr_legal if abbr in x], key=len) if [abbr for abbr in upper_abbr_legal if abbr in x] else None)

In [243]:
customers_df

Unnamed: 0,ID,Initial_Name,upper_clean,last_part,upper_last_part,TEST_1,TEST_2,abbr,abbr_list,upper_abbr,upper_abbr_list,LAST,TEST_3,TEST_4
0,200196804,Andritz Metals Germany GmbH,ANDRITZ METALS GERMANY GMBH,GmbH,GMBH,Andritz Metals Germany,ANDRITZ METALS GERMANY G,GmbH,[GmbH],GMBH,"[MBH, D, GMBH, AL]",GMBH,,
1,200328893,Marel Poultry BV,MAREL POULTRY BV,BV,BV,Marel Poultry,MAREL POULTRY,BV,[BV],OU,"[OU, BV]",BV,,
2,200325082,"""GREN Latvia"" SIA",GREN LATVIA SIA,SIA,SIA,"""GREN Latvia""",GREN LATVIA,SIA,[SIA],SIA,[SIA],SIA,,
3,200270728,"""IWPOL-BIS"" Pawel Jadach",IWPOL-BIS PAWEL JADACH,Jadach,JADACH,"""IWPOL-BIS"" Pawel Jadach",IWPOL-BIS PAWEL JADACH,,,D,[D],D,,
4,200240049,"""Paged-Sklejka"" S.A.",PAGED-SKLEJKA S.A.,S.A.,S.A.,"""Paged-Sklejka""",PAGED-SKLEJKA,S.A.,"[S., S.A, S.A., A.]",S.A.,"[AG, S., S.A, D, SK, S.A., A.]",S.A.,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14316,200280916,Zweckverband Bodensee Wasserversorgung,ZWECKVERBAND BODENSEE WASSERVERSORGUNG,Wasserversorgung,WASSERVERSORGUNG,Zweckverband Bodensee Wasserversorgung,ZWECKVERBAND BODENSEE WASSERVERSORGUNG,,,SE,"[SE, D, AS]",SE,,
14317,200281224,Zweckverband Bodensee-Wasserversorgung,ZWECKVERBAND BODENSEE-WASSERVERSORGUNG,Bodensee-Wasserversorgung,BODENSEE-WASSERVERSORGUNG,Zweckverband Bodensee-Wasserversorgung,ZWECKVERBAND BODENSEE-WASSERVERSORGUNG,,,SE,"[SE, D, AS]",SE,,
14318,200084476,Zweckverband Muellverwertung Schwandorf,ZWECKVERBAND MUELLVERWERTUNG SCHWANDORF,Schwandorf,SCHWANDORF,Zweckverband Muellverwertung Schwandorf,ZWECKVERBAND MUELLVERWERTUNG SCHWANDORF,,,D,[D],D,,
14319,200270641,ZWP Zahnradwerk Pritzwalk GmbH,ZWP ZAHNRADWERK PRITZWALK GMBH,GmbH,GMBH,ZWP Zahnradwerk Pritzwalk,ZWP ZAHNRADWERK PRITZWALK G,GmbH,[GmbH],GMBH,"[MBH, D, GMBH, AL]",GMBH,,


In [247]:
writer = pd.ExcelWriter('data_files/the_first_iteration.xlsx', engine='xlsxwriter')
customers_df.to_excel(writer, sheet_name='1')
writer.save()

In [None]:
"""
Cleaning old names. Step 1 - get a current SAP name of companies for Company ID (sold_to_customer)

1. Sort by FY (datetime), descending order
2. Creat 'company_id' list
3. check in cycle all lines of 'order' dataframe 
4. if Company ID (sold_to_customer) is in the list, then delete line  

"""

In [None]:
#forting by date, descending order for keeping the newest
order_sorted_by_FY_date = order.sort_values(by='FY', ascending=False).reset_index(drop = True)
#it is necessary exlude all not essentional inforamtion:
order_sorted_by_FY_date = order_sorted_by_FY_date.loc[:,['FY','sold_to_customer', 'sold_to_customer_n', 'OPCO']]

In [None]:
column_index = order_sorted_by_FY_date.columns.get_loc('sold_to_customer') #determine Company ID column index for iloc
company_id_exist_list = [] #creat 'company_id' list
result_df = order_sorted_by_FY_date.copy()

rng = len(order_sorted_by_FY_date)

for i in range(rng): #cycle for deleting repeting Company ID, saving the newest data
    
    if order_sorted_by_FY_date.iloc[i,column_index] not in company_id_exist_list:
        company_id_exist_list.append(order_sorted_by_FY_date.iloc[i,column_index])
    else:
        result_df = result_df.drop(i)
        print(i, rng, "Working : %.1f%% \r" % (i * 100 / rng), end='')

In [None]:
"""
easier way to do the same thing
sevral seconds vs 30 minutes

"""
result2_df = order_sorted_by_FY_date.copy()
result2_df.drop_duplicates(subset='sold_to_customer', keep='first', inplace=True, ignore_index=True)

writer = pd.ExcelWriter('data_files/The_newest_names.xlsx', engine='xlsxwriter')
result_df.to_excel(writer, sheet_name='1')
result2_df.to_excel(writer, sheet_name='2')
writer.save()

In [None]:
#to be continiued

In [None]:
customer_uniq = customer.copy()
customer_uniq = customer_uniq.loc[:, ['Customer', 'Customer Name']]
merg_customer_uniq = customer_uniq.merge(result2_df, how='left', left_on='Customer', right_on='sold_to_customer')
merg_customer_uniq.dropna()
result2_df.iloc[11,2].upper().replace('-' , ' ').replace('"' , '').replace("," , " ").replace("." , " ").strip().split()
#pattern = '|'.join(['-','"', ',' ,'.'])
result2_df.dropna(inplace = True)
result2_df['sold_to_customer_n'] = result2_df['sold_to_customer_n'].str.replace('\-|\,|\.|\"' , " ", regex = True)
result2_df['sold_to_customer_n'] = result2_df['sold_to_customer_n'].str.upper().str.strip()  #.str.split()