# 1. Import Data 

## Visualisation of Raw Data (acctt) - Accountants

In [1]:
import pandas as pd

# Accountant master table. The displayed columns are Accountant (display
# name), ACC (upper-case code), RANK, RATE and PRODUCTIVITY.
acctt = pd.read_csv('ACC.csv')

# Unique accountant codes, in order of first appearance in the file.
accts = acctt['ACC'].unique()
# Single-argument print(): same output as the Python 2 statement form, and
# also valid on Python 3. Shows the unique codes followed by their count.
print("%s %s" % (accts, len(accts)))

acctt

['ANGELA' 'CAIYEAN' 'CHWEELIAN' 'DINA' 'GERALDINE' 'HOAYBEE' 'JASON'
 'JOSEPH' 'RAYMOND' 'SEETLEE' 'SERENE' 'SIEWJIN'] 12


Unnamed: 0,Accountant,ACC,RANK,RATE,PRODUCTIVITY
0,Angela,ANGELA,Junior,15,0.0
1,Cai Yean,CAIYEAN,Senior,35,71.6
2,Chwee Lian,CHWEELIAN,Middle,25,0.0
3,Dina,DINA,Senior,35,67.79
4,Geraldine,GERALDINE,Senior,35,0.0
5,Hoay Bee,HOAYBEE,Middle,25,38.62
6,Jason,JASON,Junior,15,42.85
7,Joseph,JOSEPH,Junior,15,0.0
8,Raymond,RAYMOND,Junior,15,0.0
9,Seet Lee,SEETLEE,Senior,35,86.82


## Visualisation of Raw Data (projt) - Project Breakdown

In [2]:
# Per-project, per-accountant breakdown. The raw HR column is discarded and
# HR_ROUND takes over the HR name (presumably the rounded hours - confirm
# against the data source).
projt = (
    pd.read_csv('PI.csv')
    .drop(['HR'], axis=1)
    .rename(columns={'HR_ROUND': 'HR'})
)

projt.head(10)

Unnamed: 0,PROJ,FEE,Accountant,PROD,HR
0,3H aircon,3900,Angela,0.0,29.5
1,3H aircon,3900,Cai Yean,71.6,35.5
2,3H aircon,3900,Seet Lee,86.82,6.0
3,Active Links Airconditioning Engrg,2900,Angela,0.0,2.5
4,Active Links Airconditioning Engrg,2900,Seet Lee,86.82,6.0
5,Active Links Airconditioning Engrg,2900,Siew Jin,0.0,3.5
6,ADCT Technologies,6500,Raymond,0.0,42.0
7,ADCT Technologies,6500,Seet Lee,86.82,11.5
8,ADCT Technologies,6500,Joseph,0.0,16.0
9,ADCT Technologies,6500,Siew Jin,0.0,19.5


## Replace projt's accountant names with their capitalised codes (ACC)

In [3]:
# Attach each row's upper-case accountant code (ACC) by left-joining the
# name -> code lookup taken from acctt.
name_to_code = acctt[['Accountant', 'ACC']]
merged = pd.merge(projt, name_to_code, how='left', on=['Accountant'])

# A left join keeps unmatched rows with ACC = NaN, and duplicate names in the
# lookup would multiply project rows. Either would silently corrupt the later
# per-project name joins and hour totals, so fail loudly here instead.
if len(merged) != len(projt):
    raise ValueError("acctt has duplicate 'Accountant' names; the merge changed projt's row count")
unmatched = merged.loc[merged['ACC'].isnull(), 'Accountant'].unique()
if len(unmatched) > 0:
    raise ValueError("No ACC code found for accountant(s): %s" % list(unmatched))

# From here on the code replaces the display name.
projt = merged.drop(['Accountant'], axis=1)
projt.head()

Unnamed: 0,PROJ,FEE,PROD,HR,ACC
0,3H aircon,3900,0.0,29.5,ANGELA
1,3H aircon,3900,71.6,35.5,CAIYEAN
2,3H aircon,3900,86.82,6.0,SEETLEE
3,Active Links Airconditioning Engrg,2900,0.0,2.5,ANGELA
4,Active Links Airconditioning Engrg,2900,86.82,6.0,SEETLEE


# 2. Generate Summary Stats for Data Cleaning

## Count unique projects (& No. of accountants for each proj)

In [4]:
# Distinct project names, in order of first appearance.
projects = projt['PROJ'].unique()
# Single-argument print(): identical output on Python 2, valid on Python 3.
print("No. of unique projs: %s" % len(projects))

# Rows per project (the section heading reads this as the number of
# accountants per project): the ten largest counts, then the ten smallest.
proj_counts = projt['PROJ'].value_counts()
print(proj_counts.head(10))
proj_counts.tail(10)

No. of unique projs: 97
Linear Precision                   5
Jia Yi Air-Conditioning            5
Linear Tooling                     5
Ultrachem (S) Private Limited      5
Central Granite Industries         4
Perma Shipping Line (Singapore)    4
Netball Singapore                  4
Cicada Private Limited             4
ADCT Technologies                  4
Aldon Technologies Services        4
Name: PROJ, dtype: int64


Changi Sailing Club                  1
Raffles Treats Pte Limited           1
Pace Flow Control                    1
SSA_Water Polo                       1
World Auto                           1
Chelliah & Kiang                     1
Innochem                             1
Bachmann                             1
StarChase Motorsports (Singapore)    1
Institute of Shipping Management     1
Name: PROJ, dtype: int64

## Denormalize projt table by compiling accountant names (delimiter = ', ')

In [5]:
# http://stackoverflow.com/questions/20625582/how-to-deal-with-this-pandas-warning
# Silences SettingWithCopyWarning globally. This cell does not need it (it
# builds `result` as an explicit copy below), but a later cell still drops
# columns in place on a de-duplicated frame, so the setting is kept.
pd.options.mode.chained_assignment = None  # default='warn'

# Single-argument print(): identical output on Python 2, valid on Python 3.
print("prev row count: %s" % (projt.shape,))

# http://stackoverflow.com/questions/27298178/concatenate-strings-from-several-rows-using-pandas-groupby
# Broadcast two per-project aggregates back onto every row of the project:
# the ', '-joined accountant codes and the summed hours.
projt['ACCS'] = projt.groupby(['PROJ'])['ACC'].transform(lambda codes: ', '.join(codes))
projt['TOTAL_HR'] = projt.groupby(['PROJ'])['HR'].transform(lambda hrs: sum(hrs))

# Keep one row per project (the first one) and drop the per-accountant
# columns. Built as an independent copy so that later column assignments on
# `result` never write into a slice of projt.
# NOTE(review): the surviving PROD value is simply that of the first-listed
# accountant of each project, not a project-level figure.
result = projt.drop_duplicates(['PROJ']).drop(['ACC', 'HR'], axis=1).copy()
print("post row count: %s" % (result.shape,))
result.head(10)

prev row count: (242, 5)
post row count: (97, 5)


Unnamed: 0,PROJ,FEE,PROD,ACCS,TOTAL_HR
0,3H aircon,3900,0.0,"ANGELA, CAIYEAN, SEETLEE",71.0
3,Active Links Airconditioning Engrg,2900,0.0,"ANGELA, SEETLEE, SIEWJIN",12.0
6,ADCT Technologies,6500,0.0,"RAYMOND, SEETLEE, JOSEPH, SIEWJIN",89.0
10,Additions Lifestyle,2900,71.6,"CAIYEAN, ANGELA, RAYMOND",83.5
13,Aidha Ltd,1900,38.62,"HOAYBEE, SEETLEE, JASON",75.0
16,Aim Aircon Engineering,3000,0.0,"RAYMOND, CAIYEAN, ANGELA",88.5
19,Aim Fire Systems Engineering,3000,71.6,"CAIYEAN, ANGELA, SEETLEE",57.5
22,Al- us Enterprises,500,86.82,"SEETLEE, JASON, JOSEPH",16.0
25,Alan Telecom,1200,71.6,"CAIYEAN, HOAYBEE",12.5
27,Aldon Technologies Services,7500,0.0,"RAYMOND, JOSEPH, SEETLEE, SIEWJIN",302.0


## Check for duplicate employees within each project in projt

In [6]:
# Print every project whose accountant list names the same code more than once.
duplicate = False

# Walk the project names and their ', '-joined accountant lists together
# instead of indexing row by row with .iloc.
for proj_name, joined_accs in zip(result['PROJ'], result['ACCS']):
    acclist = joined_accs.split(", ")
    # A set collapses repeats, so a length mismatch means a repeated code.
    if len(acclist) != len(set(acclist)):
        duplicate = True
        # Single-argument print(): identical output on Python 2, valid on Python 3.
        print("%s %s" % (proj_name, acclist))

if not duplicate:
    print("There are no employee duplicates for each project.")

There are no employee duplicates for each project.


## Calc Project Prod Index, Total Cost, Total Accountants

In [7]:
def get_hr(proj, acc):
    """Return the HR value of the first projt row matching project and accountant.

    Parameters:
        proj: value compared against projt.PROJ.
        acc: value compared against projt.ACC.

    Raises:
        IndexError: if no row matches both conditions.
    """
    # Combine both conditions into one mask over projt. Filtering twice in a
    # row applied a full-length mask to an already-filtered frame, which
    # pandas has to re-align (and warns about).
    matches = projt[(projt.PROJ == proj) & (projt.ACC == acc)]
    return matches.iloc[0]['HR']
def get_rate(acc):
    """Return the RATE of the first acctt row whose ACC equals `acc`.

    Raises IndexError when no row in acctt carries that code.
    """
    rows_for_acc = acctt[acctt.ACC == acc]
    first_row = rows_for_acc.iloc[0]
    return first_row['RATE']
def get_total_cost(proj):
    """Return the sum of every '<ACC>_COST' column for one project.

    Reads the first row of `result` whose PROJ equals `proj` and adds up the
    cost column of each code in `accts`. The per-accountant '<ACC>_COST'
    columns must already exist on `result`.

    Raises:
        IndexError: if no row of `result` matches `proj`.
    """
    # Look the project's row up once instead of once per accountant.
    row = result[result.PROJ == proj].iloc[0]
    return sum(row[acc + '_COST'] for acc in accts)

In [8]:
# Exact accountant lists per project. Splitting first (instead of testing
# `acc in "A, B, C"` on the joined string) avoids substring false positives:
# a code that is contained in another code would otherwise be flagged on the
# wrong projects and then fail the hours lookup.
members_by_row = [accs.split(", ") for accs in result.ACCS]

for acc in accts:
    rate = get_rate(acc)
    # 1/0 flag: is this accountant listed on the project?
    result[acc] = [1 if acc in members else 0 for members in members_by_row]
    # HR booked by this accountant on the project (0 when not listed).
    result[acc + '_HR'] = [get_hr(proj, acc) if flag == 1 else 0
                           for proj, flag in zip(result.PROJ, result[acc])]
    # Cost = HR multiplied by the accountant's RATE.
    result[acc + '_COST'] = [hr * rate for hr in result[acc + '_HR']]

result['TOTAL_COST'] = [get_total_cost(proj) for proj in result.PROJ]
# FEE divided by total hours, computed element-wise on the columns.
result['TOTAL_PROD'] = result['FEE'] / result['TOTAL_HR']
result['TOTAL_ACC'] = [len(members) for members in members_by_row]

# Names of all per-accountant columns, so later cells can drop them in one go.
accts_extended = []
for acc in accts:
    accts_extended.extend([acc, acc + '_HR', acc + '_COST'])
# Single-argument print(): identical output on Python 2, valid on Python 3.
print(accts_extended)
result.head()

['ANGELA', 'ANGELA_HR', 'ANGELA_COST', 'CAIYEAN', 'CAIYEAN_HR', 'CAIYEAN_COST', 'CHWEELIAN', 'CHWEELIAN_HR', 'CHWEELIAN_COST', 'DINA', 'DINA_HR', 'DINA_COST', 'GERALDINE', 'GERALDINE_HR', 'GERALDINE_COST', 'HOAYBEE', 'HOAYBEE_HR', 'HOAYBEE_COST', 'JASON', 'JASON_HR', 'JASON_COST', 'JOSEPH', 'JOSEPH_HR', 'JOSEPH_COST', 'RAYMOND', 'RAYMOND_HR', 'RAYMOND_COST', 'SEETLEE', 'SEETLEE_HR', 'SEETLEE_COST', 'SERENE', 'SERENE_HR', 'SERENE_COST', 'SIEWJIN', 'SIEWJIN_HR', 'SIEWJIN_COST']




Unnamed: 0,PROJ,FEE,PROD,ACCS,TOTAL_HR,ANGELA,ANGELA_HR,ANGELA_COST,CAIYEAN,CAIYEAN_HR,...,SEETLEE_COST,SERENE,SERENE_HR,SERENE_COST,SIEWJIN,SIEWJIN_HR,SIEWJIN_COST,TOTAL_COST,TOTAL_PROD,TOTAL_ACC
0,3H aircon,3900,0.0,"ANGELA, CAIYEAN, SEETLEE",71.0,1,29.5,442.5,1,35.5,...,210.0,0,0,0,0,0.0,0.0,1895.0,54.929577,3
3,Active Links Airconditioning Engrg,2900,0.0,"ANGELA, SEETLEE, SIEWJIN",12.0,1,2.5,37.5,0,0.0,...,210.0,0,0,0,1,3.5,87.5,335.0,241.666667,3
6,ADCT Technologies,6500,0.0,"RAYMOND, SEETLEE, JOSEPH, SIEWJIN",89.0,0,0.0,0.0,0,0.0,...,402.5,0,0,0,1,19.5,487.5,1760.0,73.033708,4
10,Additions Lifestyle,2900,71.6,"CAIYEAN, ANGELA, RAYMOND",83.5,1,31.0,465.0,1,27.0,...,0.0,0,0,0,0,0.0,0.0,1792.5,34.730539,3
13,Aidha Ltd,1900,38.62,"HOAYBEE, SEETLEE, JASON",75.0,0,0.0,0.0,0,0.0,...,157.5,0,0,0,0,0.0,0.0,1690.0,25.333333,3


In [9]:
# Project-level view: hide the per-accountant flag/_HR/_COST columns (fewer
# columns, not fewer rows) and show the first 15 projects.
project_view = result.drop(accts_extended, axis=1)
project_view.head(15)

Unnamed: 0,PROJ,FEE,PROD,ACCS,TOTAL_HR,TOTAL_COST,TOTAL_PROD,TOTAL_ACC
0,3H aircon,3900,0.0,"ANGELA, CAIYEAN, SEETLEE",71.0,1895.0,54.929577,3
3,Active Links Airconditioning Engrg,2900,0.0,"ANGELA, SEETLEE, SIEWJIN",12.0,335.0,241.666667,3
6,ADCT Technologies,6500,0.0,"RAYMOND, SEETLEE, JOSEPH, SIEWJIN",89.0,1760.0,73.033708,4
10,Additions Lifestyle,2900,71.6,"CAIYEAN, ANGELA, RAYMOND",83.5,1792.5,34.730539,3
13,Aidha Ltd,1900,38.62,"HOAYBEE, SEETLEE, JASON",75.0,1690.0,25.333333,3
16,Aim Aircon Engineering,3000,0.0,"RAYMOND, CAIYEAN, ANGELA",88.5,2337.5,33.898305,3
19,Aim Fire Systems Engineering,3000,71.6,"CAIYEAN, ANGELA, SEETLEE",57.5,1552.5,52.173913,3
22,Al- us Enterprises,500,86.82,"SEETLEE, JASON, JOSEPH",16.0,260.0,31.25,3
25,Alan Telecom,1200,71.6,"CAIYEAN, HOAYBEE",12.5,337.5,96.0,2
27,Aldon Technologies Services,7500,0.0,"RAYMOND, JOSEPH, SEETLEE, SIEWJIN",302.0,7470.0,24.834437,4


In [10]:
# Projects handled by exactly one accountant, without the per-accountant
# columns and without the cost / hour totals.
single_acc_projects = result[result.TOTAL_ACC == 1]
single_acc_projects.drop(list(accts_extended) + ['TOTAL_COST', 'TOTAL_HR'], axis=1)

Unnamed: 0,PROJ,FEE,PROD,ACCS,TOTAL_PROD,TOTAL_ACC
39,Bachmann,2800,38.62,HOAYBEE,38.62069,1
57,CGI Ind,4300,0.0,GERALDINE,103.614458,1
58,Changi Sailing Club,4500,0.0,CHWEELIAN,50.0,1
59,Chelliah & Kiang,1200,0.0,SERENE,48.0,1
90,Fong Foundation,4500,67.79,DINA,67.669173,1
100,Innochem,1000,42.85,JASON,42.553191,1
103,Institute of Shipping Management,900,0.0,JOSEPH,40.909091,1
123,Lingfine Jewellery,2200,0.0,GERALDINE,28.75817,1
133,Montrico Realty,900,0.0,JOSEPH,62.068966,1
142,Pace Flow Control,2900,71.6,CAIYEAN,71.604938,1


In [None]:
# Summary statistics for the project-level columns only (per-accountant
# columns removed first).
project_level = result.drop(accts_extended, axis=1)
project_level.describe()

In [None]:
# Re-display the accountant table (name, code, RANK, RATE, PRODUCTIVITY) for reference.
acctt

In [None]:
def get_row(prod):
    """Return the rows of `result` whose TOTAL_PROD equals `prod`.

    The per-accountant columns listed in `accts_extended`, plus ACCS and
    TOTAL_ACC, are left out of the returned frame.
    """
    hidden_cols = list(accts_extended) + ['ACCS', 'TOTAL_ACC']
    matching = result[result.TOTAL_PROD == prod]
    return matching.drop(hidden_cols, axis=1)
# Show the project row(s) whose TOTAL_PROD is exactly 760.
# NOTE(review): 760 is a hard-coded value whose origin is not shown in this
# notebook, and the lookup relies on exact float equality - confirm.
get_row(760)

In [None]:
# Distinct project productivity values, highest first.
prods = sorted(result.TOTAL_PROD.unique(), reverse=True)
# Single-argument print(): identical output on Python 2, valid on Python 3.
print(prods)

# Show the rows behind the ten highest values. Slicing (rather than indexing
# 0..9) keeps this from raising IndexError when there are fewer than ten
# distinct values.
for prod in prods[:10]:
    print(get_row(prod))

In [None]:
# http://stackoverflow.com/questions/27298178/concatenate-strings-from-several-rows-using-pandas-groupby

# Scratch re-derivation of the one-row-per-project table. It is bound to its
# own name so that `result` keeps the per-accountant columns (ANGELA,
# ANGELA_HR, ...) which the cells below still read; rebuilding `result` from
# projt here would drop them.
projt['TOTAL_HR'] = projt.groupby(['PROJ'])['HR'].transform(lambda hrs: sum(hrs))

# Explicit copy instead of an in-place drop on a de-duplicated frame.
result_rebuilt = projt.drop_duplicates(['PROJ']).drop(['ACC', 'HR'], axis=1).copy()
# Single-argument print(): identical output on Python 2, valid on Python 3.
print("post row count: %s" % (result_rebuilt.shape,))
result_rebuilt.head(10)

In [None]:
# Name of the first project in `result` (by position).
# Single-argument print(): identical output on Python 2, valid on Python 3.
print(result.PROJ.iloc[0])

In [None]:
# Spot-check the ANGELA membership flag: the raw value for the first project,
# then whether the sixth project (position 5) is flagged.
# Single-argument print(): identical output on Python 2, valid on Python 3.
print(result['ANGELA'].iloc[0])
print(result['ANGELA'].iloc[5] == 1)

In [None]:
# Number of entries in the ACCS column, i.e. the number of rows in `result`.
len(result['ACCS'])

In [None]:
# Second project by position. `result` keeps projt's original row labels
# (0, 3, 6, 10, ...) after drop_duplicates, so the label-based .loc[1] has no
# row to return and raises KeyError; .iloc is positional.
print(result['PROJ'].iloc[1])

In [None]:
# Preview the first five rows of the per-project table.
result.head()

In [None]:
# For each accountant code: the first two projects (alphabetically) that the
# accountant appears on, joined with ', '.
# groupby(...).apply(...) yields one value per accountant, indexed by ACC, and
# .map() looks each code up in it. A groupby transform instead returns one
# value per projt row; assigning that to acctsummary aligns on row labels, so
# each summary row would receive the projects of whichever accountant sits in
# the projt row with the same label.
first_two_projs = (
    projt[['PROJ', 'ACC']]
    .sort_values(by=['ACC', 'PROJ'])
    .groupby('ACC')['PROJ']
    .apply(lambda names: ', '.join(names.iloc[:2]))
)

acctsummary = pd.DataFrame({'ACC': accts})  # one row per accountant code
acctsummary['PROJS'] = acctsummary['ACC'].map(first_two_projs)
acctsummary

In [None]:
# Variant of the summary: the first two projects per accountant in projt's
# own row order (groupby keeps rows in their original order within a group,
# so no explicit sort is needed).
# Aggregated per accountant and looked up with .map(); a groupby transform
# would return one value per projt row and be aligned to acctsummary by row
# label rather than by accountant.
first_two_in_file_order = (
    projt.groupby('ACC')['PROJ']
    .apply(lambda names: ', '.join(names.iloc[:2]))
)
acctsummary['PROJS'] = acctsummary['ACC'].map(first_two_in_file_order)
acctsummary.head(12)

In [None]:
# Full (project, accountant) listing ordered by accountant code, then project.
ordered_rows = projt.sort_values(by=['ACC', 'PROJ'])
ordered_rows[['PROJ', 'ACC']]

In [None]:
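# TODO: for each project with exactly one accountant (TOTAL_ACC == 1), find that
# accountant and add the project's productivity to a list.
# TODO: check whether the collected values need de-conflicting (e.g. when one
# accountant has several single-accountant projects).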
