# Employee Turn Over

Problem Statement :

We would like to predict Employee Turn over based on employee application activity.

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
PATH_DATA='../../data/raw/'
PATH_DATA_INTERIM='../../data/iter2/interim/'

In [3]:

def q50(x):
    return x.quantile(0.5)

def q75(x):
    return x.quantile(0.75)

def get_unique_id(df) :
    return  df['employee'].astype(str)+'_'+df['companyAlias']

## Get The data

preview all table

In [4]:
data_churn=pd.read_csv(PATH_DATA+'churn.csv')
data_comment=pd.read_csv(PATH_DATA+'commentInteractions.csv')
data_votes=pd.read_csv(PATH_DATA+'votes.csv')
data_comment_cln0=pd.read_csv(PATH_DATA+'comments_clean_anonimized.csv')

In [5]:
## Happy today?
data_votes.head()

Unnamed: 0,employee,companyAlias,voteDate,vote
0,31,56aec740f1ef260003e307d6,Mon Feb 01 01:00:00 CET 2016,4
1,33,56aec740f1ef260003e307d6,Mon Feb 01 01:00:00 CET 2016,4
2,79,56aec740f1ef260003e307d6,Mon Feb 01 01:00:00 CET 2016,4
3,94,56aec740f1ef260003e307d6,Mon Feb 01 01:00:00 CET 2016,4
4,16,56aec740f1ef260003e307d6,Mon Feb 01 01:00:00 CET 2016,2


In [6]:
data_votes.shape

(221232, 4)

In [7]:
data_comment.head()

Unnamed: 0,employee,companyAlias,liked,disliked,commentId
0,307,56aec740f1ef260003e307d6,True,False,58d018d7e010990004e38070
1,36,56aec740f1ef260003e307d6,True,False,58d018d7e010990004e38070
2,276,56aec740f1ef260003e307d6,True,False,58d018d7e010990004e38070
3,24,56aec740f1ef260003e307d6,True,False,58d018d7e010990004e38070
4,382,56aec740f1ef260003e307d6,True,False,58d0179ae010990004e3806d


In [8]:
data_comment.shape

(336960, 5)

In [9]:
data_comment_cln0.head()

Unnamed: 0,employee,companyAlias,commentId,txt,likes,dislikes,commentDate
0,307,56aec740f1ef260003e307d6,58d018d7e010990004e38070,**********************************************...,4.0,0.0,Mon Mar 20 19:00:17 CET 2017
1,382,56aec740f1ef260003e307d6,58d0179ae010990004e3806d,*****************************,1.0,2.0,Mon Mar 20 18:55:16 CET 2017
2,172,56aec740f1ef260003e307d6,58cff8cde010990004e37f6a,***************************,3.0,0.0,Mon Mar 20 16:44:02 CET 2017
3,135,56aec740f1ef260003e307d6,58cfefeee010990004e37f60,***************************,1.0,1.0,Mon Mar 20 16:06:08 CET 2017
4,225,56aec740f1ef260003e307d6,58cfd9b4e010990004e37f52,*********************************,3.0,2.0,Mon Mar 20 14:30:50 CET 2017


In [10]:
data_comment_cln0.shape

(82756, 7)

In [11]:
data_churn.head()

Unnamed: 0,employee,companyAlias,numVotes,lastParticipationDate,stillExists
0,512,56aec740f1ef260003e307d6,4,Thu Feb 23 12:48:04 CET 2017,True
1,-2,56aec740f1ef260003e307d6,0,Wed Jan 18 14:00:55 CET 2017,False
2,2,56aec740f1ef260003e307d6,72,Fri Mar 17 01:00:00 CET 2017,True
3,487,56aec740f1ef260003e307d6,14,Sat Nov 19 15:02:14 CET 2016,False
4,3,56aec740f1ef260003e307d6,22,Thu Feb 16 01:00:00 CET 2017,True


In [12]:
data_churn.shape

(4847, 5)

## Formulate the solutions

    Each table has different level. Company level, Employee level, Comment level, or Votes level. 
    Since our target to create model that predict Employee turn over, then we must create features/predictor on Employee level.
    Also, we must have employee that used for training and employee for testing.
    
    Lets generate employee_target.
    

In [13]:
## One of data with Employee level information is data_churn which also contains the target of our model.
data_churn.head()

Unnamed: 0,employee,companyAlias,numVotes,lastParticipationDate,stillExists
0,512,56aec740f1ef260003e307d6,4,Thu Feb 23 12:48:04 CET 2017,True
1,-2,56aec740f1ef260003e307d6,0,Wed Jan 18 14:00:55 CET 2017,False
2,2,56aec740f1ef260003e307d6,72,Fri Mar 17 01:00:00 CET 2017,True
3,487,56aec740f1ef260003e307d6,14,Sat Nov 19 15:02:14 CET 2016,False
4,3,56aec740f1ef260003e307d6,22,Thu Feb 16 01:00:00 CET 2017,True


In [14]:
## Employee Id is not unique since different company may have same employee id
len(data_churn.employee.unique())==len(data_churn)

False

In [15]:
data_churn['unique_employee_ids'] = get_unique_id(data_churn)

In [16]:
len(data_churn.unique_employee_ids.unique())==len(data_churn) ## --> still not unique

False

In [17]:
len(data_churn.unique_employee_ids.unique()),len(data_churn)

(4806, 4847)

In [18]:
## for simplicity --> remove the double
employee_data_cnt = data_churn.groupby('unique_employee_ids').size().sort_values()
employee_not_unique = employee_data_cnt[employee_data_cnt>1].index.tolist()
print(len(employee_not_unique))

data_churn2 = data_churn[~(data_churn.unique_employee_ids.isin(employee_not_unique))]

37


In [19]:
len(data_churn2.unique_employee_ids.unique())==len(data_churn2) ## --> unique already.

True

In [20]:
df_employee_target = data_churn2[['unique_employee_ids','stillExists','companyAlias']]
df_employee_target.head()

Unnamed: 0,unique_employee_ids,stillExists,companyAlias
0,512_56aec740f1ef260003e307d6,True,56aec740f1ef260003e307d6
1,-2_56aec740f1ef260003e307d6,False,56aec740f1ef260003e307d6
2,2_56aec740f1ef260003e307d6,True,56aec740f1ef260003e307d6
4,3_56aec740f1ef260003e307d6,True,56aec740f1ef260003e307d6
5,-4_56aec740f1ef260003e307d6,False,56aec740f1ef260003e307d6


In [21]:
train_size = int(len(df_employee_target)*0.7) ##--> train size
print(train_size)

3338


In [22]:
## Get test and generate flag
employee_train = df_employee_target.sample(train_size,random_state = 103)['unique_employee_ids']
df_employee_target['is_train'] = df_employee_target.unique_employee_ids.isin(employee_train).astype(int)

In [23]:
df_employee_target.head()

Unnamed: 0,unique_employee_ids,stillExists,companyAlias,is_train
0,512_56aec740f1ef260003e307d6,True,56aec740f1ef260003e307d6,0
1,-2_56aec740f1ef260003e307d6,False,56aec740f1ef260003e307d6,1
2,2_56aec740f1ef260003e307d6,True,56aec740f1ef260003e307d6,0
4,3_56aec740f1ef260003e307d6,True,56aec740f1ef260003e307d6,0
5,-4_56aec740f1ef260003e307d6,False,56aec740f1ef260003e307d6,1


## Employee Features 1

    use data votes.
    remember --> Central European Time – CET Time Zone / European Central Time (Standard Time)
    remember --> hours only 1 and 2 somehow
    possible features in employee level :

    1. Avg Votes
    2. p50 Votes
    3. p75 Votes
    4. Cnt Votes
    5. Std Votes
    6. Avg Votes Monday
    7. p50 Votes Monday
    8. p75 Votes Monday
    9. Cnt Votes Monday
    10. Std Votes Monday
    11. Avg Votes Friday
    12. p50 Votes Friday
    13. p75 Votes Friday
    14. Cnt Votes Friday
    15. Std Votes Friday



In [24]:
data_votes.head()

Unnamed: 0,employee,companyAlias,voteDate,vote
0,31,56aec740f1ef260003e307d6,Mon Feb 01 01:00:00 CET 2016,4
1,33,56aec740f1ef260003e307d6,Mon Feb 01 01:00:00 CET 2016,4
2,79,56aec740f1ef260003e307d6,Mon Feb 01 01:00:00 CET 2016,4
3,94,56aec740f1ef260003e307d6,Mon Feb 01 01:00:00 CET 2016,4
4,16,56aec740f1ef260003e307d6,Mon Feb 01 01:00:00 CET 2016,2


In [25]:
data_votes['unique_employee_ids'] = get_unique_id(data_votes)
data_votes['voteDate2'] = pd.to_datetime(data_votes['voteDate'])

In [26]:
data_votes['day_name'] = data_votes['voteDate2'].dt.strftime('%a')
data_votes['votes_monday'] = np.where(data_votes['day_name']=='Mon',data_votes['vote'],np.NaN)
data_votes['votes_friday'] = np.where(data_votes['day_name']=='Fri',data_votes['vote'],np.NaN)

In [27]:
data_votes.head()

Unnamed: 0,employee,companyAlias,voteDate,vote,unique_employee_ids,voteDate2,day_name,votes_monday,votes_friday
0,31,56aec740f1ef260003e307d6,Mon Feb 01 01:00:00 CET 2016,4,31_56aec740f1ef260003e307d6,2016-02-01 01:00:00,Mon,4.0,
1,33,56aec740f1ef260003e307d6,Mon Feb 01 01:00:00 CET 2016,4,33_56aec740f1ef260003e307d6,2016-02-01 01:00:00,Mon,4.0,
2,79,56aec740f1ef260003e307d6,Mon Feb 01 01:00:00 CET 2016,4,79_56aec740f1ef260003e307d6,2016-02-01 01:00:00,Mon,4.0,
3,94,56aec740f1ef260003e307d6,Mon Feb 01 01:00:00 CET 2016,4,94_56aec740f1ef260003e307d6,2016-02-01 01:00:00,Mon,4.0,
4,16,56aec740f1ef260003e307d6,Mon Feb 01 01:00:00 CET 2016,2,16_56aec740f1ef260003e307d6,2016-02-01 01:00:00,Mon,2.0,


In [28]:
dg1 = data_votes.groupby('unique_employee_ids')

In [29]:
## group metrics
fun1 = ['count', np.mean, np.std, q50, q75]
fun1_dict = {}
col_f1 = ['unique_employee_ids']
for cols in ['vote', 'votes_monday', 'votes_friday']:
    fun1_dict[cols] = fun1
    for met in ['cnt','avg','std','p50','p75']:
        col_f1.append(cols+'_'+met)
        
emp_f1 = dg1.agg(fun1_dict).reset_index()
emp_f1.columns = col_f1

In [30]:
emp_f1.sample(8)

Unnamed: 0,unique_employee_ids,vote_cnt,vote_avg,vote_std,vote_p50,vote_p75,votes_monday_cnt,votes_monday_avg,votes_monday_std,votes_monday_p50,votes_monday_p75,votes_friday_cnt,votes_friday_avg,votes_friday_std,votes_friday_p50,votes_friday_p75
2225,294_54e52607e4b01191dc064966,29,3.241379,0.635563,3.0,4.0,4,3.25,0.5,3.0,3.25,6,3.5,0.83666,4.0,4.0
3618,66_56fd2b64f41c670003f643c8,36,1.166667,0.507093,1.0,1.0,7,1.428571,0.786796,1.0,1.5,4,1.0,0.0,1.0,1.0
390,127_5742d699f839a10003a407d2,28,3.214286,0.629941,3.0,4.0,5,3.2,0.447214,3.0,3.0,2,3.5,0.707107,3.5,3.75
3387,57_567011c035dce00003a07fa4,41,2.707317,0.642024,3.0,3.0,5,3.0,0.0,3.0,3.0,10,2.7,0.674949,3.0,3.0
1023,172_56aec740f1ef260003e307d6,77,3.883117,0.428401,4.0,4.0,13,3.923077,0.27735,4.0,4.0,15,4.0,0.0,4.0,4.0
3248,52_574c5ade56b6300003009965,69,2.565217,1.035804,3.0,3.0,10,1.9,1.197219,1.0,3.0,14,2.785714,0.892582,3.0,3.0
337,122_57bb2f0b3bae540003a8d453,59,3.677966,0.506532,4.0,4.0,9,3.444444,0.527046,3.0,4.0,11,3.636364,0.504525,4.0,4.0
8,100_574c5ade56b6300003009965,66,2.515152,0.769464,3.0,3.0,12,2.833333,0.717741,3.0,3.0,14,2.571429,0.755929,3.0,3.0


## Employee Features 2

    use data comment cln.
    Acpt likes --> likes > 0 
    possible features in employee level :

    1. Avg long text
    2. p50 long text
    3. p75 long text
    4. std long text
    5. cnt text
    6. Avg Acpt likes
    7. p50 Acpt likes
    8. p75 Acpt likes
    9. std Acpt likes
    10. cnt Acpt dislikes
    11. Avg Acpt dislikes
    12. p50 Acpt dislikes
    13. p75 Acpt dislikes
    14. std Acpt dislikes
    15. cnt Acpt dislikes
    16. cnt morning comment
    17. cnt night comment
    18. cnt lunch comment
    19. cnt weekend comment
    20. cnt weekday comment


In [31]:
data_comment_cln0[data_comment_cln0.commentDate.isna()]

Unnamed: 0,employee,companyAlias,commentId,txt,likes,dislikes,commentDate
82755,361,58a728a0e75bda00042a3468,58adb214b8242400048bf8a6,,,,


In [32]:
data_comment_cln1 = data_comment_cln0[~(data_comment_cln0.commentDate.isna())]

In [33]:
data_comment_cln1.head()

Unnamed: 0,employee,companyAlias,commentId,txt,likes,dislikes,commentDate
0,307,56aec740f1ef260003e307d6,58d018d7e010990004e38070,**********************************************...,4.0,0.0,Mon Mar 20 19:00:17 CET 2017
1,382,56aec740f1ef260003e307d6,58d0179ae010990004e3806d,*****************************,1.0,2.0,Mon Mar 20 18:55:16 CET 2017
2,172,56aec740f1ef260003e307d6,58cff8cde010990004e37f6a,***************************,3.0,0.0,Mon Mar 20 16:44:02 CET 2017
3,135,56aec740f1ef260003e307d6,58cfefeee010990004e37f60,***************************,1.0,1.0,Mon Mar 20 16:06:08 CET 2017
4,225,56aec740f1ef260003e307d6,58cfd9b4e010990004e37f52,*********************************,3.0,2.0,Mon Mar 20 14:30:50 CET 2017


In [34]:
data_comment_cln1['unique_employee_ids'] = get_unique_id(data_comment_cln1)
data_comment_cln1['commentDate2'] = pd.to_datetime(data_comment_cln1['commentDate'])
data_comment_cln1['len_txt'] = data_comment_cln1['txt'].str.len()

In [35]:
# data_comment_cln1[data_comment_cln1.len_txt==0]

In [36]:
data_comment_cln1['is_weekend'] = data_comment_cln1['commentDate2'].dt.strftime('%w').isin(['0','6']).astype(int)
data_comment_cln1['hours'] = data_comment_cln1['commentDate2'].dt.strftime('%H').astype(int)

data_comment_cln1['comment_weekend'] = np.where(data_comment_cln1['is_weekend']==1,data_comment_cln1['txt'],np.NaN)
data_comment_cln1['comment_weekday'] = np.where(data_comment_cln1['is_weekend']==0,data_comment_cln1['txt'],np.NaN)
data_comment_cln1['comment_morning'] = np.where((data_comment_cln1['hours']>=5)&
                                                (data_comment_cln1['hours']<10),data_comment_cln1['txt'],np.NaN)
data_comment_cln1['comment_night'] = np.where((data_comment_cln1['hours']>=19)&
                                                (data_comment_cln1['hours']<=23),data_comment_cln1['txt'],np.NaN)
data_comment_cln1['comment_lunch'] = np.where((data_comment_cln1['hours']>=11)&
                                                (data_comment_cln1['hours']<14),data_comment_cln1['txt'],np.NaN)

data_comment_cln1['acpt_likes'] = np.where(data_comment_cln1.likes>0,data_comment_cln1.likes,np.NaN)
data_comment_cln1['acpt_dislikes'] = np.where(data_comment_cln1.dislikes>0,data_comment_cln1.likes,np.NaN)

In [37]:
data_comment_cln1.sample(8)

Unnamed: 0,employee,companyAlias,commentId,txt,likes,dislikes,commentDate,unique_employee_ids,commentDate2,len_txt,is_weekend,hours,comment_weekend,comment_weekday,comment_morning,comment_night,comment_lunch,acpt_likes,acpt_dislikes
65110,28,53a2dd43e4b01cc02f1e9011,56f66a59dd0f35000302d147,*****************,0.0,0.0,Sat Mar 26 11:54:15 CET 2016,28_53a2dd43e4b01cc02f1e9011,2016-03-26 11:54:15,17.0,1,11,*****************,,,,*****************,,
62050,181,574c5ade56b6300003009965,579b42ef695129000331b329,**********************************************...,54.0,4.0,Fri Jul 29 13:38:05 CEST 2016,181_574c5ade56b6300003009965,2016-07-29 13:38:05,768.0,0,13,,**********************************************...,,,**********************************************...,54.0,54.0
3611,209,56aec740f1ef260003e307d6,577e1df670596c0003e535f1,**********************************************...,14.0,0.0,Thu Jul 07 11:07:06 CEST 2016,209_56aec740f1ef260003e307d6,2016-07-07 11:07:06,55.0,0,11,,**********************************************...,,,**********************************************...,14.0,
56074,25,56e2a905e3b6fe0003e32855,575062c267d9400003cd3507,**********************************************...,5.0,0.0,Thu Jun 02 18:45:18 CEST 2016,25_56e2a905e3b6fe0003e32855,2016-06-02 18:45:18,55.0,0,18,,**********************************************...,,,,5.0,
37528,113,54e52607e4b01191dc064966,5720f0b8f62d2f000303be36,**********************************************...,9.0,2.0,Wed Apr 27 18:49:09 CEST 2016,113_54e52607e4b01191dc064966,2016-04-27 18:49:09,160.0,0,18,,**********************************************...,,,,9.0,9.0
6239,217,56aec740f1ef260003e307d6,585c281dac0fe00004b24d33,**********************************************...,21.0,0.0,Thu Dec 22 20:22:10 CET 2016,217_56aec740f1ef260003e307d6,2016-12-22 20:22:10,136.0,0,20,,**********************************************...,,**********************************************...,,21.0,
18560,71,5809cc9eff2ea40003fda44d,589c9251ea82600004c79c77,**********************************************...,2.0,6.0,Thu Feb 09 16:59:59 CET 2017,71_5809cc9eff2ea40003fda44d,2017-02-09 16:59:59,87.0,0,16,,**********************************************...,,,,2.0,2.0
79399,6,5809cde3ff2ea40003fda452,587d04085a5d5d00047d355c,**********************************************...,2.0,0.0,Mon Jan 16 18:32:49 CET 2017,6_5809cde3ff2ea40003fda452,2017-01-16 18:32:49,82.0,0,18,,**********************************************...,,,,2.0,


In [38]:
dg2 = data_comment_cln1.groupby('unique_employee_ids')

In [39]:
## group metrics 1
fun2_1 = ['count', np.mean, np.std, q50, q75]
fun2_1_dict = {}
col_f2_1 = ['unique_employee_ids']
for cols in  ['acpt_likes','acpt_dislikes', 'len_txt'] :
    fun2_1_dict[cols] = fun2_1
    for met in ['cnt','avg','std','p50','p75']:
        col_f2_1.append(cols+'_'+met)
        
emp_f2_1 = dg2.agg(fun2_1_dict).reset_index()
emp_f2_1.columns = col_f2_1

In [40]:
## group metrics 1
fun2_2 = ['count']
fun2_2_dict = {}
col_f2_2 = ['unique_employee_ids']
for cols in ['comment_weekend','comment_weekday','comment_morning','comment_night','comment_lunch']:
    fun2_2_dict[cols] = fun2_2
    for met in ['cnt']:
        col_f2_2.append(cols+'_'+met)
        
emp_f2_2 = dg2.agg(fun2_2_dict).reset_index()
emp_f2_2.columns = col_f2_2

In [41]:
emp_f2_1.sample(3)

Unnamed: 0,unique_employee_ids,acpt_likes_cnt,acpt_likes_avg,acpt_likes_std,acpt_likes_p50,acpt_likes_p75,acpt_dislikes_cnt,acpt_dislikes_avg,acpt_dislikes_std,acpt_dislikes_p50,acpt_dislikes_p75,len_txt_cnt,len_txt_avg,len_txt_std,len_txt_p50,len_txt_p75
1774,287_5370af43e4b0cff95558c12a,2,5.0,0.0,5.0,5.0,0,,,,,2,49.0,0.0,49.0,49.0
3138,94_5641f96713664c000332c8cd,6,12.0,7.641989,11.0,18.5,4,12.5,9.814955,12.5,21.0,6,91.666667,76.117453,80.0,156.5
2083,35_57908a2622881200033b34d7,0,,,,,0,,,,,2,15.5,0.707107,15.5,15.75


In [42]:
emp_f2_2.sample(3)

Unnamed: 0,unique_employee_ids,comment_weekend_cnt,comment_weekday_cnt,comment_morning_cnt,comment_night_cnt,comment_lunch_cnt
2485,4_57dd2d6a4018d9000339ca43,0,8,0,6,2
2706,61_56fd2b64f41c670003f643c8,0,32,0,12,8
1596,254_57dd2d6a4018d9000339ca43,0,22,2,16,2


## Employee Features 3

    use data comment
    possible features in employee level :

    1. cnt gv liked
    2. cnt gv disliked
    3. total reaction


In [43]:
data_comment.head()

Unnamed: 0,employee,companyAlias,liked,disliked,commentId
0,307,56aec740f1ef260003e307d6,True,False,58d018d7e010990004e38070
1,36,56aec740f1ef260003e307d6,True,False,58d018d7e010990004e38070
2,276,56aec740f1ef260003e307d6,True,False,58d018d7e010990004e38070
3,24,56aec740f1ef260003e307d6,True,False,58d018d7e010990004e38070
4,382,56aec740f1ef260003e307d6,True,False,58d0179ae010990004e3806d


In [44]:
data_comment['unique_employee_ids'] = get_unique_id(data_comment)

In [45]:
data_comment['gv_liked'] = np.where(data_comment.liked==True,1,0)
data_comment['gv_disliked'] = np.where(data_comment.disliked==True,1,0)

In [46]:
dg3 = data_comment.groupby('unique_employee_ids')

In [47]:
emp_f3 = dg3.agg({'gv_liked':np.sum, 'gv_disliked':np.sum}).reset_index()
emp_f3.columns = ['unique_employee_ids','gv_liked_cnt','gv_disliked_cnt']
emp_f3['reaction_cnt'] = emp_f3['gv_liked_cnt'] + emp_f3['gv_disliked_cnt']

In [48]:
emp_f3.sample(3)

Unnamed: 0,unique_employee_ids,gv_liked_cnt,gv_disliked_cnt,reaction_cnt
424,105_5742d699f839a10003a407d2,62,1,63
957,156_58a728a0e75bda00042a3468,19,1,20
1312,195_574c5ade56b6300003009965,83,10,93


## Company level features

    collective engagement of all employee on a company may impact employee turn over. Such as :
    
    1. Company Turn over rate
    2. Company Avg Votes
    3. Company p50 Votes
    4. Company p75 Votes
    5. Company Cnt Votes
    6. Company Std Votes
    7. comment per employee (mean)
    8. txt len per employee (mean)
    9. comment per employee (p50)
    10. txt len per employee (p50)
    11. liked per employee (mean)
    12. disliked per employee (mean)
    12. liked per employee (p50)
    13. disliked per employee (p50)
    
    this information is obtained from trained data

In [49]:
## Turn over rate
df_employee_target_tr = df_employee_target[df_employee_target.is_train==1]
comp_f1 = (1-df_employee_target_tr.groupby('companyAlias').stillExists.mean()).reset_index()
comp_f1.columns = ['companyAlias','turn_over_rate']

comp_f1.sample(3)

Unnamed: 0,companyAlias,turn_over_rate
15,573f2c4a3517490003ef7710,0.0
6,5641f96713664c000332c8cd,0.139073
21,57bb2f0b3bae540003a8d453,0.055046


In [50]:
data_votes_tr = data_votes[data_votes.unique_employee_ids.isin(employee_train)]

In [51]:
len(data_votes_tr),len(data_votes)

(156281, 221232)

In [52]:
## Votes
dg3 = data_votes_tr.groupby('companyAlias')

fun4 = ['count', np.mean, np.std, q50, q75]
fun4_dict = {}
col_f4 = ['companyAlias']
for cols in ['vote']:
    fun4_dict[cols] = fun4
    for met in ['cnt','avg','std','p50','p75']:
        col_f4.append(cols+'_company_'+met)
        
comp_f2 = dg3.agg(fun4_dict).reset_index()
comp_f2.columns = col_f4

In [53]:
comp_f2.sample(3)

Unnamed: 0,companyAlias,vote_company_cnt,vote_company_avg,vote_company_std,vote_company_p50,vote_company_p75
29,5809cc9eff2ea40003fda44d,927,2.759439,0.978911,3,3
24,57d956302a040a00036a8905,4213,2.997626,0.865611,3,4
32,58a728a0e75bda00042a3468,2539,2.738874,0.877859,3,3


In [54]:
## comment cln1
data_comment_cln1_tr = data_comment_cln1[data_comment_cln1.unique_employee_ids.isin(employee_train)]

dg4_metric = data_comment_cln1_tr.groupby(['companyAlias','unique_employee_ids']).agg({'len_txt':['count',np.sum]}).reset_index()
dg4_metric.columns = ['companyAlias','unique_employee_ids','total_comment','total_comment_len']
comp_f3 = dg4_metric.groupby('companyAlias').mean().reset_index()
comp_f3.columns = ['companyAlias','comment_per_emp_avg','comment_len_per_emp_avg']
comp_f4 = dg4_metric.groupby('companyAlias').quantile(0.5).reset_index()
comp_f4.columns = ['companyAlias','comment_per_emp_p50','comment_len_per_emp_p50']

print(comp_f3.sample(3))
print(comp_f4.sample(3))

                companyAlias  comment_per_emp_avg  comment_len_per_emp_avg
15  5742d699f839a10003a407d2            36.149533              7273.644860
23  57d956302a040a00036a8905            21.264368              2465.908046
25  57dd2d6a4018d9000339ca43            14.603774              1855.075472
                companyAlias  comment_per_emp_p50  comment_len_per_emp_p50
20  57bb2f0b3bae540003a8d453                 14.0                   1058.0
18  57908a2622881200033b34d7                  8.0                    272.0
5   552e2d00e4b066b42fd122ed                 11.0                    219.0


In [55]:
## comment reaction
data_comment_tr = data_comment[data_comment.unique_employee_ids.isin(employee_train)]
dg5_metric = data_comment_tr.groupby(['companyAlias','unique_employee_ids']).agg({'gv_liked':[np.sum],'gv_disliked':[np.sum]}).reset_index()
dg5_metric.columns = ['companyAlias','unique_employee_ids','gv_liked_exc_cnt','gv_disliked_exc_cnt']

comp_f5 = dg5_metric.groupby('companyAlias').mean().reset_index()
comp_f5.columns = ['companyAlias','gv_liked_per_emp_avg','gv_disliked_per_emp_avg']
comp_f6 = dg5_metric.groupby('companyAlias').quantile(0.5).reset_index()
comp_f6.columns = ['companyAlias','gv_liked_per_emp_p50','gv_disliked_per_emp_p50']

## Merge Features

In [56]:
empl_features = [emp_f1, emp_f2_1, emp_f2_2, emp_f3]
comp_features = [comp_f1, comp_f2, comp_f3, comp_f4, comp_f5, comp_f6]

# all_df_features = empl_features + comp_features

In [57]:
df_interim = df_employee_target.copy()
for d in empl_features:
    df_interim = pd.merge(df_interim, d, on = 'unique_employee_ids', how = 'left')
                          
for d in comp_features:
    df_interim = pd.merge(df_interim, d, on = 'companyAlias', how = 'left')

In [58]:
df_interim

Unnamed: 0,unique_employee_ids,stillExists,companyAlias,is_train,vote_cnt,vote_avg,vote_std,vote_p50,vote_p75,votes_monday_cnt,...,vote_company_p50,vote_company_p75,comment_per_emp_avg,comment_len_per_emp_avg,comment_per_emp_p50,comment_len_per_emp_p50,gv_liked_per_emp_avg,gv_disliked_per_emp_avg,gv_liked_per_emp_p50,gv_disliked_per_emp_p50
0,512_56aec740f1ef260003e307d6,True,56aec740f1ef260003e307d6,0,4.0,2.500000,1.290994,2.5,3.25,1.0,...,4,4,40.497238,3133.790055,24.0,1522.0,105.754902,30.171569,35.5,4.0
1,-2_56aec740f1ef260003e307d6,False,56aec740f1ef260003e307d6,1,,,,,,,...,4,4,40.497238,3133.790055,24.0,1522.0,105.754902,30.171569,35.5,4.0
2,2_56aec740f1ef260003e307d6,True,56aec740f1ef260003e307d6,0,72.0,2.250000,1.031203,2.0,3.00,13.0,...,4,4,40.497238,3133.790055,24.0,1522.0,105.754902,30.171569,35.5,4.0
3,3_56aec740f1ef260003e307d6,True,56aec740f1ef260003e307d6,0,22.0,3.454545,0.738549,4.0,4.00,2.0,...,4,4,40.497238,3133.790055,24.0,1522.0,105.754902,30.171569,35.5,4.0
4,-4_56aec740f1ef260003e307d6,False,56aec740f1ef260003e307d6,1,,,,,,,...,4,4,40.497238,3133.790055,24.0,1522.0,105.754902,30.171569,35.5,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4764,857_58a728a0e75bda00042a3468,True,58a728a0e75bda00042a3468,1,1.0,3.000000,,3.0,3.00,0.0,...,3,3,9.088889,931.844444,6.5,360.0,35.469027,10.668142,19.5,4.0
4765,858_58a728a0e75bda00042a3468,True,58a728a0e75bda00042a3468,0,1.0,3.000000,,3.0,3.00,0.0,...,3,3,9.088889,931.844444,6.5,360.0,35.469027,10.668142,19.5,4.0
4766,859_58a728a0e75bda00042a3468,True,58a728a0e75bda00042a3468,1,1.0,4.000000,,4.0,4.00,0.0,...,3,3,9.088889,931.844444,6.5,360.0,35.469027,10.668142,19.5,4.0
4767,17_573a0671b5ec330003add34a,True,573a0671b5ec330003add34a,1,7.0,3.571429,0.534522,4.0,4.00,2.0,...,4,4,,,,,,,,


In [59]:
df_interim.to_csv(PATH_DATA_INTERIM+'iter1_interim.csv',index = False)