In [27]:
import numpy as np
import pandas as pd
import math

In [5]:
from io import BytesIO
import requests
spreadsheet_id = '1OdjccfGlv3lsuiWgIAHbE8id91FpVaU2EsaZo5kknaA'
file_name = 'https://docs.google.com/spreadsheets/d/{}/export?format=csv'.format(spreadsheet_id)
r = requests.get(file_name)
train = pd.read_csv(BytesIO(r.content))

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19104 entries, 0 to 19103
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   MMM-YY                19104 non-null  object
 1   Emp_ID                19104 non-null  int64 
 2   Age                   19104 non-null  int64 
 3   Gender                19104 non-null  object
 4   City                  19104 non-null  object
 5   Education_Level       19104 non-null  object
 6   Salary                19104 non-null  int64 
 7   Dateofjoining         19104 non-null  object
 8   LastWorkingDate       1616 non-null   object
 9   Joining Designation   19104 non-null  int64 
 10  Designation           19104 non-null  int64 
 11  Total Business Value  19104 non-null  int64 
 12  Quarterly Rating      19104 non-null  int64 
dtypes: int64(7), object(6)
memory usage: 1.9+ MB


In [7]:
def get_info(df):

    print("Head:")
    display(df.head())
    print()
    
    print("Info:")
    display(df.info())
    print()
    
    print("Description of quantitative parameters:")
    display(df.describe())
    print()
    
    print("Description of categorical parameters:")
    display(df.describe(include='object'))
    print()
    
    print("Columns with NaN values:")
    display(df.isna().sum())
    print()
    
    print("Shape:")
    display(df.shape)
    print()
    
    print("Number of duplicated rows:")
    display(df.duplicated().sum())  

In [8]:
get_info(train)

Head:


Unnamed: 0,MMM-YY,Emp_ID,Age,Gender,City,Education_Level,Salary,Dateofjoining,LastWorkingDate,Joining Designation,Designation,Total Business Value,Quarterly Rating
0,2016-01-01,1,28,Male,C23,Master,57387,2015-12-24,,1,1,2381060,2
1,2016-02-01,1,28,Male,C23,Master,57387,2015-12-24,,1,1,-665480,2
2,2016-03-01,1,28,Male,C23,Master,57387,2015-12-24,2016-03-11,1,1,0,2
3,2017-11-01,2,31,Male,C7,Master,67016,2017-11-06,,2,2,0,1
4,2017-12-01,2,31,Male,C7,Master,67016,2017-11-06,,2,2,0,1



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19104 entries, 0 to 19103
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   MMM-YY                19104 non-null  object
 1   Emp_ID                19104 non-null  int64 
 2   Age                   19104 non-null  int64 
 3   Gender                19104 non-null  object
 4   City                  19104 non-null  object
 5   Education_Level       19104 non-null  object
 6   Salary                19104 non-null  int64 
 7   Dateofjoining         19104 non-null  object
 8   LastWorkingDate       1616 non-null   object
 9   Joining Designation   19104 non-null  int64 
 10  Designation           19104 non-null  int64 
 11  Total Business Value  19104 non-null  int64 
 12  Quarterly Rating      19104 non-null  int64 
dtypes: int64(7), object(6)
memory usage: 1.9+ MB


None


Description of quantitative parameters:


Unnamed: 0,Emp_ID,Age,Salary,Joining Designation,Designation,Total Business Value,Quarterly Rating
count,19104.0,19104.0,19104.0,19104.0,19104.0,19104.0,19104.0
mean,1415.591133,34.650283,65652.025126,1.690536,2.25267,571662.1,2.008899
std,810.705321,6.264471,30914.515344,0.836984,1.026512,1128312.0,1.009832
min,1.0,21.0,10747.0,1.0,1.0,-6000000.0,1.0
25%,710.0,30.0,42383.0,1.0,1.0,0.0,1.0
50%,1417.0,34.0,60087.0,1.0,2.0,250000.0,2.0
75%,2137.0,39.0,83969.0,2.0,3.0,699700.0,3.0
max,2788.0,58.0,188418.0,5.0,5.0,33747720.0,4.0



Description of categorical parameters:


Unnamed: 0,MMM-YY,Gender,City,Education_Level,Dateofjoining,LastWorkingDate
count,19104,19104,19104,19104,19104,1616
unique,24,2,29,3,869,493
top,2016-01-01,Male,C20,Bachelor,2012-07-23,2017-07-29
freq,1022,11103,1008,6864,192,70



Columns with NaN values:


MMM-YY                      0
Emp_ID                      0
Age                         0
Gender                      0
City                        0
Education_Level             0
Salary                      0
Dateofjoining               0
LastWorkingDate         17488
Joining Designation         0
Designation                 0
Total Business Value        0
Quarterly Rating            0
dtype: int64


Shape:


(19104, 13)


Number of duplicated rows:


0

In [44]:
ids = set(train['Emp_ID'])
eployee_dfs = [train[train['Emp_ID'] == i] for i in ids]

salaries_fired = {}
salaries_not_fired = {}
for employee_df in eployee_dfs:
    if not employee_df['LastWorkingDate'].isnull().values.all():
        salaries_fired[employee_df.iloc[0]['Emp_ID']] = (employee_df['Salary'].max() - employee_df['Salary'].min()) / employee_df['Salary'].min()
    else:
        salaries_not_fired[employee_df.iloc[0]['Emp_ID']] = (employee_df['Salary'].max() - employee_df['Salary'].min()) / employee_df['Salary'].min()
        
    

In [43]:
print("Percent of raised salaries for fired people: ", len([x for x in salaries_fired if salaries_fired[x] != 0.0]) / len(salaries_fired) * 100)
print("Percent of raised salaries for not fired people: ", len([x for x in salaries_not_fired if salaries_not_fired[x] != 0.0]) / len(salaries_not_fired) * 100)

Percent of raised salaried for fired people:  0.18564356435643564
Percent of raised salaried for not fired people:  5.359477124183006


In [76]:
employee_features = {}
for employee_df in eployee_dfs:
    id = employee_df.iloc[0]['Emp_ID']
    employee_features[id] = {}
    employee_features[id]['Salary Change'] = (employee_df['Salary'].max() - employee_df['Salary'].min()) / employee_df['Salary'].min()
    employee_features[id]['Salary changed'] = employee_features[id]['Salary Change'] != 0
    employee_features[id]['Total Business Value All'] = employee_df['Total Business Value'].sum()
    employee_features[id]['Overvalue'] = (employee_df['Total Business Value'] / employee_df['Salary']).mean()
    employee_features[id]['Fired'] = employee_df['LastWorkingDate'].isnull().values.all()   
    
    
    

In [77]:
employee_features = pd.DataFrame.from_dict(employee_features, orient='index')

In [78]:
employee_features

Unnamed: 0,Salary Change,Salary changed,Total Business Value All,Overvalue,Fired
1,0.0,False,1715580,9.964975,False
2,0.0,False,0,0.000000,True
4,0.0,False,350000,1.067024,False
5,0.0,False,120360,0.865252,False
6,0.0,False,1265000,3.213596,True
...,...,...,...,...,...
2784,0.0,False,21748820,10.942472,True
2785,0.0,False,0,0.000000,False
2786,0.0,False,2815090,8.843307,False
2787,0.0,False,977830,2.344984,False


In [79]:
check_feature_for_fired_not_fired(df, 'Overvalue')

For fired:  7.998073216273239
For not fired:  4.09988720477746


In [80]:
check_feature_for_fired_not_fired(df, 'Total Business Value All')

For fired:  9620626.31372549
For not fired:  2203745.761138614


In [81]:
employee_features[employee_features['Fired'] == True]['Total Business Value All'].mean()

9620626.31372549

In [82]:
def check_feature_for_fired_not_fired(df, feature):
    print('For fired: ', df[df['Fired'] == True][feature].mean())
    print('For not fired: ', df[df['Fired'] == False][feature].mean())    