In [65]:
import pandas as pd
# https://docs.google.com/spreadsheets/d/1OdjccfGlv3lsuiWgIAHbE8id91FpVaU2EsaZo5kknaA/edit?usp=sharing
import numpy as np

In [66]:

from io import BytesIO
import requests
spreadsheet_id = '1OdjccfGlv3lsuiWgIAHbE8id91FpVaU2EsaZo5kknaA'
file_name = 'https://docs.google.com/spreadsheets/d/{}/export?format=csv'.format(spreadsheet_id)
r = requests.get(file_name)
train = pd.read_csv(BytesIO(r.content))


In [67]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19104 entries, 0 to 19103
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   MMM-YY                19104 non-null  object
 1   Emp_ID                19104 non-null  int64 
 2   Age                   19104 non-null  int64 
 3   Gender                19104 non-null  object
 4   City                  19104 non-null  object
 5   Education_Level       19104 non-null  object
 6   Salary                19104 non-null  int64 
 7   Dateofjoining         19104 non-null  object
 8   LastWorkingDate       1616 non-null   object
 9   Joining Designation   19104 non-null  int64 
 10  Designation           19104 non-null  int64 
 11  Total Business Value  19104 non-null  int64 
 12  Quarterly Rating      19104 non-null  int64 
dtypes: int64(7), object(6)
memory usage: 1.9+ MB


In [68]:
def get_info(df):

    print("Head:")
    display(df.head())
    print()
    
    print("Info:")
    display(df.info())
    print()
    
    print("Description of quantitative parameters:")
    display(df.describe())
    print()
    
    print("Description of categorical parameters:")
    display(df.describe(include='object'))
    print()
    
    print("Columns with NaN values:")
    display(df.isna().sum())
    print()
    
    print("Shape:")
    display(df.shape)
    print()
    
    print("Number of duplicated rows:")
    display(df.duplicated().sum())  

In [69]:
get_info(train)

Head:

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19104 entries, 0 to 19103
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   MMM-YY                19104 non-null  object
 1   Emp_ID                19104 non-null  int64 
 2   Age                   19104 non-null  int64 
 3   Gender                19104 non-null  object
 4   City                  19104 non-null  object
 5   Education_Level       19104 non-null  object
 6   Salary                19104 non-null  int64 
 7   Dateofjoining         19104 non-null  object
 8   LastWorkingDate       1616 non-null   object
 9   Joining Designation   19104 non-null  int64 
 10  Designation           19104 non-null  int64 
 11  Total Business Value  19104 non-null  int64 
 12  Quarterly Rating      19104 non-null  int64 
dtypes: int64(7), object(6)
memory usage: 1.9+ MB

Description of quantitative parameters:

Description of categorical parameter

Unnamed: 0,MMM-YY,Emp_ID,Age,Gender,City,Education_Level,Salary,Dateofjoining,LastWorkingDate,Joining Designation,Designation,Total Business Value,Quarterly Rating
0,2016-01-01,1,28,Male,C23,Master,57387,2015-12-24,,1,1,2381060,2
1,2016-02-01,1,28,Male,C23,Master,57387,2015-12-24,,1,1,-665480,2
2,2016-03-01,1,28,Male,C23,Master,57387,2015-12-24,2016-03-11,1,1,0,2
3,2017-11-01,2,31,Male,C7,Master,67016,2017-11-06,,2,2,0,1
4,2017-12-01,2,31,Male,C7,Master,67016,2017-11-06,,2,2,0,1


None

Unnamed: 0,Emp_ID,Age,Salary,Joining Designation,Designation,Total Business Value,Quarterly Rating
count,19104.0,19104.0,19104.0,19104.0,19104.0,19104.0,19104.0
mean,1415.591133,34.650283,65652.025126,1.690536,2.25267,571662.1,2.008899
std,810.705321,6.264471,30914.515344,0.836984,1.026512,1128312.0,1.009832
min,1.0,21.0,10747.0,1.0,1.0,-6000000.0,1.0
25%,710.0,30.0,42383.0,1.0,1.0,0.0,1.0
50%,1417.0,34.0,60087.0,1.0,2.0,250000.0,2.0
75%,2137.0,39.0,83969.0,2.0,3.0,699700.0,3.0
max,2788.0,58.0,188418.0,5.0,5.0,33747720.0,4.0


Unnamed: 0,MMM-YY,Gender,City,Education_Level,Dateofjoining,LastWorkingDate
count,19104,19104,19104,19104,19104,1616
unique,24,2,29,3,869,493
top,2016-01-01,Male,C20,Bachelor,2012-07-23,2017-07-29
freq,1022,11103,1008,6864,192,70


MMM-YY                      0
Emp_ID                      0
Age                         0
Gender                      0
City                        0
Education_Level             0
Salary                      0
Dateofjoining               0
LastWorkingDate         17488
Joining Designation         0
Designation                 0
Total Business Value        0
Quarterly Rating            0
dtype: int64

(19104, 13)

0

# Processing Dates:
We have added LastMonthOnWork (datetime feature) and MonthsTillLast (int feature)

In [79]:
train['date'] = pd.to_datetime(train['MMM-YY'], format='%Y-%m-%d')
train['LastWorkingDate'] = pd.to_datetime(train['LastWorkingDate'], format='%Y-%m-%d')

print(train.date.min(), train.date.max())
# max_date = pd.datetime.today()  # to adapt easier for other datetime windows
max_date = train.date.max()

# we are adding 3 days before getting month so that dates in the end of the month were rounded to the next month.
# this is done because for persons whose LastWorkingDate is in the last 1-3 days there exists a data entry on the next month
train['LastMonthOnWork'] = (train['LastWorkingDate'].fillna(max_date) + pd.Timedelta(days=3)).dt.to_period('M').dt.to_timestamp()
train['MonthsTillLast'] = ((train.LastMonthOnWork - train.date)/np.timedelta64(1, 'M')).astype(int)

2016-01-01 00:00:00 2017-12-01 00:00:00


In [71]:
get_info(train)

Head:

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19104 entries, 0 to 19103
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   MMM-YY                19104 non-null  object        
 1   Emp_ID                19104 non-null  int64         
 2   Age                   19104 non-null  int64         
 3   Gender                19104 non-null  object        
 4   City                  19104 non-null  object        
 5   Education_Level       19104 non-null  object        
 6   Salary                19104 non-null  int64         
 7   Dateofjoining         19104 non-null  object        
 8   LastWorkingDate       1616 non-null   datetime64[ns]
 9   Joining Designation   19104 non-null  int64         
 10  Designation           19104 non-null  int64         
 11  Total Business Value  19104 non-null  int64         
 12  Quarterly Rating      19104 non-null  int64         
 13  dat

Unnamed: 0,MMM-YY,Emp_ID,Age,Gender,City,Education_Level,Salary,Dateofjoining,LastWorkingDate,Joining Designation,Designation,Total Business Value,Quarterly Rating,date,LastMonthOnWork,MonthsTillLast
0,2016-01-01,1,28,Male,C23,Master,57387,2015-12-24,NaT,1,1,2381060,2,2016-01-01,2017-12-01,22
1,2016-02-01,1,28,Male,C23,Master,57387,2015-12-24,NaT,1,1,-665480,2,2016-02-01,2017-12-01,21
2,2016-03-01,1,28,Male,C23,Master,57387,2015-12-24,2016-03-11,1,1,0,2,2016-03-01,2016-03-01,0
3,2017-11-01,2,31,Male,C7,Master,67016,2017-11-06,NaT,2,2,0,1,2017-11-01,2017-12-01,0
4,2017-12-01,2,31,Male,C7,Master,67016,2017-11-06,NaT,2,2,0,1,2017-12-01,2017-12-01,0


None

Unnamed: 0,Emp_ID,Age,Salary,Joining Designation,Designation,Total Business Value,Quarterly Rating,MonthsTillLast
count,19104.0,19104.0,19104.0,19104.0,19104.0,19104.0,19104.0,19104.0
mean,1415.591133,34.650283,65652.025126,1.690536,2.25267,571662.1,2.008899,10.296273
std,810.705321,6.264471,30914.515344,0.836984,1.026512,1128312.0,1.009832,7.444162
min,1.0,21.0,10747.0,1.0,1.0,-6000000.0,1.0,0.0
25%,710.0,30.0,42383.0,1.0,1.0,0.0,1.0,4.0
50%,1417.0,34.0,60087.0,1.0,2.0,250000.0,2.0,10.0
75%,2137.0,39.0,83969.0,2.0,3.0,699700.0,3.0,17.0
max,2788.0,58.0,188418.0,5.0,5.0,33747720.0,4.0,22.0


Unnamed: 0,MMM-YY,Gender,City,Education_Level,Dateofjoining
count,19104,19104,19104,19104,19104
unique,24,2,29,3,869
top,2016-01-01,Male,C20,Bachelor,2012-07-23
freq,1022,11103,1008,6864,192


MMM-YY                      0
Emp_ID                      0
Age                         0
Gender                      0
City                        0
Education_Level             0
Salary                      0
Dateofjoining               0
LastWorkingDate         17488
Joining Designation         0
Designation                 0
Total Business Value        0
Quarterly Rating            0
date                        0
LastMonthOnWork             0
MonthsTillLast              0
dtype: int64

(19104, 16)

0

# Checking for strange behaviour in dates:

In [72]:
train[train['MonthsTillLast'] == -1].head(20)

Unnamed: 0,MMM-YY,Emp_ID,Age,Gender,City,Education_Level,Salary,Dateofjoining,LastWorkingDate,Joining Designation,Designation,Total Business Value,Quarterly Rating,date,LastMonthOnWork,MonthsTillLast


In [73]:
print(len(train[train['MonthsTillLast'] == -1]))


0


In [74]:
# train[train['LastWorkingDate'] < train['date']].head(20)

Unnamed: 0,MMM-YY,Emp_ID,Age,Gender,City,Education_Level,Salary,Dateofjoining,LastWorkingDate,Joining Designation,Designation,Total Business Value,Quarterly Rating,date,LastMonthOnWork,MonthsTillLast
67,2016-05-01,18,27,Female,C17,Bachelor,31631,2016-01-09,2016-04-30,1,1,0,1,2016-05-01,2016-05-01,0
169,2016-12-01,30,31,Male,C6,Bachelor,69457,2016-10-03,2016-11-29,3,3,0,1,2016-12-01,2016-12-01,0
196,2017-08-01,36,41,Female,C18,Bachelor,32865,2016-07-01,2017-07-29,2,2,-325140,2,2017-08-01,2017-08-01,0
253,2016-03-01,46,36,Female,C25,Master,42171,2015-10-28,2016-02-27,1,1,0,1,2016-03-01,2016-03-01,0
579,2017-08-01,82,37,Female,C24,Bachelor,89013,2017-04-11,2017-07-29,3,3,0,1,2017-08-01,2017-08-01,0
645,2017-08-01,89,28,Male,C2,Bachelor,58977,2016-05-30,2017-07-29,2,2,0,1,2017-08-01,2017-08-01,0
690,2017-08-01,99,30,Male,C23,College,38068,2017-03-24,2017-07-29,2,2,0,1,2017-08-01,2017-08-01,0
870,2017-04-01,120,27,Male,C14,Master,114258,2016-09-28,2017-03-29,3,3,0,1,2017-04-01,2017-04-01,0
911,2017-08-01,130,38,Male,C28,Bachelor,39990,2017-01-17,2017-07-29,2,2,0,1,2017-08-01,2017-08-01,0
977,2017-11-01,141,35,Male,C22,Bachelor,65901,2017-07-10,2017-10-31,2,2,0,1,2017-11-01,2017-11-01,0


In [75]:
# print(len(train[train['LastWorkingDate'] < train['date'].dt.to_period('M').dt.to_timestamp()]))

277


In [80]:
date_overflow = train[(train['LastWorkingDate'] < train['date'].dt.to_period('M').dt.to_timestamp())]
# print(date_overflow)

In [81]:
date_diff = date_overflow['LastWorkingDate'] - date_overflow['date']
print(date_diff.min(), date_diff.max())

-3 days +00:00:00 -1 days +00:00:00
