Author: Kevin ALBERT

Created: Feb 2021

In [1]:
import pandas as pd

In [2]:
# Notebook-wide pandas display settings.
# pd.describe_option('display') would list every available option; it stays
# commented out because, per the original note, it can slow down the notebook.
display_settings = {
    'display.max_colwidth': 100,  # default 50, maximum width in characters of a column
    'display.max_columns': 40,    # default 20, maximum number of columns in view
    'display.max_rows': 60,       # default 60, maximum number of rows in view
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [3]:
# record the conda and Python versions this notebook was executed with
!conda -V
!python -V
# list only the installed conda packages whose line contains "pandas"
!conda list |grep pandas

conda 4.9.2
Python 3.8.6
pandas                    1.2.1            py38h51da96c_0    conda-forge/label/main
pandas-profiling          2.10.0             pyhd8ed1ab_0    conda-forge/label/main


## load data

In [4]:
# raw input CSV from the bronze data folder, addressed relative to the notebook location
bronze_csv_path = "../../data/bronze/sdworxHRhackathon2021.csv"
synthetic_df = pd.read_csv(bronze_csv_path)

## summary

In [5]:
# concise summary of the frame: index range, column names, non-null counts per
# column, data types and memory use (a non-null count below the number of
# entries means the column contains NaN's)
synthetic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
 #   Column                                                                            Non-Null Count  Dtype  
---  ------                                                                            --------------  -----  
 0   email                                                                             100 non-null    object 
 1   full_name                                                                         100 non-null    object 
 2   how_was_your_last_month_in_your_assignment                                        100 non-null    int64  
 3   How_Was_Your_Last_Month_Within_Your_Department                                    100 non-null    int64  
 4   How_Was_Your_Last_Month_With_Us                                                   100 non-null    int64  
 5   What_Felt_Best_During_This_Last_Month                                             37 non-null     object 
 6   Wha

## data checks

In [6]:
# number of distinct values per column; dropna=False counts NaN as a value of its
# own, and the result is listed from most to fewest distinct values
# NOTE(review): the original comment's "high cardinality > 390" cutoff cannot be
# reached by this 100-row frame - confirm the intended threshold
distinct_per_column = synthetic_df.nunique(dropna=False)
distinct_per_column.sort_values(ascending=False)

email                                                                               100
full_name                                                                            91
What_Felt_Best_During_This_Last_Month                                                38
What_Should_We_Do_To_Make_You_Feel_Better_And_Increase_Your_Overall_Satisfaction     38
Any_Additional_Suggestion_To_Improve                                                 16
how_was_your_last_month_in_your_assignment                                            4
How_Was_Your_Last_Month_Within_Your_Department                                        4
How_Was_Your_Last_Month_With_Us                                                       4
any_additional_comments                                                               1
I_Would_Like_To_Get_Called_By                                                         1
I_Would_Like_To_Provide_A_Copy_Of_My_Answers_To                                       1
dtype: int64

In [7]:
# percentage of missing values per column, rounded to one decimal and listed from
# the lowest to the highest share (the original note treats > 5% missing as high)
missing_counts = synthetic_df.isnull().sum()
missing_pct = missing_counts * 100 / len(synthetic_df)
missing_pct.round(1).sort_values(ascending=True)

email                                                                                 0.0
full_name                                                                             0.0
how_was_your_last_month_in_your_assignment                                            0.0
How_Was_Your_Last_Month_Within_Your_Department                                        0.0
How_Was_Your_Last_Month_With_Us                                                       0.0
What_Felt_Best_During_This_Last_Month                                                63.0
What_Should_We_Do_To_Make_You_Feel_Better_And_Increase_Your_Overall_Satisfaction     63.0
Any_Additional_Suggestion_To_Improve                                                 85.0
any_additional_comments                                                             100.0
I_Would_Like_To_Get_Called_By                                                       100.0
I_Would_Like_To_Provide_A_Copy_Of_My_Answers_To                                     100.0
dtype: float64

In [8]:
# eyeball the records themselves; the frame has 100 rows, more than the
# display.max_rows setting of 60, so pandas renders a truncated view
# NOTE(review): the rendered output exposes the email and full_name columns -
# confirm the data is synthetic before sharing the executed notebook
synthetic_df

Unnamed: 0,email,full_name,how_was_your_last_month_in_your_assignment,How_Was_Your_Last_Month_Within_Your_Department,How_Was_Your_Last_Month_With_Us,What_Felt_Best_During_This_Last_Month,What_Should_We_Do_To_Make_You_Feel_Better_And_Increase_Your_Overall_Satisfaction,Any_Additional_Suggestion_To_Improve,any_additional_comments,I_Would_Like_To_Get_Called_By,I_Would_Like_To_Provide_A_Copy_Of_My_Answers_To
0,Lorem.ipsum@congueelit.ca,Harrison,4,4,4,"collaborating closely with Damien, closing the deal with ucb","keep the good vibes, the trust and open communication , we rock !",more sync between the different departments,,,
1,elit.elit.fermentum@Crasinterdum.ca,Clinton,4,4,4,being promoted,maintain the good collaboration,keep up the good energy,,,
2,semper@mi.com,Scott,3,3,2,team building we had last month,"more consideration for the employes, making them partners in management decision",Don’t be friends during work hours.,,,
3,tellus.non@velsapien.co.uk,Russell,1,1,1,not much for me,I don't feel that my skills and capacities are taking into accounts,If a few managers took a chill pill.,,,
4,vitae@nisinibhlacinia.ca,Kane,4,4,3,getting to work with you guys is a real pleasure,less micro management,I would like to talk to the managers just to see where I am in the company. And let me dye my ha...,,,
...,...,...,...,...,...,...,...,...,...,...,...
95,erat.vel@egestasrhoncusProin.edu,Victor,4,4,2,,,,,,
96,In@vitaeorci.net,Gareth,2,2,1,,,,,,
97,elementum@necurnaet.ca,Avram,4,4,4,,,,,,
98,quis.pede.Suspendisse@risus.edu,Coby,4,4,4,,,,,,


## data cleaning

In [9]:
import sys
# put ../scripts first on the module search path so the import below can resolve;
# presumably that folder holds the project-local blobstorage module - TODO confirm
sys.path.insert(0, '../scripts')
import blobstorage
# NOTE(review): BytesIO is not referenced in the visible cells - confirm it is still needed
from io import BytesIO

## save new dataset

In [10]:
# destinations for the silver copy of the dataset
blob_parquet_name = "silver/synthetic_data_processed.parquet"
local_csv_path = "../../data/silver/synthetic_data_processed.csv"
local_parquet_path = "../../data/silver/synthetic_data_processed.parquet"

# cloud data lake copy, stored as *.parquet (preserve schema)
blobstorage.write_blob(blob_parquet_name, synthetic_df)

# local copies: comma-separated *.csv without the index column, then *.parquet
synthetic_df.to_csv(local_csv_path, sep=',', index=False)
synthetic_df.to_parquet(local_parquet_path)