Author: Kevin ALBERT

Created: Feb 2021

In [1]:
import pandas as pd

In [2]:
# Notebook display limits. Run pd.describe_option('display') to list every display option.
# Note: larger limits can slow the notebook down.

# maximum width of a column, in characters (default 50)
pd.options.display.max_colwidth = 100
# maximum number of columns in view (default 20)
pd.options.display.max_columns = 40
# maximum number of rows in view (default 60)
pd.options.display.max_rows = 60

In [3]:
# record the conda, Python and pandas package versions of the environment running this notebook
!conda -V
!python -V
!conda list |grep pandas

conda 4.9.2
Python 3.8.6
pandas                    1.2.1            py38h51da96c_0    conda-forge/label/main
pandas-profiling          2.10.0             pyhd8ed1ab_0    conda-forge/label/main


## load data

In [7]:
# Read the processed survey file and the three sentiment files, in that order,
# each into its own DataFrame.
synthetic_df, sentiment1_df, sentiment2_df, sentiment3_df = map(
    pd.read_csv,
    (
        "../../neo4j/import/synthetic_data_processed.csv",
        "../../neo4j/import/sentiment1.csv",
        "../../neo4j/import/sentiment2.csv",
        "../../neo4j/import/sentiment3.csv",
    ),
)

## summary

In [20]:
# concise summary: number of rows and columns, non-null count and dtype per column, memory use
synthetic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 15 columns):
 #   Column                                                                            Non-Null Count  Dtype  
---  ------                                                                            --------------  -----  
 0   email                                                                             100 non-null    object 
 1   full_name                                                                         100 non-null    object 
 2   how_was_your_last_month_in_your_assignment                                        100 non-null    int64  
 3   How_Was_Your_Last_Month_Within_Your_Department                                    100 non-null    int64  
 4   How_Was_Your_Last_Month_With_Us                                                   100 non-null    int64  
 5   What_Felt_Best_During_This_Last_Month                                             100 non-null    object 
 6   Wha

## data checks

In [21]:
# count distinct (= unique) values per column; dropna=False counts NaN as a value of its own,
# and the result is sorted from highest to lowest cardinality
# NOTE(review): the original note called "> 390" high cardinality; where that threshold
# comes from is not visible in this notebook — confirm or drop it
synthetic_df.nunique(dropna=False).sort_values(ascending=False)

email                                                                               100
id                                                                                  100
full_name                                                                            91
What_Felt_Best_During_This_Last_Month                                                38
What_Should_We_Do_To_Make_You_Feel_Better_And_Increase_Your_Overall_Satisfaction     38
Any_Additional_Suggestion_To_Improve                                                 16
how_was_your_last_month_in_your_assignment                                            4
How_Was_Your_Last_Month_Within_Your_Department                                        4
How_Was_Your_Last_Month_With_Us                                                       4
improve_sentiment                                                                     3
feeling_sentiment                                                                     3
satisfaction_sentiment          

In [22]:
# percentage of missing values per column, rounded to one decimal, sorted from lowest to highest
# (more than 5% missing is considered high)
(
    synthetic_df.isnull()
    .sum()
    .mul(100)
    .div(len(synthetic_df))
    .round(1)
    .sort_values(ascending=True)
)

email                                                                                 0.0
full_name                                                                             0.0
how_was_your_last_month_in_your_assignment                                            0.0
How_Was_Your_Last_Month_Within_Your_Department                                        0.0
How_Was_Your_Last_Month_With_Us                                                       0.0
What_Felt_Best_During_This_Last_Month                                                 0.0
What_Should_We_Do_To_Make_You_Feel_Better_And_Increase_Your_Overall_Satisfaction      0.0
Any_Additional_Suggestion_To_Improve                                                  0.0
id                                                                                    0.0
improve_sentiment                                                                     0.0
feeling_sentiment                                                                     0.0
satisfacti

In [11]:
# display the survey frame (the rendered output elides the middle rows)
synthetic_df

Unnamed: 0,email,full_name,how_was_your_last_month_in_your_assignment,How_Was_Your_Last_Month_Within_Your_Department,How_Was_Your_Last_Month_With_Us,What_Felt_Best_During_This_Last_Month,What_Should_We_Do_To_Make_You_Feel_Better_And_Increase_Your_Overall_Satisfaction,Any_Additional_Suggestion_To_Improve,any_additional_comments,I_Would_Like_To_Get_Called_By,I_Would_Like_To_Provide_A_Copy_Of_My_Answers_To,id
0,Lorem.ipsum@congueelit.ca,Harrison,4,4,4,"collaborating closely with Damien, closing the deal with ucb","keep the good vibes, the trust and open communication , we rock !",more sync between the different departments,,,,0
1,elit.elit.fermentum@Crasinterdum.ca,Clinton,4,4,4,being promoted,maintain the good collaboration,keep up the good energy,,,,1
2,semper@mi.com,Scott,3,3,2,team building we had last month,"more consideration for the employes, making them partners in management decision",Don’t be friends during work hours.,,,,2
3,tellus.non@velsapien.co.uk,Russell,1,1,1,not much for me,I don't feel that my skills and capacities are taking into accounts,If a few managers took a chill pill.,,,,3
4,vitae@nisinibhlacinia.ca,Kane,4,4,3,getting to work with you guys is a real pleasure,less micro management,I would like to talk to the managers just to see where I am in the company. And let me dye my ha...,,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...
95,erat.vel@egestasrhoncusProin.edu,Victor,4,4,2,No Feeling,No Satisfaction,No Improvement,,,,95
96,In@vitaeorci.net,Gareth,2,2,1,No Feeling,No Satisfaction,No Improvement,,,,96
97,elementum@necurnaet.ca,Avram,4,4,4,No Feeling,No Satisfaction,No Improvement,,,,97
98,quis.pede.Suspendisse@risus.edu,Coby,4,4,4,No Feeling,No Satisfaction,No Improvement,,,,98


In [13]:
# display the first sentiment table (columns shown: document, sentiment)
sentiment1_df

Unnamed: 0,document,sentiment
0,more sync between the different departments,neutral
1,keep up the good energy,positive
2,Don’t be friends during work hours.,negative
3,If a few managers took a chill pill.,neutral
4,I would like to talk to the managers just to see where I am in the company. And let me dye my ha...,positive
...,...,...
95,none,neutral
96,none,neutral
97,none,neutral
98,none,neutral


In [14]:
# display the second sentiment table (columns shown: document, sentiment)
sentiment2_df

Unnamed: 0,document,sentiment
0,"collaborating closely with Damien, closing the deal with ucb",neutral
1,being promoted,neutral
2,team building we had last month,neutral
3,not much for me,negative
4,getting to work with you guys is a real pleasure,positive
...,...,...
95,none,neutral
96,none,neutral
97,none,neutral
98,none,neutral


In [15]:
# display the third sentiment table (columns shown: document, sentiment)
sentiment3_df

Unnamed: 0,document,sentiment
0,"keep the good vibes, the trust and open communication , we rock !",positive
1,maintain the good collaboration,positive
2,"more consideration for the employes, making them partners in management decision",neutral
3,I don't feel that my skills and capacities are taking into accounts,negative
4,less micro management,neutral
...,...,...
95,none,neutral
96,none,neutral
97,none,neutral
98,none,neutral


## data cleaning

In [12]:
# put ../scripts at the front of the module search path so the import below can resolve
# NOTE(review): presumably blobstorage.py lives in ../scripts — confirm
import sys  
sys.path.insert(0, '../scripts')
# helper module; its write_blob() is called in the "save new dataset" cell
import blobstorage
# NOTE(review): BytesIO is not referenced in any visible cell — confirm it is still needed
from io import BytesIO

#### The idea is to create prediction features based on the sentiment of each free-text answer

In [17]:
# Attach one sentiment label column per sentiment table.
# pandas aligns an assigned Series on the index, so a sentiment table whose index differs
# from synthetic_df (for example a different row count) would silently yield NaN labels.
# Check the alignment first and fail loudly instead.
sentiment_sources = {
    'improve_sentiment': sentiment1_df,
    'feeling_sentiment': sentiment2_df,
    'satisfaction_sentiment': sentiment3_df,
}
for sentiment_column, sentiment_df in sentiment_sources.items():
    if not sentiment_df.index.equals(synthetic_df.index):
        raise ValueError(
            f"cannot attach '{sentiment_column}': the sentiment table has {len(sentiment_df)} rows "
            f"but synthetic_df has {len(synthetic_df)}; their indexes must match"
        )
    # same index-aligned assignment as before, now known to be safe
    synthetic_df[sentiment_column] = sentiment_df["sentiment"]

## save new dataset

In [19]:
# store to cloud datalake as *.parquet (preserve schema)
# NOTE(review): write_blob is a project helper not shown here; presumably it serialises the
# frame to parquet and uploads it under the given blob path — confirm
blobstorage.write_blob("silver/synthetic_data_processed.parquet", synthetic_df)

# store locally as a comma-separated *.csv file, without writing the index
synthetic_df.to_csv("../../data/silver/synthetic_data_processed.csv", sep=',', index=False)
# store locally as *.parquet
synthetic_df.to_parquet("../../data/silver/synthetic_data_processed.parquet")