Author: Kevin ALBERT

Created: Feb 2021

In [None]:
# Optional one-time setup: uncomment to install the parquet engines
# (presumably needed by the DataFrame.to_parquet calls further down).
# !pip install pyarrow fastparquet

In [1]:
# Show which version of the azure-ai-textanalytics Python SDK is installed in this environment:
!pip freeze | grep azure-ai-textanalytics

azure-ai-textanalytics==5.0.0


Generated API keys and endpoints

In [2]:
# Credentials are read from the environment so that no secret is stored in the notebook.
# Export AZURE_TEXTANALYTICS_KEY (and optionally AZURE_TEXTANALYTICS_ENDPOINT) before
# starting Jupyter. An unset key is left as an empty string here; authentication will
# then fail on the first request made with it.
# NOTE(review): any key that was committed in an earlier version of this notebook
# should be rotated.
import os  # imported locally because this cell runs before the main import cell

azure_textanalytics_key = os.environ.get('AZURE_TEXTANALYTICS_KEY', '')
azure_textanalytics_endpoint = os.environ.get(
    'AZURE_TEXTANALYTICS_ENDPOINT',
    'https://westeurope.api.cognitive.microsoft.com/',
)
# Normalise the trailing slash so the REST URL is well-formed whether or not the
# configured endpoint ends with '/'.
azure_textanalytics_url = azure_textanalytics_endpoint.rstrip('/') + '/text/analytics/v3.0'

In [3]:
import warnings
warnings.filterwarnings("ignore")
import os
import requests
import json
import time
import uuid
import glob
from io import BytesIO
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
%matplotlib inline

In [4]:
from azure.ai.textanalytics import TextAnalyticsClient         # for text analytics
from azure.core.credentials import AzureKeyCredential          # for text analytics

API docs: https://westeurope.dev.cognitive.microsoft.com/docs/services/TextAnalytics-v3-0/operations/Languages

In [5]:
# Authenticated Text Analytics client used by the exploratory cells below.
client = TextAnalyticsClient(
    endpoint=azure_textanalytics_endpoint,
    credential=AzureKeyCredential(azure_textanalytics_key),
)

## key_phrases

In [None]:
# Sample sentences for the key-phrase demo.
# Single-sentence alternative:
# documents = ["My cat might need to see a veterinarian."]
documents = [
    "I had a wonderful trip to Seattle last week.",
    "This was a waste of my time. The speaker put me to sleep.",
    "I don't feel that my skills and capacities are taking into account.",
]

In [None]:
# Extract key phrases for the sample documents, then show the raw attributes
# of every per-document result.
keyphrases_response = client.extract_key_phrases(documents)
[vars(result) for result in keyphrases_response]

In [None]:
# Print every per-document result together with its position in the batch.
# `doc` stays bound to the last result after the loop; the next cell reads it.
for idx, doc in enumerate(keyphrases_response):
    print(idx, doc)

In [None]:
# `doc` is the last result left over from the loop above; its key_phrases attribute is a list
doc.key_phrases

## sentiment

In [None]:
# Sample sentences for the sentiment demo.
# Alternative two-document sample (first sentence is Dutch, second is English):
# documents = ["Dit is een heel mooie show.",
#              "This was a waste of my time. The speaker put me to sleep."]
documents = [
    "I had a wonderful trip to Seattle last week.",
    "This was a waste of my time. The speaker put me to sleep.",
    "I don't feel that my skills and capacities are taking into account.",
]

In [None]:
# Run sentiment analysis on the sample documents, then show the raw attributes
# of every per-document result.
sentiment_response = client.analyze_sentiment(documents)
[vars(result) for result in sentiment_response]

In [None]:
# Print every per-document sentiment result together with its position in the batch.
# `doc` stays bound to the last result after the loop; the following cells read it.
for idx, doc in enumerate(sentiment_response):
    print(idx, doc)

In [None]:
# Overall sentiment of the last document (`doc` is left over from the print loop)
doc.sentiment

In [None]:
# First sentence-level entry of the last document (`doc` is left over from the print loop)
doc.sentences[0]

## named_entities

In [None]:
# Sample sentences for the named-entity demo.
documents = [
    "I had a wonderful trip to Seattle last week.",
    "This was a waste of my time. The speaker put me to sleep.",
    "I don't feel that my skills and capacities are taking into account.",
]

In [None]:
# Recognize named entities in the sample documents, then show the raw attributes
# of every per-document result.
response = client.recognize_entities(documents)
[vars(result) for result in response]

save to *.csv

## load data

This process has to be run once for each of the following 3 columns (i.e. repeat it 3x):
 1. Any_Additional_Suggestion_To_Improve
 1. What_Felt_Best_During_This_Last_Month  
 1. What_Should_We_Do_To_Make_You_Feel_Better_And_Increase_Your_Overall_Satisfaction
 
For each run, don't forget to remove null values, delete the old csv files and rename the final datasets.

In [6]:
# Load the hackathon CSV from the bronze data folder; the text columns analysed below come from it.
synthetic_df = pd.read_csv("../../data/bronze/sdworxHRhackathon2021.csv")

In [10]:
# Transpose the first two rows so that every column name is visible next to a sample value.
synthetic_df.head(2).T

Unnamed: 0,0,1
email,Lorem.ipsum@congueelit.ca,elit.elit.fermentum@Crasinterdum.ca
full_name,Harrison,Clinton
how_was_your_last_month_in_your_assignment,4,4
How_Was_Your_Last_Month_Within_Your_Department,4,4
How_Was_Your_Last_Month_With_Us,4,4
What_Felt_Best_During_This_Last_Month,"collaborating closely with Damien, closing th...",being promoted
What_Should_We_Do_To_Make_You_Feel_Better_And_Increase_Your_Overall_Satisfaction,"keep the good vibes, the trust and open commun...",maintain the good collaboration
Any_Additional_Suggestion_To_Improve,more sync between the different departments,keep up the good energy
any_additional_comments,,
I_Would_Like_To_Get_Called_By,,


In [35]:
# Select the free-text column to analyse (change the column name here for each of the 3 runs).
# Missing answers are replaced by the literal string 'none'; taking .values gives the new
# frame a fresh 0..n-1 index.
df = pd.DataFrame({
    'col1': synthetic_df["What_Should_We_Do_To_Make_You_Feel_Better_And_Increase_Your_Overall_Satisfaction"]
            .fillna('none')
            .values
})
df

Unnamed: 0,col1
0,"keep the good vibes, the trust and open commun..."
1,maintain the good collaboration
2,"more consideration for the employes, making th..."
3,I don't feel that my skills and capacities are...
4,less micro management
...,...
95,none
96,none
97,none
98,none


#### generating entities

In [36]:
# Named-entity recognition over df[column], sent to the service in batches of `step` rows.
# Produces one output row per detected entity and writes them to entities_<start>_<stop>.csv.
import os

import pandas as pd
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient

# Credentials are read from the environment so that no secret lives in the notebook;
# a missing AZURE_TEXTANALYTICS_KEY raises KeyError here, before any request is made.
azure_textanalytics_key = os.environ['AZURE_TEXTANALYTICS_KEY']
azure_textanalytics_endpoint = os.environ.get('AZURE_TEXTANALYTICS_ENDPOINT',
                                              'https://westeurope.api.cognitive.microsoft.com/')
client = TextAnalyticsClient(azure_textanalytics_endpoint, AzureKeyCredential(azure_textanalytics_key))

temp   = []     # one dict per detected entity
start  = 0      # index id start (ex: 0)
stop   = 100    # index id stop  (ex: 5000)
step   = 1      # max 5 (ex: 5)
column = "col1" # from dataframe
# https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/concepts/data-limits

# The upper bound is clamped to the frame length so that no empty batch is built
# when `stop` is larger than the number of rows.
for i in range(start, min(stop, len(df)), step): # 0-5000, 5000-10000, etc... (~10min)
    documents = df[column].iloc[i:i+step].tolist()
    response = client.recognize_entities(documents)
    print(i, end='\r') # counter
    for idx, doc in enumerate(response):
        # idy is the position of the entity within its document
        for idy, entity in enumerate(doc.entities):
            temp.append({
                "document": str(documents[idx]),
                "entities": str(entity),
                "entity_text": str(entity.text),
                "entity_category": str(entity.category),
                "entity_subcategory": str(entity.subcategory),
                "entity_confidence_score": str(entity.confidence_score),
            })

# Build the frame once from the collected records instead of growing small frames
# cell by cell; the explicit column list keeps the CSV header stable even when no
# entity was detected at all.
result = pd.DataFrame(temp, columns=["document", "entities", "entity_text", "entity_category",
                                     "entity_subcategory", "entity_confidence_score"])
result.to_csv("entities_"+str(start)+"_"+str(stop)+".csv", index=False)
result

99

Unnamed: 0,document,entities,entity_text,entity_category,entity_subcategory,entity_confidence_score
0,"keep the good vibes, the trust and open commun...","{'text': 'communication', 'category': 'Skill',...",communication,Skill,,0.8
1,maintain the good collaboration,"{'text': 'collaboration', 'category': 'Skill',...",collaboration,Skill,,0.8
2,"more consideration for the employes, making th...","{'text': 'management decision', 'category': 'S...",management decision,Skill,,0.8
3,less micro management,"{'text': 'micro management', 'category': 'Skil...",micro management,Skill,,0.8
4,valuate all staff the same way,"{'text': 'valuate', 'category': 'Skill', 'subc...",valuate,Skill,,0.8
5,more transparency about the financial situatio...,"{'text': 'financial', 'category': 'Skill', 'su...",financial,Skill,,0.8
6,making the employees in the heart of managemen...,"{'text': 'management', 'category': 'Skill', 's...",management,Skill,,0.8
7,more openness and transparency,"{'text': 'openness', 'category': 'Skill', 'sub...",openness,Skill,,0.8
8,more training,"{'text': 'training', 'category': 'Skill', 'sub...",training,Skill,,0.8
9,offer more opportunities for continuous learning,"{'text': 'continuous learning', 'category': 'S...",continuous learning,Skill,,0.8


In [37]:
# Merge every intermediate entities_*.csv batch file into a single parquet dataset.
import glob

# glob returns matches in arbitrary order; sorting makes the row order of the merged
# dataset reproducible between runs.
all_files = sorted(glob.glob("entities_*.csv"))
if not all_files:
    raise FileNotFoundError("no entities_*.csv files found - run the entities generation cell first")
li = [pd.read_csv(filename) for filename in all_files]
result = pd.concat(li, axis=0, ignore_index=True)
# index=False for consistency with the sentiment and key_phrases exports.
result.to_parquet("../../data/bronze/entities_What_Should_We_Do_To_Make_You_Feel_Better_And_Increase_Your_Overall_Satisfaction.parquet", index=False)
result

Unnamed: 0,document,entities,entity_text,entity_category,entity_subcategory,entity_confidence_score
0,"keep the good vibes, the trust and open commun...","{'text': 'communication', 'category': 'Skill',...",communication,Skill,,0.8
1,maintain the good collaboration,"{'text': 'collaboration', 'category': 'Skill',...",collaboration,Skill,,0.8
2,"more consideration for the employes, making th...","{'text': 'management decision', 'category': 'S...",management decision,Skill,,0.8
3,less micro management,"{'text': 'micro management', 'category': 'Skil...",micro management,Skill,,0.8
4,valuate all staff the same way,"{'text': 'valuate', 'category': 'Skill', 'subc...",valuate,Skill,,0.8
5,more transparency about the financial situatio...,"{'text': 'financial', 'category': 'Skill', 'su...",financial,Skill,,0.8
6,making the employees in the heart of managemen...,"{'text': 'management', 'category': 'Skill', 's...",management,Skill,,0.8
7,more openness and transparency,"{'text': 'openness', 'category': 'Skill', 'sub...",openness,Skill,,0.8
8,more training,"{'text': 'training', 'category': 'Skill', 'sub...",training,Skill,,0.8
9,offer more opportunities for continuous learning,"{'text': 'continuous learning', 'category': 'S...",continuous learning,Skill,,0.8


#### generating sentiment

In [38]:
# Sentiment analysis over df[column], sent to the service in batches of `step` rows.
# Produces one output row per document and writes them to sentiment_<start>_<stop>.csv.
import os

import pandas as pd
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient

# Credentials are read from the environment so that no secret lives in the notebook;
# a missing AZURE_TEXTANALYTICS_KEY raises KeyError here, before any request is made.
azure_textanalytics_key = os.environ['AZURE_TEXTANALYTICS_KEY']
azure_textanalytics_endpoint = os.environ.get('AZURE_TEXTANALYTICS_ENDPOINT',
                                              'https://westeurope.api.cognitive.microsoft.com/')
client = TextAnalyticsClient(azure_textanalytics_endpoint, AzureKeyCredential(azure_textanalytics_key))

temp   = []     # one dict per analysed document
start  = 0      # index id start (ex: 0)
stop   = 100    # index id stop  (ex: 5000)
step   = 1      # max 5 (ex: 5)
column = "col1" # from dataframe
# https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/concepts/data-limits

# The upper bound is clamped to the frame length so that no empty batch is built
# when `stop` is larger than the number of rows.
for i in range(start, min(stop, len(df)), step): # 0-5000, 5000-10000, etc... (~10min)
    documents = df[column].iloc[i:i+step].tolist()

    sentiment_response = client.analyze_sentiment(documents)
    print(i, end='\r') # second counter
    for idx, doc in enumerate(sentiment_response):
        # Every document of the batch is recorded inside the loop, so nothing is lost
        # when step > 1, and no index variable from another cell is needed.
        temp.append({
            "document": str(documents[idx]),
            "sentiment": str(doc.sentiment),
        })

# Build the frame once from the collected records; the explicit column list keeps the
# CSV header stable even when no document was processed.
result = pd.DataFrame(temp, columns=["document", "sentiment"])
result.to_csv("sentiment_"+str(start)+"_"+str(stop)+".csv", index=False)
result

99

Unnamed: 0,document,sentiment
0,"keep the good vibes, the trust and open commun...",positive
1,maintain the good collaboration,positive
2,"more consideration for the employes, making th...",neutral
3,I don't feel that my skills and capacities are...,negative
4,less micro management,neutral
...,...,...
95,none,neutral
96,none,neutral
97,none,neutral
98,none,neutral


In [39]:
# Merge every intermediate sentiment_*.csv batch file into a single parquet dataset.
import glob

# glob returns matches in arbitrary order; sorting makes the row order of the merged
# dataset reproducible between runs.
all_files = sorted(glob.glob("sentiment_*.csv"))
if not all_files:
    raise FileNotFoundError("no sentiment_*.csv files found - run the sentiment generation cell first")
li = [pd.read_csv(filename) for filename in all_files]
result = pd.concat(li, axis=0, ignore_index=True)
result.to_parquet("../../data/bronze/sentiment_What_Should_We_Do_To_Make_You_Feel_Better_And_Increase_Your_Overall_Satisfaction.parquet", index=False)
result

Unnamed: 0,document,sentiment
0,"keep the good vibes, the trust and open commun...",positive
1,maintain the good collaboration,positive
2,"more consideration for the employes, making th...",neutral
3,I don't feel that my skills and capacities are...,negative
4,less micro management,neutral
...,...,...
95,none,neutral
96,none,neutral
97,none,neutral
98,none,neutral


#### generating keyphrases

In [40]:
# Key-phrase extraction over df[column], sent to the service in batches of `step` rows.
# Produces one output row per document and writes them to key_phrases_<start>_<stop>.csv.
import os

import pandas as pd
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient

# Credentials are read from the environment so that no secret lives in the notebook;
# a missing AZURE_TEXTANALYTICS_KEY raises KeyError here, before any request is made.
azure_textanalytics_key = os.environ['AZURE_TEXTANALYTICS_KEY']
azure_textanalytics_endpoint = os.environ.get('AZURE_TEXTANALYTICS_ENDPOINT',
                                              'https://westeurope.api.cognitive.microsoft.com/')
client = TextAnalyticsClient(azure_textanalytics_endpoint, AzureKeyCredential(azure_textanalytics_key))

temp   = []     # one dict per analysed document
start  = 0      # index id start (ex: 0)
stop   = 100    # index id stop  (ex: 5000)
step   = 1      # max 5 (ex: 5)
column = "col1" # from dataframe
# https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/concepts/data-limits

# The upper bound is clamped to the frame length so that no empty batch is built
# when `stop` is larger than the number of rows.
for i in range(start, min(stop, len(df)), step): # 0-5000, 5000-10000, etc... (~10min)
    documents = df[column].iloc[i:i+step].tolist()

    keyphrases_response = client.extract_key_phrases(documents)
    print(i, end='\r') # second counter
    for idx, doc in enumerate(keyphrases_response):
        # Every document of the batch is recorded inside the loop, so nothing is lost
        # when step > 1, and no index variable from another cell is needed.
        temp.append({
            "document": str(documents[idx]),
            "key_phrases": str(doc.key_phrases),
        })

# Build the frame once from the collected records; the explicit column list keeps the
# CSV header stable even when no document was processed.
result = pd.DataFrame(temp, columns=["document", "key_phrases"])
result.to_csv("key_phrases_"+str(start)+"_"+str(stop)+".csv", index=False)
result

99

Unnamed: 0,document,key_phrases
0,"keep the good vibes, the trust and open commun...","['trust', 'good vibes', 'open communication']"
1,maintain the good collaboration,['good collaboration']
2,"more consideration for the employes, making th...","['employes', 'partners', 'management decision'..."
3,I don't feel that my skills and capacities are...,"['skills', 'capacities', 'accounts']"
4,less micro management,['micro management']
...,...,...
95,none,[]
96,none,[]
97,none,[]
98,none,[]


In [None]:
# Key phrases of the last document processed by the generation loop (`doc` is left over from it)
doc.key_phrases

In [41]:
# Merge every intermediate key_phrases_*.csv batch file into a single parquet dataset.
import glob

# glob returns matches in arbitrary order; sorting makes the row order of the merged
# dataset reproducible between runs.
all_files = sorted(glob.glob("key_phrases_*.csv"))
if not all_files:
    raise FileNotFoundError("no key_phrases_*.csv files found - run the keyphrases generation cell first")
li = [pd.read_csv(filename) for filename in all_files]
result = pd.concat(li, axis=0, ignore_index=True)
result.to_parquet("../../data/bronze/key_phrases_What_Should_We_Do_To_Make_You_Feel_Better_And_Increase_Your_Overall_Satisfaction.parquet", index=False)
result

Unnamed: 0,document,key_phrases
0,"keep the good vibes, the trust and open commun...","['trust', 'good vibes', 'open communication']"
1,maintain the good collaboration,['good collaboration']
2,"more consideration for the employes, making th...","['employes', 'partners', 'management decision'..."
3,I don't feel that my skills and capacities are...,"['skills', 'capacities', 'accounts']"
4,less micro management,['micro management']
...,...,...
95,none,[]
96,none,[]
97,none,[]
98,none,[]


In [42]:
# All required files have been saved by now; list the bronze data folder to check them.
!ls -al ../../data/bronze/

total 76
drwxr-xr-x 3 ubuntu root   4096 Feb  6 18:49 .
drwxr-xr-x 9 ubuntu root   4096 Feb  5 15:14 ..
drwxrwxr-x 2 ubuntu ubuntu 4096 Feb  5 23:37 .ipynb_checkpoints
-rw-rw-r-- 1 ubuntu ubuntu 6571 Feb  6 18:38 entities_Any_Additional_Suggestion_To_Improve.parquet
-rw-rw-r-- 1 ubuntu ubuntu 6744 Feb  6 18:44 entities_What_Felt_Best_During_This_Last_Month.parquet
-rw-rw-r-- 1 ubuntu ubuntu 7667 Feb  6 18:47 entities_What_Should_We_Do_To_Make_You_Feel_Better_And_Increase_Your_Overall_Satisfaction.parquet
-rw-rw-r-- 1 ubuntu ubuntu 3172 Feb  6 18:41 key_phrases_Any_Additional_Suggestion_To_Improve.parquet
-rw-rw-r-- 1 ubuntu ubuntu 3517 Feb  6 18:45 key_phrases_What_Felt_Best_During_This_Last_Month.parquet
-rw-rw-r-- 1 ubuntu ubuntu 5111 Feb  6 18:48 key_phrases_What_Should_We_Do_To_Make_You_Feel_Better_And_Increase_Your_Overall_Satisfaction.parquet
-rw-rw-r-- 1 ubuntu ubuntu 9189 Feb  6 18:19 sdworxHRhackathon2021.csv
-rw-rw-r-- 1 ubuntu ubuntu 2821 Feb  6 18:41 sentiment_An