# Drone Users Workshop

Clean and play around with transcripts . Author: A.Pilko <A.Pilko@soton.ac.uk>

In [1]:
# Install deps
!pip3 install nltk pandas stopwordsiso matplotlib gensim pyLDAvis python-Levenshtein
import nltk
import re
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as mpl
import gensim
import pyLDAvis
import stopwordsiso as swiso
from copy import deepcopy

%matplotlib notebook

pd.set_option('display.max_colwidth', None)

# Why is this not default..
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Set a date otherwise conversion to datetime dtype reverts to current date
date_str = '2021-08-03 '



## Ingest

In [18]:
chat_token_split_regex = re.compile(r'\[.*?\].*?\s*?.\d{2}:\d{2}(?::\d{2})?')
trans_token_split_regex = re.compile(r'\d{2}:\d{2}:\d{2}')

msg_regex = re.compile(r'(?<=]).*?(?=\s+.\d{2}:\d{2}(?::\d{2})?)')
ts_regex = re.compile(r'\s*?\d{1,2}:\d{2}(:\d{2})?')
author_regex = re.compile(r'\[.*?\]')

### Chat

In [3]:
with open('cleaned_chat.txt', 'r') as f:
    raw_chat = f.read()

Tokenise chat messages by regex and extract author and time

In [4]:
# Get individual chat msgs
chat_tokens = chat_token_split_regex.findall(raw_chat)
chat_tokens = [m.strip() for m in chat_tokens if m and m.strip()] # Filter nulls and strip control chars
print(len(chat_tokens), ' chat messages ')
chat_tokens[-10:] # Print first 10 for sanity check

56  chat messages 


['[Drone Service Provider 5] It will allow us to get some commercial contracts in the designated airspace area.\n\t10:29',
 '[Drone Industry Body/Association 1] …..agreed 100% but the operators need to be part of the procedure design.\n\t10:44',
 '[Drone R&D - Academic 9] Watch your politics here Jim!  If "class Lima" gets perceived as a back door to airspace class escalation to suit the drone community, there goes your buy-in\n\t10:46',
 '[Drone Service Provider 5] How about a dress-rehearsal in a country with more permissive rule sets and when we have operated together there, Bring it back as a mitigated operation.\n\t10:46',
 '[Commercial Drone Operator 1] is there not an in between solution where strategic deconfliction is mandatory for UAS operators and not mandatory for GA, but that they can at least see where UAS operations are planned?\n\t10:46',
 '[Commercial Drone Operator 32] I ask the UTM question all the time!\n\t10:47',
 '[Drone Industry Body/Association 1] The poll resul

In [5]:
chat_authors = [author_regex.match(m).group(0).strip('[]') for m in chat_tokens]

assert len(chat_authors) == len(chat_tokens) # or something has gone very wrong...
print(len(set(chat_authors)), ' unique chat authors ')
chat_authors[:10] # SC

19  unique chat authors 


['Drone Service Provider 13',
 'EC Equipment Manufacturer',
 'Drone R&D - Commercial 1',
 'Commercial Drone Operator 30',
 'Drone Service Provider 5',
 'EC Equipment Manufacturer',
 'German Moreno',
 'Drone Service Provider 5',
 'Government Funding Agency',
 'EC Equipment Manufacturer']

In [6]:
chat_timestamps = [date_str + ts_regex.search(m).group(0).strip() for m in chat_tokens]

assert len(chat_timestamps) == len(chat_tokens)
chat_timestamps[:10]

['2021-08-03 09:31',
 '2021-08-03 09:31',
 '2021-08-03 09:31',
 '2021-08-03 09:32',
 '2021-08-03 09:34',
 '2021-08-03 09:38',
 '2021-08-03 09:38',
 '2021-08-03 09:38',
 '2021-08-03 09:38',
 '2021-08-03 09:38']

In [7]:
chat_msgs = [msg_regex.search(m).group(0).strip() for m in chat_tokens]
assert len(chat_msgs) == len(chat_tokens)
chat_msgs[:10]

['This sounds like it is on the development path of a UTM company. Could the UTM company not manage the TDA and allow access to multiple drones as part of their validation path?',
 'I don’t honestly see a single VHF position reporting freq is viable - unless aircraft have 2 radios they are unlikely to be on the frequency as working other units.  It’s likely to be beyond the ability of some GA pilots to take in the information as it will task saturate them.  This is already being tried by the military for low flying but few GA pilots use it - for the reasons above.',
 'The collision avoidance DETECTION is strongly biased to electronic conspicuity, is there no room for Vision and Audio based systems, we have a DTrig grant to develop exactly that, Dennis Motion Robotics',
 'Position reports are perfectly fine when flying over places such as Africa with lack of communications to listen to with ATC. Around the UK, with GA aircraft sometimes only having 1 radio, they will not give up flight 

Pile in all parsed out chat data into dataframe

In [8]:
chat_df = pd.DataFrame({'author': chat_authors, 'time': chat_timestamps, 'message': chat_msgs})
chat_df['author'] = chat_df['author'].astype('string')
chat_df['time'] = pd.to_datetime(chat_df['time'])
chat_df['message'] = chat_df['message'].astype('string')
print(chat_df.dtypes)
chat_df.head()

author             string
time       datetime64[ns]
message            string
dtype: object


Unnamed: 0,author,time,message
0,Drone Service Provider 13,2021-08-03 09:31:00,This sounds like it is on the development path of a UTM company. Could the UTM company not manage the TDA and allow access to multiple drones as part of their validation path?
1,EC Equipment Manufacturer,2021-08-03 09:31:00,I don’t honestly see a single VHF position reporting freq is viable - unless aircraft have 2 radios they are unlikely to be on the frequency as working other units. It’s likely to be beyond the ability of some GA pilots to take in the information as it will task saturate them. This is already being tried by the military for low flying but few GA pilots use it - for the reasons above.
2,Drone R&D - Commercial 1,2021-08-03 09:31:00,"The collision avoidance DETECTION is strongly biased to electronic conspicuity, is there no room for Vision and Audio based systems, we have a DTrig grant to develop exactly that, Dennis Motion Robotics"
3,Commercial Drone Operator 30,2021-08-03 09:32:00,"Position reports are perfectly fine when flying over places such as Africa with lack of communications to listen to with ATC. Around the UK, with GA aircraft sometimes only having 1 radio, they will not give up flight information with a basic / traffic service to listen out to drone position reports."
4,Drone Service Provider 5,2021-08-03 09:34:00,"EC just one part of the CA picture, looks like a lot of reliance on EC for Class L, does not cope with interlopers. Onboard self-generated CA is needed. EO sensors. But I Like the idea of Class L. My other concern is vertical separation. You mentioned altitude but flying at set clearance heights are better. The problem of quadrants is that it pushes drones up into icing and weather, when flying low you avoid this to a larger extend. Separating in the vertical is a poor solution, deconfliction in azimuth is better."


### Presentation Transcript

In [40]:
with open('cleaned_transcript.txt', 'r') as f:
    raw_trans = f.read()

In [41]:
# Get timestamped blocks in transcript
ts_tokens = trans_token_split_regex.split(raw_trans)
ts_tokens = [m.strip() for m in ts_tokens if m and m.strip()] # Filter nulls and strip control chars
print(len(ts_tokens), ' transcript timeblocks ')
_ = [print(t, end='\n--------------------------------------------------------------------\n') for t in ts_tokens] # Print first 10 for sanity check

323  transcript timeblocks 
[Facilitator] My name is [Facilitator] and I’m going to be facilitating the discussion today in a moment, we will be hearing from my colleagues who will lead us through a presentation explaining the background and details of this project, but I will just run through a little bit of housekeeping up top to begin with, it would be really helpful for us if everybody could please make sure that the name being displayed on the zoom is accurate to them, so you can do that by hovering over the panel video that shows your screen and when you do that, you will see in the top right corner little blue box, with three dots pops up if you click on that you will then have the option at the bottom of the pop up menu to select rename and then you can type in your name and ideally any organization that you're representing that will just be really useful for us to just contextualize who we're speaking to, especially if nobody's showing videos and things it just makes things a 

In [42]:
3# ts_timestamps = [ts_regex.search(m).group(0).strip() for m in ts_tokens]
ts_timestamps = [date_str + t.strip() for t in trans_token_split_regex.findall(raw_trans)][:-1]
for t in ts_timestamps:
    print(t)
#     if ts_timestamps.count(t) > 1:
#         print(t)

print(len(ts_timestamps))
# assert len(ts_timestamps) == len(ts_tokens)
ts_timestamps[:10]

3

2021-08-03 00:08:48
2021-08-03 00:09:56
2021-08-03 00:10:04
2021-08-03 00:10:39
2021-08-03 00:11:36
2021-08-03 00:11:55
2021-08-03 00:12:10
2021-08-03 00:12:30
2021-08-03 00:12:48
2021-08-03 00:13:17
2021-08-03 00:13:47
2021-08-03 00:13:59
2021-08-03 00:14:07
2021-08-03 00:14:17
2021-08-03 00:14:26
2021-08-03 00:14:39
2021-08-03 00:15:12
2021-08-03 00:15:19
2021-08-03 00:15:29
2021-08-03 00:15:45
2021-08-03 00:16:19
2021-08-03 00:16:28
2021-08-03 00:16:48
2021-08-03 00:17:36
2021-08-03 00:17:57
2021-08-03 00:18:15
2021-08-03 00:18:25
2021-08-03 00:18:34
2021-08-03 00:18:44
2021-08-03 00:19:05
2021-08-03 00:19:29
2021-08-03 00:19:39
2021-08-03 00:20:08
2021-08-03 00:20:18
2021-08-03 00:21:03
2021-08-03 00:21:39
2021-08-03 00:21:49
2021-08-03 00:21:58
2021-08-03 00:22:08
2021-08-03 00:22:42
2021-08-03 00:23:13
2021-08-03 00:23:26
2021-08-03 00:23:35
2021-08-03 00:24:12
2021-08-03 00:24:21
2021-08-03 00:24:56
2021-08-03 00:25:03
2021-08-03 00:25:11
2021-08-03 00:25:15
2021-08-03 00:25:25


['2021-08-03 00:08:48',
 '2021-08-03 00:09:56',
 '2021-08-03 00:10:04',
 '2021-08-03 00:10:39',
 '2021-08-03 00:11:36',
 '2021-08-03 00:11:55',
 '2021-08-03 00:12:10',
 '2021-08-03 00:12:30',
 '2021-08-03 00:12:48',
 '2021-08-03 00:13:17']

In [43]:
# Here not all transcribed messages have an author attached as the timestamps are thrown in the middle of text.
# This presumes the previously tagged author is still talking

ts_authors = []
for m in ts_tokens:
    ssm = author_regex.match(m)
    if ssm:
        ts_authors.append(ssm.group(0).strip('[]'))
    else:
        ts_authors.append(ts_authors[-1])

assert len(ts_authors) == len(ts_tokens)
print(len(set(ts_authors)), ' unique transcript authors ')
set(ts_authors) # SC

12  unique transcript authors 


{'Commercial Drone Operator 1',
 'Drone Industry Body/Association 1',
 'Drone R&D - Academic 8',
 'Drone R&D - Commercial 6',
 'Drone Service Provider 5',
 'Drone Service Provider 8',
 'EC Equipment Manufacturer',
 'Facilitator',
 'Government Funding Agency',
 'Prof. Jim Scanlan',
 'Prof. Tom Cherrett',
 'Regulatory Body 2'}

In [44]:
ts_msgs = [author_regex.sub('', m) for m in ts_tokens]

assert len(ts_msgs) == len(ts_tokens)
ts_msgs[:10]

[" My name is  and I’m going to be facilitating the discussion today in a moment, we will be hearing from my colleagues who will lead us through a presentation explaining the background and details of this project, but I will just run through a little bit of housekeeping up top to begin with, it would be really helpful for us if everybody could please make sure that the name being displayed on the zoom is accurate to them, so you can do that by hovering over the panel video that shows your screen and when you do that, you will see in the top right corner little blue box, with three dots pops up if you click on that you will then have the option at the bottom of the pop up menu to select rename and then you can type in your name and ideally any organization that you're representing that will just be really useful for us to just contextualize who we're speaking to, especially if nobody's showing videos and things it just makes things a bit more personable for us, so if you hover over you

In [59]:
print(len(ts_authors))
print(len(ts_timestamps))
print(len(ts_msgs))
trans_df = pd.DataFrame({'author': ts_authors, 'time': ts_timestamps, 'message': ts_msgs})
trans_df['author'] = trans_df['author'].astype('string')
trans_df['time'] = pd.to_datetime(trans_df['time']) + pd.Timedelta(hours=9)
trans_df['message'] = trans_df['message'].astype('string')
print(trans_df.dtypes)
trans_df.head()

323
323
323
author             string
time       datetime64[ns]
message            string
dtype: object


Unnamed: 0,author,time,message
0,Facilitator,2021-08-03 09:08:48,"My name is and I’m going to be facilitating the discussion today in a moment, we will be hearing from my colleagues who will lead us through a presentation explaining the background and details of this project, but I will just run through a little bit of housekeeping up top to begin with, it would be really helpful for us if everybody could please make sure that the name being displayed on the zoom is accurate to them, so you can do that by hovering over the panel video that shows your screen and when you do that, you will see in the top right corner little blue box, with three dots pops up if you click on that you will then have the option at the bottom of the pop up menu to select rename and then you can type in your name and ideally any organization that you're representing that will just be really useful for us to just contextualize who we're speaking to, especially if nobody's showing videos and things it just makes things a bit more personable for us, so if you hover over your video select the three dots in the top right you'll get renamed as an option."
1,Facilitator,2021-08-03 09:09:56,"So, this workshop is coming into parts of a research project and the purpose of today is to record the views for you as participants."
2,Facilitator,2021-08-03 09:10:04,"Those who signed up via Eventbrite will have received a sheet information and details of what you're consenting to by participating so to reiterate anyone is free to exit this workshop at any time, the workshop is being recorded for internal use only and all data will be anonymised it may be used as part of the process outputs but it's going to go through an anonymisation process, so you don't need to worry about sharing any details that are going to be shared with the wider public so by remaining in the workshop, from this point you are agreeing to these terms, so if anyone can't consent these terms of being recorded, then it would be best to exit now."
3,Facilitator,2021-08-03 09:10:39,"With that being said, I’ll just outline some technical details, we would like to give everyone a chance to be heard so to make that possible within the limitations of zoom we would invite you to please use the chat which is accessible via the icon bar at the bottom of your screen and the speech bubble icon So if you scroll to the bottom of your screen. You will see a speech bubble icon underneath and you click that it will pop open the chat. If you are on an iPad or tablets or another device, you may need to go to the top of your screen. Where you will then see a similar set of icons and again if you click the three dots, then you will see a drop-down menu, with the option to select. So, on desktops the bottom of your screen there's the chat icon on other devices, you may need to go to the top of your screen and click the three dots to access the chat."
4,Facilitator,2021-08-03 09:11:36,"If you do need to put anything into the chat you also have the option of private messaging me or other participants in the group, if you choose to private message someone directly, I will only be visible to me or the person receiving the message so you're welcome to use that at any time, if you want to point anything out to me you're welcome to do that."


In [60]:
full_df = pd.concat((chat_df, trans_df)).sort_values('time')
full_df.to_csv('interlaced_transcript.csv')
print(full_df.dtypes)
full_df.head()

author             string
time       datetime64[ns]
message            string
dtype: object


Unnamed: 0,author,time,message
0,Facilitator,2021-08-03 09:08:48,"My name is and I’m going to be facilitating the discussion today in a moment, we will be hearing from my colleagues who will lead us through a presentation explaining the background and details of this project, but I will just run through a little bit of housekeeping up top to begin with, it would be really helpful for us if everybody could please make sure that the name being displayed on the zoom is accurate to them, so you can do that by hovering over the panel video that shows your screen and when you do that, you will see in the top right corner little blue box, with three dots pops up if you click on that you will then have the option at the bottom of the pop up menu to select rename and then you can type in your name and ideally any organization that you're representing that will just be really useful for us to just contextualize who we're speaking to, especially if nobody's showing videos and things it just makes things a bit more personable for us, so if you hover over your video select the three dots in the top right you'll get renamed as an option."
1,Facilitator,2021-08-03 09:09:56,"So, this workshop is coming into parts of a research project and the purpose of today is to record the views for you as participants."
2,Facilitator,2021-08-03 09:10:04,"Those who signed up via Eventbrite will have received a sheet information and details of what you're consenting to by participating so to reiterate anyone is free to exit this workshop at any time, the workshop is being recorded for internal use only and all data will be anonymised it may be used as part of the process outputs but it's going to go through an anonymisation process, so you don't need to worry about sharing any details that are going to be shared with the wider public so by remaining in the workshop, from this point you are agreeing to these terms, so if anyone can't consent these terms of being recorded, then it would be best to exit now."
3,Facilitator,2021-08-03 09:10:39,"With that being said, I’ll just outline some technical details, we would like to give everyone a chance to be heard so to make that possible within the limitations of zoom we would invite you to please use the chat which is accessible via the icon bar at the bottom of your screen and the speech bubble icon So if you scroll to the bottom of your screen. You will see a speech bubble icon underneath and you click that it will pop open the chat. If you are on an iPad or tablets or another device, you may need to go to the top of your screen. Where you will then see a similar set of icons and again if you click the three dots, then you will see a drop-down menu, with the option to select. So, on desktops the bottom of your screen there's the chat icon on other devices, you may need to go to the top of your screen and click the three dots to access the chat."
4,Facilitator,2021-08-03 09:11:36,"If you do need to put anything into the chat you also have the option of private messaging me or other participants in the group, if you choose to private message someone directly, I will only be visible to me or the person receiving the message so you're welcome to use that at any time, if you want to point anything out to me you're welcome to do that."


At this point, the raw text is tagged with relevant metadata and in a form that is easy to work with. It is, however, still verbatim and contains shedloads of extraneous words etc. This is cleaned next.

## Sentiment Analysis

Strip punctuation, extra whitespace, normalise case. Homogenous text is then tokenised into words, stopwords (eg I, he, she, was, the) are removed then words are lemmatised (taken to root form, eg "going" and "gone" stem from "go")

In [61]:
custom_stopwords = {
    'chat', 'please', 'jim', 'project', 'slide', 'shared', 'janet', 'morning', 'ooo', 'yeah', 'please', 'share', 'facilitator',
    'tom', 'cherrett', 'pdra', 'edinburgh', 'node', 'ta', 'unmute', 'mute', 'hear', 'tapping', 'time', 'bar', 'icon', 'microphone',
    'device', 'icon', 'bar', 'visible', 'screen', 'hover', 'mouse', 'selection', 'icon', 'mobile', 'device', 'browser', 'version',
    'zoom', 'select', 'dot', 'aliaksei', 'pilko', 'nick', 'jelev', 'german', 'moreno', 'dickinson', 'matt', 'grote', 'phd', 'researcher',
    'dr', 'bournemouth', 'management', 'professor', 'scanlan', 'waiting', 'remaining', 'people', 'clock', 'leave', 'couple', 'minute',
    'start', 'proper', 'introduction', 'moment', 'workshop', 'arriving', 'note', 'control', 'participant', 'simultaneously', 'contribute',
    'depending', 'reiterate', 'thumb', 'team', 'attending', 'finding', 'straightforward', 'frustration', 'hearing', 'colleague', 'presentation',
    'explaining', 'background', 'quick', 'housekeeping', 'purpose', 'record', 'view', 'signed', 'eventbrite', 'received', 'sheet', 'consenting',
    'participating', 'exit', 'commitment', 'recorded', 'internal', 'data', 'anonymised', 'output', 'agreeing', 'consent', 'outline', 'chance',
    'heard', 'invite', 'accessible', 'mentioned', 'speech', 'labelled', 'type', 'response', 'option', 'private', 'messaging',
    'submission', 'blue', 'list', 'send', 'message', 'rest', 'comment', 'question', 'return', 'leaving', 'initial', 'clarified', 'ensure',
    'covered', 'simply', 'speak', 'clicking', 'reaction', 'smiley', 'raise', 'hand', 'picture', 'sound', 'track', 'fine', 'cover',
    'clarification', 'ado', 'introduce', 'listen', 'receive', 'user', 'online', 'lot', 'next'
}

In [62]:
punc_regex = re.compile('[^0-9A-Za-z ]')
xws_regex = re.compile('\s+', flags=re.MULTILINE)

nltk.download('punkt')
tok = nltk.tokenize.word_tokenize

# Common stop words plus some custom ones that appear commonly here but are not relevant
sws = swiso.stopwords('en') | custom_stopwords

nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()

def _clean_msg(msg):
    res = xws_regex.sub(' ', msg) # Strip extra whitespace
    res = punc_regex.sub(' ', msg) # Strip Punctuation
    res = res.lower() # Force lowercase
    tokens = [t for t in tok(res) if t not in sws and not t.isdigit()] # Tokenise into words
    lemmas = [wn.lemmatize(t) for t in tokens] #Lemmatise
    return [l for l in lemmas if l not in sws and not t.isdigit()] # recheck lemmas for stopwords and return
    
clean_df = deepcopy(full_df)
clean_df['tokens'] = clean_df['message'].apply(_clean_msg)
clean_df['message'] = [m.replace('\n', '') for m in clean_df['message']]
clean_df.head()

  xws_regex = re.compile('\s+', flags=re.MULTILINE)
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aliak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aliak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Unnamed: 0,author,time,message,tokens
0,Facilitator,2021-08-03 09:08:48,"My name is and I’m going to be facilitating the discussion today in a moment, we will be hearing from my colleagues who will lead us through a presentation explaining the background and details of this project, but I will just run through a little bit of housekeeping up top to begin with, it would be really helpful for us if everybody could please make sure that the name being displayed on the zoom is accurate to them, so you can do that by hovering over the panel video that shows your screen and when you do that, you will see in the top right corner little blue box, with three dots pops up if you click on that you will then have the option at the bottom of the pop up menu to select rename and then you can type in your name and ideally any organization that you're representing that will just be really useful for us to just contextualize who we're speaking to, especially if nobody's showing videos and things it just makes things a bit more personable for us, so if you hover over your video select the three dots in the top right you'll get renamed as an option.","[facilitating, discussion, lead, bit, helpful, displayed, accurate, hovering, panel, video, corner, box, pop, pop, menu, rename, ideally, organization, representing, contextualize, speaking, video, bit, personable, video, renamed]"
1,Facilitator,2021-08-03 09:09:56,"So, this workshop is coming into parts of a research project and the purpose of today is to record the views for you as participants.",[coming]
2,Facilitator,2021-08-03 09:10:04,"Those who signed up via Eventbrite will have received a sheet information and details of what you're consenting to by participating so to reiterate anyone is free to exit this workshop at any time, the workshop is being recorded for internal use only and all data will be anonymised it may be used as part of the process outputs but it's going to go through an anonymisation process, so you don't need to worry about sharing any details that are going to be shared with the wider public so by remaining in the workshop, from this point you are agreeing to these terms, so if anyone can't consent these terms of being recorded, then it would be best to exit now.","[process, anonymisation, process, worry, sharing, wider, public, term, term]"
3,Facilitator,2021-08-03 09:10:39,"With that being said, I’ll just outline some technical details, we would like to give everyone a chance to be heard so to make that possible within the limitations of zoom we would invite you to please use the chat which is accessible via the icon bar at the bottom of your screen and the speech bubble icon So if you scroll to the bottom of your screen. You will see a speech bubble icon underneath and you click that it will pop open the chat. If you are on an iPad or tablets or another device, you may need to go to the top of your screen. Where you will then see a similar set of icons and again if you click the three dots, then you will see a drop-down menu, with the option to select. So, on desktops the bottom of your screen there's the chat icon on other devices, you may need to go to the top of your screen and click the three dots to access the chat.","[technical, limitation, bubble, scroll, bubble, pop, ipad, tablet, set, drop, menu, desktop, access]"
4,Facilitator,2021-08-03 09:11:36,"If you do need to put anything into the chat you also have the option of private messaging me or other participants in the group, if you choose to private message someone directly, I will only be visible to me or the person receiving the message so you're welcome to use that at any time, if you want to point anything out to me you're welcome to do that.","[choose, person, receiving]"


In [63]:
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.sentiment import SentimentAnalyzer
nltk.download('vader_lexicon')

sa = SentimentIntensityAnalyzer()

clean_df['compound_sentiment'] = [sa.polarity_scores(m)['compound'] for m in clean_df['message']]
clean_df.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Aliak\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

Unnamed: 0,author,time,message,tokens,compound_sentiment
0,Facilitator,2021-08-03 09:08:48,"My name is and I’m going to be facilitating the discussion today in a moment, we will be hearing from my colleagues who will lead us through a presentation explaining the background and details of this project, but I will just run through a little bit of housekeeping up top to begin with, it would be really helpful for us if everybody could please make sure that the name being displayed on the zoom is accurate to them, so you can do that by hovering over the panel video that shows your screen and when you do that, you will see in the top right corner little blue box, with three dots pops up if you click on that you will then have the option at the bottom of the pop up menu to select rename and then you can type in your name and ideally any organization that you're representing that will just be really useful for us to just contextualize who we're speaking to, especially if nobody's showing videos and things it just makes things a bit more personable for us, so if you hover over your video select the three dots in the top right you'll get renamed as an option.","[facilitating, discussion, lead, bit, helpful, displayed, accurate, hovering, panel, video, corner, box, pop, pop, menu, rename, ideally, organization, representing, contextualize, speaking, video, bit, personable, video, renamed]",0.9739
1,Facilitator,2021-08-03 09:09:56,"So, this workshop is coming into parts of a research project and the purpose of today is to record the views for you as participants.",[coming],0.0
2,Facilitator,2021-08-03 09:10:04,"Those who signed up via Eventbrite will have received a sheet information and details of what you're consenting to by participating so to reiterate anyone is free to exit this workshop at any time, the workshop is being recorded for internal use only and all data will be anonymised it may be used as part of the process outputs but it's going to go through an anonymisation process, so you don't need to worry about sharing any details that are going to be shared with the wider public so by remaining in the workshop, from this point you are agreeing to these terms, so if anyone can't consent these terms of being recorded, then it would be best to exit now.","[process, anonymisation, process, worry, sharing, wider, public, term, term]",0.9636
3,Facilitator,2021-08-03 09:10:39,"With that being said, I’ll just outline some technical details, we would like to give everyone a chance to be heard so to make that possible within the limitations of zoom we would invite you to please use the chat which is accessible via the icon bar at the bottom of your screen and the speech bubble icon So if you scroll to the bottom of your screen. You will see a speech bubble icon underneath and you click that it will pop open the chat. If you are on an iPad or tablets or another device, you may need to go to the top of your screen. Where you will then see a similar set of icons and again if you click the three dots, then you will see a drop-down menu, with the option to select. So, on desktops the bottom of your screen there's the chat icon on other devices, you may need to go to the top of your screen and click the three dots to access the chat.","[technical, limitation, bubble, scroll, bubble, pop, ipad, tablet, set, drop, menu, desktop, access]",0.8402
4,Facilitator,2021-08-03 09:11:36,"If you do need to put anything into the chat you also have the option of private messaging me or other participants in the group, if you choose to private message someone directly, I will only be visible to me or the person receiving the message so you're welcome to use that at any time, if you want to point anything out to me you're welcome to do that.","[choose, person, receiving]",0.7818


Split transcript into what we said("internal") and comments from attendees ("external")

In [64]:
internal_authors = ['Facilitator', 'Prof. Tom Cherrett', 'Prof. Jim Scanlan', 'German Moreno', 'Aliaksei Pilko']

int_df = clean_df[clean_df['author'].isin(internal_authors)]
ext_df = clean_df[~clean_df['author'].isin(internal_authors)]
int_df.head()
ext_df.head()
int_tokens = nltk.flatten(int_df['tokens'].tolist())
ext_tokens = nltk.flatten(ext_df['tokens'].tolist())
print(len(set(int_tokens)), ' unique internal tokens')
print(len(set(ext_tokens)), ' unique external tokens')

Unnamed: 0,author,time,message,tokens,compound_sentiment
0,Facilitator,2021-08-03 09:08:48,"My name is and I’m going to be facilitating the discussion today in a moment, we will be hearing from my colleagues who will lead us through a presentation explaining the background and details of this project, but I will just run through a little bit of housekeeping up top to begin with, it would be really helpful for us if everybody could please make sure that the name being displayed on the zoom is accurate to them, so you can do that by hovering over the panel video that shows your screen and when you do that, you will see in the top right corner little blue box, with three dots pops up if you click on that you will then have the option at the bottom of the pop up menu to select rename and then you can type in your name and ideally any organization that you're representing that will just be really useful for us to just contextualize who we're speaking to, especially if nobody's showing videos and things it just makes things a bit more personable for us, so if you hover over your video select the three dots in the top right you'll get renamed as an option.","[facilitating, discussion, lead, bit, helpful, displayed, accurate, hovering, panel, video, corner, box, pop, pop, menu, rename, ideally, organization, representing, contextualize, speaking, video, bit, personable, video, renamed]",0.9739
1,Facilitator,2021-08-03 09:09:56,"So, this workshop is coming into parts of a research project and the purpose of today is to record the views for you as participants.",[coming],0.0
2,Facilitator,2021-08-03 09:10:04,"Those who signed up via Eventbrite will have received a sheet information and details of what you're consenting to by participating so to reiterate anyone is free to exit this workshop at any time, the workshop is being recorded for internal use only and all data will be anonymised it may be used as part of the process outputs but it's going to go through an anonymisation process, so you don't need to worry about sharing any details that are going to be shared with the wider public so by remaining in the workshop, from this point you are agreeing to these terms, so if anyone can't consent these terms of being recorded, then it would be best to exit now.","[process, anonymisation, process, worry, sharing, wider, public, term, term]",0.9636
3,Facilitator,2021-08-03 09:10:39,"With that being said, I’ll just outline some technical details, we would like to give everyone a chance to be heard so to make that possible within the limitations of zoom we would invite you to please use the chat which is accessible via the icon bar at the bottom of your screen and the speech bubble icon So if you scroll to the bottom of your screen. You will see a speech bubble icon underneath and you click that it will pop open the chat. If you are on an iPad or tablets or another device, you may need to go to the top of your screen. Where you will then see a similar set of icons and again if you click the three dots, then you will see a drop-down menu, with the option to select. So, on desktops the bottom of your screen there's the chat icon on other devices, you may need to go to the top of your screen and click the three dots to access the chat.","[technical, limitation, bubble, scroll, bubble, pop, ipad, tablet, set, drop, menu, desktop, access]",0.8402
4,Facilitator,2021-08-03 09:11:36,"If you do need to put anything into the chat you also have the option of private messaging me or other participants in the group, if you choose to private message someone directly, I will only be visible to me or the person receiving the message so you're welcome to use that at any time, if you want to point anything out to me you're welcome to do that.","[choose, person, receiving]",0.7818


Unnamed: 0,author,time,message,tokens,compound_sentiment
0,Drone Service Provider 13,2021-08-03 09:31:00,This sounds like it is on the development path of a UTM company. Could the UTM company not manage the TDA and allow access to multiple drones as part of their validation path?,"[development, path, utm, company, utm, company, manage, tda, access, multiple, drone, validation, path]",0.5267
1,EC Equipment Manufacturer,2021-08-03 09:31:00,I don’t honestly see a single VHF position reporting freq is viable - unless aircraft have 2 radios they are unlikely to be on the frequency as working other units. It’s likely to be beyond the ability of some GA pilots to take in the information as it will task saturate them. This is already being tried by the military for low flying but few GA pilots use it - for the reasons above.,"[honestly, single, vhf, position, reporting, freq, viable, aircraft, radio, frequency, unit, ability, pilot, task, saturate, military, flying, pilot, reason]",0.2732
2,Drone R&D - Commercial 1,2021-08-03 09:31:00,"The collision avoidance DETECTION is strongly biased to electronic conspicuity, is there no room for Vision and Audio based systems, we have a DTrig grant to develop exactly that, Dennis Motion Robotics","[collision, avoidance, detection, biased, electronic, conspicuity, vision, audio, based, dtrig, grant, develop, dennis, motion, robotics]",-0.4404
3,Commercial Drone Operator 30,2021-08-03 09:32:00,"Position reports are perfectly fine when flying over places such as Africa with lack of communications to listen to with ATC. Around the UK, with GA aircraft sometimes only having 1 radio, they will not give up flight information with a basic / traffic service to listen out to drone position reports.","[position, report, perfectly, flying, africa, lack, communication, atc, aircraft, radio, flight, basic, traffic, service, drone, position, report]",0.5719
4,Drone Service Provider 5,2021-08-03 09:34:00,"EC just one part of the CA picture, looks like a lot of reliance on EC for Class L, does not cope with interlopers. Onboard self-generated CA is needed. EO sensors. But I Like the idea of Class L. My other concern is vertical separation. You mentioned altitude but flying at set clearance heights are better. The problem of quadrants is that it pushes drones up into icing and weather, when flying low you avoid this to a larger extend. Separating in the vertical is a poor solution, deconfliction in azimuth is better.","[reliance, class, cope, interloper, onboard, generated, eo, sensor, idea, class, concern, vertical, separation, altitude, flying, set, clearance, height, quadrant, push, drone, icing, weather, flying, avoid, larger, extend, separating, vertical, poor, solution, deconfliction, azimuth]",0.5499


1015  unique internal tokens
860  unique external tokens


In [65]:
int_freqdist = nltk.FreqDist(int_tokens)
ext_freqdist = nltk.FreqDist(ext_tokens)
print('Top 10 Internal:')
int_freqdist.tabulate(12)
print('\nTop 10 External:')
ext_freqdist.tabulate(12)

Top 10 Internal:
   drone      air    space airspace    class    issue     lima     risk  traffic      utm     term aircraft 
      80       42       36       32       31       29       29       28       21       21       20       20 

Top 10 External:
   drone airspace      utm       eo      air    avoid     risk aircraft  traffic operator    space     lima 
      36       28       27       24       23       21       19       18       16       16       15       15 


In [66]:
ewma_smoothing = 0.16

int_df['cs_ewma'] = int_df['compound_sentiment'].ewm(alpha=ewma_smoothing).mean()
ext_df['cs_ewma'] = ext_df['compound_sentiment'].ewm(alpha=ewma_smoothing).mean()

int_df.head()
ext_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  int_df['cs_ewma'] = int_df['compound_sentiment'].ewm(alpha=ewma_smoothing).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ext_df['cs_ewma'] = ext_df['compound_sentiment'].ewm(alpha=ewma_smoothing).mean()


Unnamed: 0,author,time,message,tokens,compound_sentiment,cs_ewma
0,Facilitator,2021-08-03 09:08:48,"My name is and I’m going to be facilitating the discussion today in a moment, we will be hearing from my colleagues who will lead us through a presentation explaining the background and details of this project, but I will just run through a little bit of housekeeping up top to begin with, it would be really helpful for us if everybody could please make sure that the name being displayed on the zoom is accurate to them, so you can do that by hovering over the panel video that shows your screen and when you do that, you will see in the top right corner little blue box, with three dots pops up if you click on that you will then have the option at the bottom of the pop up menu to select rename and then you can type in your name and ideally any organization that you're representing that will just be really useful for us to just contextualize who we're speaking to, especially if nobody's showing videos and things it just makes things a bit more personable for us, so if you hover over your video select the three dots in the top right you'll get renamed as an option.","[facilitating, discussion, lead, bit, helpful, displayed, accurate, hovering, panel, video, corner, box, pop, pop, menu, rename, ideally, organization, representing, contextualize, speaking, video, bit, personable, video, renamed]",0.9739,0.9739
1,Facilitator,2021-08-03 09:09:56,"So, this workshop is coming into parts of a research project and the purpose of today is to record the views for you as participants.",[coming],0.0,0.444607
2,Facilitator,2021-08-03 09:10:04,"Those who signed up via Eventbrite will have received a sheet information and details of what you're consenting to by participating so to reiterate anyone is free to exit this workshop at any time, the workshop is being recorded for internal use only and all data will be anonymised it may be used as part of the process outputs but it's going to go through an anonymisation process, so you don't need to worry about sharing any details that are going to be shared with the wider public so by remaining in the workshop, from this point you are agreeing to these terms, so if anyone can't consent these terms of being recorded, then it would be best to exit now.","[process, anonymisation, process, worry, sharing, wider, public, term, term]",0.9636,0.648485
3,Facilitator,2021-08-03 09:10:39,"With that being said, I’ll just outline some technical details, we would like to give everyone a chance to be heard so to make that possible within the limitations of zoom we would invite you to please use the chat which is accessible via the icon bar at the bottom of your screen and the speech bubble icon So if you scroll to the bottom of your screen. You will see a speech bubble icon underneath and you click that it will pop open the chat. If you are on an iPad or tablets or another device, you may need to go to the top of your screen. Where you will then see a similar set of icons and again if you click the three dots, then you will see a drop-down menu, with the option to select. So, on desktops the bottom of your screen there's the chat icon on other devices, you may need to go to the top of your screen and click the three dots to access the chat.","[technical, limitation, bubble, scroll, bubble, pop, ipad, tablet, set, drop, menu, desktop, access]",0.8402,0.709574
4,Facilitator,2021-08-03 09:11:36,"If you do need to put anything into the chat you also have the option of private messaging me or other participants in the group, if you choose to private message someone directly, I will only be visible to me or the person receiving the message so you're welcome to use that at any time, if you want to point anything out to me you're welcome to do that.","[choose, person, receiving]",0.7818,0.729437


Unnamed: 0,author,time,message,tokens,compound_sentiment,cs_ewma
0,Drone Service Provider 13,2021-08-03 09:31:00,This sounds like it is on the development path of a UTM company. Could the UTM company not manage the TDA and allow access to multiple drones as part of their validation path?,"[development, path, utm, company, utm, company, manage, tda, access, multiple, drone, validation, path]",0.5267,0.5267
1,EC Equipment Manufacturer,2021-08-03 09:31:00,I don’t honestly see a single VHF position reporting freq is viable - unless aircraft have 2 radios they are unlikely to be on the frequency as working other units. It’s likely to be beyond the ability of some GA pilots to take in the information as it will task saturate them. This is already being tried by the military for low flying but few GA pilots use it - for the reasons above.,"[honestly, single, vhf, position, reporting, freq, viable, aircraft, radio, frequency, unit, ability, pilot, task, saturate, military, flying, pilot, reason]",0.2732,0.388928
2,Drone R&D - Commercial 1,2021-08-03 09:31:00,"The collision avoidance DETECTION is strongly biased to electronic conspicuity, is there no room for Vision and Audio based systems, we have a DTrig grant to develop exactly that, Dennis Motion Robotics","[collision, avoidance, detection, biased, electronic, conspicuity, vision, audio, based, dtrig, grant, develop, dennis, motion, robotics]",-0.4404,0.063139
3,Commercial Drone Operator 30,2021-08-03 09:32:00,"Position reports are perfectly fine when flying over places such as Africa with lack of communications to listen to with ATC. Around the UK, with GA aircraft sometimes only having 1 radio, they will not give up flight information with a basic / traffic service to listen out to drone position reports.","[position, report, perfectly, flying, africa, lack, communication, atc, aircraft, radio, flight, basic, traffic, service, drone, position, report]",0.5719,0.225253
4,Drone Service Provider 5,2021-08-03 09:34:00,"EC just one part of the CA picture, looks like a lot of reliance on EC for Class L, does not cope with interlopers. Onboard self-generated CA is needed. EO sensors. But I Like the idea of Class L. My other concern is vertical separation. You mentioned altitude but flying at set clearance heights are better. The problem of quadrants is that it pushes drones up into icing and weather, when flying low you avoid this to a larger extend. Separating in the vertical is a poor solution, deconfliction in azimuth is better.","[reliance, class, cope, interloper, onboard, generated, eo, sensor, idea, class, concern, vertical, separation, altitude, flying, set, clearance, height, quadrant, push, drone, icing, weather, flying, avoid, larger, extend, separating, vertical, poor, solution, deconfliction, azimuth]",0.5499,0.314535


Plot out compound sentiment polarity scores generated by NLTK's pre-trained VADER sentiment analyser model. This scores the sentiment of each message on positivity, negativity and neutrality and combines them to create a compound sentiment score $\in [1,-1]$ where +1 is most positive, -1 is most negative and 0 is neutral sentiment.

The VADER model is trained on social media data so does best at shorter sentences (like most of those in the chat) so is sufficient for a first pass.

Raw scores are expectedly noisy over time so an EWMA pass is used to smooth them out to something readable and indicate general trends

In [76]:
from matplotlib.colors import cnames
import matplotlib.dates as mdates

sentiment_fig, sax = mpl.subplots(1,1, figsize=(12,8))

int_line = sax.plot_date(int_df['time'], int_df['compound_sentiment'], label='Internal Sentiment', color='g')
ext_line = sax.plot_date(ext_df['time'], ext_df['compound_sentiment'], label='External Sentiment', color='r')

# 148 colours to choose from!
# Keep rerunning the cell until you win the intelligible plot colours lottery
rand_colour = lambda *args: np.random.choice(list(cnames.keys())) 

author_colours = {} #Cache colours for each author to prevent duplicate legend entries

for idx, (_,row) in enumerate(int_df.iterrows()):
    # Prevent OOB as we look one ahead of the current row on every iteration
    if idx < len(int_df) - 1:
        author = row['author'] # We want the author that is currently speaking in this time period
        author_start = row['time']
        author_stop = int_df.iloc[idx+1]['time'] - pd.Timedelta(seconds=15)
        label = author
        colour = rand_colour()
        if author not in author_colours:
            author_colours[author] = colour
        else:
            label = '_' #Prevent duplicate legend entries as each axvspan is its own artist
            colour = author_colours[author]
        _ = sax.axvspan(author_start, author_stop, alpha=0.2, color=colour, label=label)
#         print(f'{author} speaking between {author_start} and {author_stop}')

# Prevent these outputting objects by discarding
_ = sax.set_title('Raw Compound Sentiment')
_ = sax.legend()

sentiment_fig.autofmt_xdate()
sentiment_fig.show()

#### Repeat for EWMA values

msentiment_fig, msax = mpl.subplots(1,1, figsize=(12,8))

int_mline = msax.plot_date(int_df['time'], int_df['cs_ewma'], label='Internal Sentiment', color='g', linestyle='-', markersize=2)
ext_mline = msax.plot_date(ext_df['time'], ext_df['cs_ewma'], label='External Sentiment', color='r', linestyle='-', markersize=2)

mauthor_colours = {} #Cache colours for each author to prevent duplicate legend entries

for idx, (_,row) in enumerate(int_df.iterrows()):
    # Prevent OOB as we look one ahead of the current row on every iteration
    if idx < len(int_df) - 1:
        author = row['author'] # We want the author that is currently speaking in this time period
        author_start = row['time']
        author_stop = int_df.iloc[idx+1]['time'] - pd.Timedelta(seconds=15)
        label = author
        colour = author_colours[author]
        if author not in mauthor_colours:
            mauthor_colours[author] = colour
        else:
            label = '_' #Prevent duplicate legend entries as each axvspan is its own artist
            colour = author_colours[author]
        _ = msax.axvspan(author_start, author_stop, alpha=0.2, color=colour, label=label)
#         print(f'{author} speaking between {author_start} and {author_stop}')

# Prevent these outputting objects by discarding
_ = msax.set_title(f'EWMA Compound Sentiment with smoothing $\\alpha$={ewma_smoothing}')
_ = msax.legend()

msentiment_fig.autofmt_xdate()
msentiment_fig.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [77]:
cs_int_df = int_df.sort_values('compound_sentiment')
cs_ext_df = ext_df.sort_values('compound_sentiment')

print('Most Positive internal messages: ')
cs_int_df['message'].tail(10)
print('\n=================================\n')
print('Most Negative internal messages: ')
cs_int_df['message'].head(8)
print('\n\n=================================\n\n')
print('Most Positive external messages: ')
cs_ext_df['message'].tail(10)
print('\n=================================\n')
print('Most Negative external messages: ')
cs_ext_df['message'].head(10)

Most Positive internal messages: 


237                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          And, once again, if anyone would like to share the responses in the chat if that's easier, please feel free



Most Negative internal messages: 


91                                                                                                                                                                But it asked two questions if you fly your drone what is the ground risk and what is the air risk and the frustration to date in doing this is that's very judgmental it's very subjective, and so the specific work  is doing is doing some very detailed mapping and analysis of the whole of the UK and generating numbers and real probabilities.  Both in terms of ground risk and air risk.
57      So we've been trying to think of solutions for longer term for genuine commercial usage and we all know that a lot of worthwhile and very good work has been put into UTM philosophy, UTM thinking, a lot of international studies on the longer term solutions. But we, the exam question, we want to examine is can we do something very simply light touch a lot sooner and, in my opinion, the low hanging fruit for us drone operators is to start in l





Most Positive external messages: 


169                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 So whilst EC  is a great idea and, of course, will be mandated perhaps unmandated electro optical systems that respo



Most Negative external messages: 


192                                                                                                                                                                                                                                                                                                                                                                                                              And yes, whilst the aircraft can be giving its position, some of the military aircraft won't be doing that and we are going to be playing in the same area and same airspace with  high crossing rates and collision angles, so I think that, yes, the EC is to avoid an air prox but the electro optical or whatever method is required is needed to avoid the collision and we only have to avoid the collision by a short distance to have done the job, so I don't think we need to worry about electro optical making us give separation distances it's just you know one inch should be enough to avoid swapping 

## Topic Modelling

This groups the tokens into N topics first at random then iteratively improves the probability of each token being in each topic

In [78]:
int_tokens = list(int_df['tokens'].explode())
ext_tokens = list(ext_df['tokens'].explode())

In [79]:
import gensim
dictionary = gensim.corpora.Dictionary(clean_df['tokens'])
corpus = [dictionary.doc2bow(text) for text in clean_df['tokens']]
lda = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=50)
topics = lda.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.045*"drone" + 0.016*"air" + 0.015*"risk" + 0.015*"class"')
(1, '0.019*"air" + 0.016*"space" + 0.015*"electronic" + 0.013*"conspicuity"')
(2, '0.021*"operator" + 0.013*"traffic" + 0.012*"drone" + 0.011*"aircraft"')
(3, '0.018*"airspace" + 0.017*"drone" + 0.015*"eo" + 0.008*"aircraft"')
(4, '0.023*"utm" + 0.016*"lima" + 0.014*"airspace" + 0.013*"space"')


In [80]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
lda_display = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.display(lda_display)

  default_term_info = default_term_info.sort_values(
