In [None]:
import copy
import json
import pandas as pd
import numpy as np
import nltk

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

%matplotlib inline

# Default matplotlib font size for the figures created in this notebook.
matplotlib.rcParams['font.size'] = 22

In [None]:
# Load the conversation dataset from disk.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's default text encoding.
with open('../conversations.json', 'r', encoding='utf-8') as f:
    conversations = json.load(f)
conversations

In [None]:
# Number of top-level entries loaded from conversations.json.
len(conversations)

In [None]:
def _annotate_prompt(prompt):
    """Tokenise one prompt and return it together with its NLTK annotations.

    The returned dict has the keys 'prompt' (the input as given), 'tokens'
    (output of nltk.word_tokenize), 'text' (an nltk.Text built from the tokens)
    and 'tags' (output of nltk.pos_tag on the tokens).
    """
    tokens = nltk.word_tokenize(prompt)
    return {
        'prompt': prompt,
        'tokens': tokens,
        'text': nltk.Text(tokens),
        'tags': nltk.pos_tag(tokens),
    }


# data mirrors the nesting of `conversations`: one list per conversation,
# holding one annotation dict per prompt.
data = []
for conv in conversations:
    data_conv = [_annotate_prompt(prompt) for prompt in conv]
    data.append(data_conv)

In [None]:
from collections import Counter

# One Counter of POS tags per prompt, in dataset order.
counters = [
    Counter(tag for _word, tag in prompt['tags'])
    for conv in data
    for prompt in conv
]

# Accumulate in place: sum(counters, Counter()) allocates a brand-new Counter
# for every addition, which is quadratic in the number of prompts.
tag_totals = Counter()
for prompt_counts in counters:
    tag_totals.update(prompt_counts)

# Plain dict of tag -> total number of occurrences across all prompts.
d = dict(tag_totals)


In [None]:
# Lookup table for Penn Treebank part-of-speech tags.
# Each key is a tag; each value is a pair
#   (general category used to group tags, detailed description of the tag).
# Tags that are absent from this table (e.g. punctuation tags) get no
# category or description when the table is mapped onto a tag column.
nltk_lexicon = {
    "CC":   ("conjunction",       "coordinating conjunction"),
    "CD":   ("digit",             "cardinal digit"),
    "DT":   ("determiner",        "determiner"),
    "EX":   ("existential there", "existential there"),
    "FW":   ("foreign",           "foreign word"),
    "IN":   ("preposition",       "preposition/subordinating conjunction"),
    "JJ":   ("adjective",         "adjective (e.g. large)"),
    "JJR":  ("adjective",         "adjective, comparative (e.g. larger)"),
    "JJS":  ("adjective",         "adjective, superlative (e.g. largest)"),
    "LS":   ("list item",         "list marker"),
    "MD":   ("modal",             "modal (e.g. could, will)"),
    "NN":   ("noun",              "noun, singular (e.g. cat, tree)"),
    "NNS":  ("noun",              "noun plural (e.g. desks)"),
    "NNP":  ("name",              "proper noun, singular (e.g. Sarah)"),
    "NNPS": ("name",              "proper noun, plural (e.g. Indians or Americans)"),
    "PDT":  ("predeterminer",     "predeterminer (e.g. all, both, half)"),
    "POS":  ("poss. ending",      "possessive ending (e.g. parent's)"),
    "PRP":  ("pers. pronoun",     "personal pronoun (e.g. hers, herself, him, himself)"),
    # The possessive-pronoun tag is "PRP$"; keyed as "PRP" it would overwrite
    # the personal-pronoun entry above and leave PRP$ unmapped.
    "PRP$": ("poss. pronoun",     "possessive pronoun (e.g. her, his, mine, my, our )"),
    "RB":   ("adverb",            "adverb (e.g. occasionally, swiftly)"),
    "RBR":  ("adverb",            "adverb, comparative (e.g. greater)"),
    "RBS":  ("adverb",            "adverb, superlative (e.g. biggest)"),
    "RP":   ("particle",          "particle (e.g. about)"),
    "TO":   ("to",                "infinitive marker (e.g. to)"),
    "UH":   ("interjection",      "interjection (e.g. goodbye)"),
    "VB":   ("verb",              "verb (e.g. ask)"),
    "VBG":  ("verb",              "verb gerund (e.g. judging)"),
    "VBD":  ("verb",              "verb past tense (e.g. pleaded)"),
    "VBN":  ("verb",              "verb past participle (e.g. reunified)"),
    "VBP":  ("verb",              "verb, present tense not 3rd person singular (wrap)"),
    "VBZ":  ("verb",              "verb, present tense with 3rd person singular (bases)"),
    "WDT":  ("wh-word",           "wh-determiner (e.g. that, what)"),
    "WP":   ("wh-word",           "wh- pronoun (e.g. who)"),
    "WRB":  ("wh-word",           "wh- adverb (e.g. how)"),
}

In [None]:
# Per-tag lookups derived from the lexicon. The general category falls back to
# the raw tag whenever the lexicon's category string is empty.
tag_descriptions = {tag: entry[1] for tag, entry in nltk_lexicon.items()}
tag_categories = {tag: (entry[0] or tag) for tag, entry in nltk_lexicon.items()}

# One row per observed tag: its total count, lexicon description, general
# category, and share of the summed counts of all observed tags.
tag_counts = pd.DataFrame(list(d.items()), columns=['tag', 'count'])
df = tag_counts.assign(
    description=tag_counts['tag'].map(tag_descriptions),
    gen_tag=tag_counts['tag'].map(tag_categories),
    prct=tag_counts['count'] / tag_counts['count'].sum(),
)

In [None]:
# Discard rows containing a missing value (tags without a lexicon entry have
# no description / gen_tag) and order the remainder from most to least frequent.
clean_df = (
    df
    .dropna()
    .sort_values(by='count', ascending=False)
)
clean_df

In [None]:
# Pie chart of general part-of-speech categories, saved to ../results/pos_pie.pdf.
MIN_SHARE = 0.01        # tags whose 'prct' is below this are lumped into "other"
PCT_LABEL_SCALE = 1.9   # factor applied to each percentage label's default position

drawable_df = clean_df.copy()
# Only gen_tag needs relabelling: the groupby below keeps just gen_tag and
# count, so rewriting the 'tag' and 'description' columns has no effect.
# NOTE(review): the threshold is applied per fine-grained tag *before* grouping,
# so a rare tag (e.g. an uncommon verb form) is counted under "other" even when
# its general category is common - confirm this is the intended behaviour.
drawable_df.loc[drawable_df['prct'] < MIN_SHARE, 'gen_tag'] = "other"
drawable_df = drawable_df.groupby(['gen_tag'])['count'].sum().reset_index()
display(drawable_df.sort_values('count', ascending=False))

# One colour per slice, sampled evenly from the twilight colormap.
cmap = plt.cm.twilight
colors = cmap(np.linspace(0, 1, len(drawable_df)))

# Slice labels are blanked out; the categories are named in the legend instead.
ax = drawable_df.plot.pie(y='count', labels=[''] * len(drawable_df), autopct='%1.1f%%',
                          startangle=140, figsize=(10, 10), colors=colors,  legend=True)

ax.legend(drawable_df['gen_tag'], title='Part of speech', loc="upper center",
          ncol=1, fontsize='small', bbox_to_anchor=(1.2, 0.9))

# The percentage texts are the ones containing '%' (slice labels are empty).
autotexts = [text for text in ax.texts if '%' in text.get_text()]

# Scale each percentage label's coordinates by PCT_LABEL_SCALE, i.e. move it
# to 1.9x its default distance from the origin.
for autotext in autotexts:
    x, y = autotext.get_position()
    autotext.set_position((x * PCT_LABEL_SCALE, y * PCT_LABEL_SCALE))

ax.set_ylabel('')

plt.savefig('../results/pos_pie.pdf', bbox_inches='tight')

In [None]:
# Cumulative, normalised histogram of conversation sizes
# (number of prompts in each conversation), saved as a PDF.

conversation_lengths = list(map(len, conversations))
print(len(conversation_lengths))

cmap = plt.cm.twilight
colors = cmap(np.linspace(0, 1, 10))

length_ax = sns.histplot(
    conversation_lengths,
    stat='density',
    cumulative=True,
    discrete=True,
    element='bars',
    fill=True,
    color=colors[2],
)

length_ax.set_ylabel('Frequency (normalised)')
length_ax.set_xlabel('Conversation length\n(prompts per conversation)')
length_ax.figure.savefig('../results/conv_length.pdf', bbox_inches='tight')

In [None]:
# Cumulative, normalised histogram of prompt lengths (tokens per prompt).
# Iterate over `data`, not `data_conv`: `data_conv` is only the loop variable
# left behind by the tokenisation cell and holds just the LAST conversation,
# so using it would plot a single conversation instead of the whole dataset.
words_per_prompt = [len(prompt['tokens']) for conv in data for prompt in conv]

cmap = plt.cm.twilight
colors = cmap(np.linspace(0, 1, 10))

sns.histplot(words_per_prompt, cumulative=True, discrete=True, stat='density', element='bars',
             fill=True, color=colors[3])
plt.ylabel('Frequency (normalised)')
plt.xlabel('Prompt length\n(words per prompt)')
plt.savefig('../results/prompt_length.pdf', bbox_inches='tight')