In [1]:
import os
import string
import core_rank
from data import utils
from data import clustering
from data.meeting import meeting_lists
from collections import Counter
from nltk import PerceptronTagger
from nltk import TweetTokenizer
from dictionary_tokenizer import DictionaryTokenizer
from sklearn.model_selection import ParameterGrid
import json
import glob
import re

In [2]:
# English stopword list for meeting data.
path_to_stopwords = 'resources/stopwords/meeting/stopwords.{}.dat'.format('en')
stopwords = utils.load_stopwords(path_to_stopwords)

# English filler-word list, handed to clean_utterance further down.
path_to_filler_words = 'resources/stopwords/meeting/filler_words.{}.txt'.format('en')
filler_words = utils.load_filler_words(path_to_filler_words)

In [3]:
def clean_utterance(utterance, filler_words):
    """Collapse immediate repetitions and strip filler words from an utterance.

    Args:
        utterance: Raw utterance text.
        filler_words: Iterable of filler words/phrases. Each one is treated
            as literal text (regex metacharacters are escaped).

    Returns:
        The cleaned utterance padded with one leading and one trailing
        space. The padding is what allows fillers at either end of the
        utterance to match; callers are expected to strip the result.
    """
    # replace consecutive unigrams with a single instance
    utt = re.sub(r'\b(\w+)\s+\1\b', r'\1', utterance)
    # same for longer spans that are immediately repeated (e.g. bigrams)
    utt = re.sub(r'(\b.+?\b)\1\b', r'\1', utt)
    # collapse runs of spaces, then strip leading and trailing white space
    utt = re.sub(' +', ' ', utt).strip()

    # remove filler words # highly time-consuming
    utt = ' ' + utt + ' '
    for filler_word in filler_words:
        # Escape so the filler is matched literally and cannot break the
        # pattern (e.g. "c++") or match unrelated text.
        filler = re.escape(filler_word)
        utt = re.sub(' ' + filler + ' ', ' ', utt)
        utt = re.sub(' ' + filler + ', ', ' ', utt)
        # The period must be escaped: a bare '.' matches any character and
        # would wipe out words that merely start with the filler. Keep the
        # sentence break as ". " so neighbouring words are not glued together.
        utt = re.sub(' ' + filler + r'\. ', '. ', utt)

    return utt

In [4]:
# Build the transcript corpus: meeting id -> [(position, cleaned utterance)].
# `position` is the utterance's index in the raw sentence split, so the
# numbering has gaps wherever an utterance was empty after cleaning.
corpus = {}
ids = []

# (pattern, replacement) pairs applied in this order to every raw utterance.
utterance_substitutions = [
    ("'Kay", 'Okay'),
    ("'kay", 'Okay'),
    ('"Okay"', 'Okay'),
    ("'cause", 'cause'),
    ("'Cause", 'cause'),
    ('"cause"', 'cause'),
    ('"\'em"', 'them'),
    ('"\'til"', 'until'),
    ('"\'s"', 's'),
    # periods and newlines become spaces
    ('[.\n]', ' '),
    # NOTE(review): periods were already turned into spaces by the previous
    # substitution, so this pattern only matches because its unescaped '.'
    # accepts any character. Kept as-is to preserve behavior.
    ('h. t. m. l.', 'html'),
    # spelled-out abbreviations: "l_ c_ d_" -> "lcd", "t_ v_" -> "tv"
    (r"(\w)\_ (\w)\_ (\w)\_", r"\1\2\3"),
    (r"(\w)\_ (\w)\_", r"\1\2"),
    (r"(\w)\_", r"\1"),
]

# sorted(): glob order depends on the file system; sorting makes `ids` stable.
for transcript_path in sorted(glob.glob('data/transcript/' + '*' + 'transcript.txt')):
    # context manager so the file handle is closed right after reading
    with open(transcript_path, 'r') as transcript_file:
        transcript = transcript_file.read()

    # Meeting id = file name without directory and '.transcript.txt' suffix.
    # os.path.basename uses the platform's separators, so this works both on
    # Windows (where glob returns '\\') and on POSIX.
    meeting_id = re.sub(r'\.transcript\.txt$', '', os.path.basename(transcript_path))

    utterances = []
    # split into sentence-like utterances on ". ", "? ", "! " and "; "
    for position, utt in enumerate(re.split(r'[.?!;] ', transcript)):
        for pattern, replacement in utterance_substitutions:
            utt = re.sub(pattern, replacement, utt)

        utt = clean_utterance(utt, filler_words=filler_words)
        # collapse runs of spaces, then strip leading and trailing white space
        utt = re.sub(' +', ' ', utt).strip()

        # drop utterances that are empty or a lone period after cleaning
        if utt not in ('', '.'):
            utterances.append((position, utt))

    corpus[meeting_id] = utterances
    ids.append(meeting_id)

In [5]:
# Scratch cell: list repetition, builds a list holding five 5s.
[5]*5

[5, 5, 5, 5, 5]

In [9]:
# --- Clustering configuration ---------------------------------------------
# The clustering cells of this notebook forward these values as keyword
# arguments to clustering.cluster_utterances.
algorithm = 'kmeans'
aware = 'none'
feature = 'tfidf'
ngram_range = (1, 1)
extra_features = []
lsa = True
lsa_n_components = 30
w = 3  # forwarded as twidf_window_size

# --- Filtering thresholds -------------------------------------------------
min_words = 3  # an utterance needs at least this many terms after cleaning
min_elt = 1    # a cluster needs at least this many members to be written out

# NOTE(review): the clustering cells visible in this notebook compute n_comms
# from the utterance count instead of reading this value, and nothing visible
# reads overspanning -- confirm whether both are still needed.
n_comms = 20
overspanning = True

In [None]:
# Cluster every meeting's utterances and write one text file per cluster:
#   data/community/meeting/ami_v2/<meeting id>.<cluster number>.txt

# Output directory; created once up front instead of re-checked per meeting.
path_to_community = 'data/community/meeting/' + 'ami' + '_' + 'v2' + '/'
os.makedirs(path_to_community, exist_ok=True)

# `meeting_id` rather than `id`, so the builtin id() is not shadowed.
for meeting_id in ids:
    print(meeting_id)

    utterances_indexed = corpus[meeting_id]
    # position -> utterance text as stored in the corpus. Positions come from
    # enumerate() in the corpus-building cell, so they are unique per meeting
    # and a dict lookup returns the same text as a linear search would.
    text_by_index = dict(utterances_indexed)

    # #####################################
    # ### Pre-processing for Clustering ###
    # #####################################
    utterances_processed = []
    # Not read by the visible cells; still filled in case other cells use them.
    lists_of_terms = []
    utterances_remain = []
    for index, utt in utterances_indexed:
        utt_cleaned = utils.clean_text(
            utt,
            stopwords=stopwords,
            remove_stopwords=True,
            pos_filtering=False,
            stemming=True,
            # clustering based on lowercase form.
            lower_case=True
        )
        # keep only utterances with at least min_words terms left after cleaning
        if len(utt_cleaned) >= min_words:
            utterances_processed.append((index, ' '.join(utt_cleaned)))
            lists_of_terms.append(utt_cleaned)
            utterances_remain.append(utt)

    print(len(utterances_processed), 'utterances')

    # ############################
    # ### UTTERANCE CLUSTERING ###
    # ############################
    # NOTE(review): n_comms is 0 for meetings with fewer than 15 retained
    # utterances -- confirm how cluster_utterances handles that.
    membership = clustering.cluster_utterances(
        utterances_processed,
        algorithm=algorithm,
        aware=aware,
        n_comms=len(utterances_processed)//15,
        feature=feature,
        ngram_range=ngram_range,
        extra_features=extra_features,
        lsa=lsa,
        lsa_n_components=lsa_n_components,
        twidf_window_size=w,
        meeting_id=meeting_id
    )

    # cluster labels that have at least min_elt members
    c = dict(Counter(membership))
    comm_labels = [k for k, v in c.items() if v >= min_elt]

    for i, label in enumerate(comm_labels):
        # corpus positions of the utterances assigned to this cluster
        member_indices = [
            sent[0]
            for j, sent in enumerate(utterances_processed)
            if membership[j] == label
        ]
        with open(path_to_community + meeting_id + '.' + str(i) + '.txt', 'w') as txtfile:
            # the whole cluster goes on a single line, utterances joined by '. '
            for member_index in member_indices:
                txtfile.write(text_by_index[member_index] + '. ')
            txtfile.write('\n')

EN2001a
667 utterances


In [6]:
# Load the abstractive summaries: meeting id -> list of lower-cased sentences.
summary = {}

# sorted(): glob order depends on the file system; sorting fixes the
# insertion order of `summary`, which later cells iterate over.
for summary_path in sorted(glob.glob('data/summary/' + '*' + 'abssumm.txt')):
    # context manager so the file handle is closed right after reading
    with open(summary_path, 'r') as summary_file:
        summary_text = summary_file.read()

    # Meeting id = file name without directory and '.abssumm.txt' suffix.
    # os.path.basename uses the platform's separators, so this works both on
    # Windows (where glob returns '\\') and on POSIX.
    meeting_id = re.sub(r'\.abssumm\.txt$', '', os.path.basename(summary_path))

    sentences = []
    # split into sentences on ". ", "? ", "! " and "; "
    for sentence in re.split(r'[.?!;] ', summary_text):
        # lower-case, collapse runs of spaces, strip both ends
        sentence = re.sub(' +', ' ', sentence.lower()).strip()
        # drop fragments that are empty or a lone period
        if sentence not in ('', '.'):
            sentences.append(sentence)

    summary[meeting_id] = sentences

In [8]:
# Spot-check: the (position, utterance) pairs stored for meeting TS3010d.
corpus['TS3010d']

[(1, 'so, now last time'),
 (2, 'can you push the button'),
 (3, 'one time'),
 (4, "so i'm still the secretary"),
 (5, 'now i ask you to presentate the prototype'),
 (6, 'one of your you two'),
 (7, 'yes'),
 (8, 'the prototype'),
 (10, 'now yes'),
 (14, 'and the buttons'),
 (15, 'and the joystick is for the volume and the channels'),
 (16, 'so if joystick and lcd'),
 (17, "what's the rr d"),
 (18, 'very good'),
 (19, 'so, we have so, what they say on the side is put fashion there'),
 (20, 'yes'),
 (21, "it's good"),
 (22, "so, that's it"),
 (23, "that's prototype"),
 (24, 'now, the finance'),
 (25, "we if it's th it if it's"),
 (26, "so, i'm gonna look"),
 (27, 'we have'),
 (28, "finance is it's no, first yes"),
 (29, 'we have to evaluate the product yet'),
 (31, 'criteria'),
 (32, "no, it's"),
 (33, "it's"),
 (34, 'what do you think'),
 (36, 'two'),
 (37, 'you'),
 (38, 'me too'),
 (39, "so it's a three"),
 (40, "so it's a yes, it's for the younger g group"),
 (41, "so it's half of the

In [9]:
# Spot-check: the lower-cased summary sentences stored for meeting TS3010d.
summary['TS3010d']

['the industrial designer and user interface designer presented their prototype design, made of yellow rubber and hard plastic, with large, mostly blue buttons, a joystick and lcd screen',
 'led by the marketing expert, the group evaluated the prototype on a scale of one to seven, based on a set of evaluation criteria',
 'the overall rating was two',
 'the project manager calculated the production costs, which were too high at fifteen euros',
 'the group discussed how to make the design cheaper, and decided to keep the lcd screen, but to remove the special colour and replace the joystick with regular push-buttons',
 'finally the project manager led an evaluation of the project process before closing the meeting',
 'overall, the group were satisfied with the creativity, teamwork and available equipment, although the marketing expert thought the smartboard and laptops were sometimes distracting and not that helpful']

In [10]:
# For every summarised meeting, cluster its utterances and write one .story
# file per cluster: the cluster's utterances followed by every summary
# sentence, each introduced by an "@highlight" marker.
#   data/story5/<meeting id>.<cluster number>.story

# Output directory; created once up front instead of re-checked per meeting.
path_to_community = 'data/story5/'
os.makedirs(path_to_community, exist_ok=True)

# `meeting_id` rather than `id`, so the builtin id() is not shadowed.
for meeting_id in summary.keys():
    print(meeting_id)

    utterances_indexed = corpus[meeting_id]
    summ = summary[meeting_id]
    # position -> utterance text as stored in the corpus. Positions come from
    # enumerate() in the corpus-building cell, so they are unique per meeting
    # and a dict lookup returns the same text as a linear search would.
    text_by_index = dict(utterances_indexed)

    # #####################################
    # ### Pre-processing for Clustering ###
    # #####################################
    utterances_processed = []
    # Not read by the visible cells; still filled in case other cells use them.
    lists_of_terms = []
    utterances_remain = []
    for index, utt in utterances_indexed:
        utt_cleaned = utils.clean_text(
            utt,
            stopwords=stopwords,
            remove_stopwords=True,
            pos_filtering=False,
            stemming=True,
            # clustering based on lowercase form.
            lower_case=True
        )
        # keep only utterances with at least min_words terms left after cleaning
        if len(utt_cleaned) >= min_words:
            utterances_processed.append((index, ' '.join(utt_cleaned)))
            lists_of_terms.append(utt_cleaned)
            utterances_remain.append(utt)

    print(len(utterances_processed), 'utterances')

    # ############################
    # ### UTTERANCE CLUSTERING ###
    # ############################
    # NOTE(review): n_comms is 0 for meetings with fewer than 5 retained
    # utterances -- confirm how cluster_utterances handles that.
    membership = clustering.cluster_utterances(
        utterances_processed,
        algorithm=algorithm,
        aware=aware,
        n_comms=len(utterances_processed)//5,
        feature=feature,
        ngram_range=ngram_range,
        extra_features=extra_features,
        lsa=lsa,
        lsa_n_components=lsa_n_components,
        twidf_window_size=w,
        meeting_id=meeting_id,
        sent_num=10
    )

    # cluster labels that have at least min_elt members
    c = dict(Counter(membership))
    comm_labels = [k for k, v in c.items() if v >= min_elt]

    for i, label in enumerate(comm_labels):
        # corpus positions of the utterances assigned to this cluster
        member_indices = [
            sent[0]
            for j, sent in enumerate(utterances_processed)
            if membership[j] == label
        ]
        with open(path_to_community + meeting_id + '.' + str(i) + '.story', 'w') as story_file:
            # the whole cluster goes on a single line, utterances joined by '. '
            for member_index in member_indices:
                story_file.write(text_by_index[member_index] + '. ')
            # every story file carries the full summary of its meeting
            for sent in summ:
                story_file.write('\n\n@highlight\n\n{}'.format(sent))
            story_file.write('\n')

ES2002a
94 utterances
ES2002b
204 utterances
ES2002c
214 utterances
ES2002d
198 utterances
ES2003a
64 utterances
ES2003b
166 utterances
ES2003c
196 utterances
ES2003d
202 utterances
ES2004a
80 utterances
ES2004b
179 utterances
ES2004c
192 utterances
ES2004d
189 utterances
ES2005a
23 utterances
ES2005b
180 utterances
ES2005c
197 utterances
ES2005d
107 utterances
ES2006a
77 utterances
ES2006b
168 utterances
ES2006c
187 utterances
ES2006d
183 utterances
ES2007a
85 utterances
ES2007b
134 utterances
ES2007c
192 utterances
ES2007d
96 utterances
ES2008a
76 utterances
ES2008b
190 utterances
ES2008c
183 utterances
ES2008d
234 utterances
ES2009a
115 utterances
ES2009b
121 utterances
ES2009c
181 utterances
ES2009d
183 utterances
ES2010a
47 utterances
ES2010b
162 utterances
ES2010c
178 utterances
ES2010d
97 utterances
ES2011a
71 utterances
ES2011b
111 utterances
ES2011c
140 utterances
ES2011d
132 utterances
ES2012a
60 utterances
ES2012b
159 utterances
ES2012c
161 utterances
ES2012d
57 utterances
E

In [77]:
# Empty containers for the meeting ids of the train / validation / test splits.
train_files, valid_files, test_files = [], [], []

In [78]:
import random
from random import shuffle

# Start from empty lists inside this cell, so re-running it never appends a
# second copy of the split to lists left over from a previous run.
train_files, valid_files, test_files = [], [], []

# The insertion order of `summary` follows glob's file-system-dependent order.
# Sort before shuffling so the fixed seed yields the same split on any machine.
dataset = sorted(summary)
size = len(dataset)
random.seed(2020)
shuffle(dataset)

# First 10% of the shuffled ids -> test, next 10% -> validation, rest -> train.
for position, meeting_id in enumerate(dataset):
    if position < size * 0.1:
        test_files.append(meeting_id)
    elif position < size * 0.2:
        valid_files.append(meeting_id)
    else:
        train_files.append(meeting_id)

In [79]:
# Display the split as a (train, valid, test) tuple of meeting-id lists.
train_files, valid_files, test_files

(['IS1006d',
  'ES2008c',
  'ES2003d',
  'ES2012c',
  'ES2003a',
  'IS1003c',
  'ES2011a',
  'ES2008b',
  'ES2010c',
  'IS1000d',
  'TS3006a',
  'ES2002b',
  'TS3006b',
  'IS1002c',
  'IB4011',
  'IS1006b',
  'ES2009a',
  'ES2012d',
  'ES2007b',
  'IS1004c',
  'ES2005c',
  'ES2014c',
  'TS3011b',
  'TS3004c',
  'TS3009b',
  'ES2013b',
  'IS1003b',
  'ES2009b',
  'ES2010d',
  'TS3005c',
  'TS3011c',
  'ES2007c',
  'IS1009b',
  'ES2004a',
  'IS1001c',
  'IS1005b',
  'IS1006a',
  'IS1007c',
  'TS3004a',
  'IS1007b',
  'TS3005a',
  'ES2005a',
  'TS3004d',
  'TS3009a',
  'TS3008a',
  'TS3008b',
  'ES2014b',
  'ES2015c',
  'ES2006a',
  'IS1005a',
  'ES2011b',
  'IS1002b',
  'TS3003d',
  'TS3011d',
  'TS3012c',
  'IS1000c',
  'TS3003b',
  'TS3012b',
  'IS1000b',
  'ES2009d',
  'ES2016c',
  'IS1004b',
  'IS1009a',
  'IS1000a',
  'ES2011d',
  'ES2009c',
  'ES2016d',
  'TS3009c',
  'IS1001b',
  'IS1004d',
  'ES2002d',
  'ES2006d',
  'IS1008c',
  'ES2010b',
  'IS1003d',
  'ES2004d',
  'IS1006c',


In [80]:
# Number of meetings assigned to the test split.
print(len(test_files))

15


In [81]:
# Scratch cell: floor-division check.
print(6//5)

1
