In [128]:
import nltk
import numpy as np
from itertools import groupby, chain
from collections import Counter, defaultdict
import spacy
# Load the "en_core_web_sm" spaCy pipeline once at module level; the
# StatementExtracter methods below call this shared `spacy_nlp` object directly.
spacy_nlp = spacy.load("en_core_web_sm")
import re

class StatementExtracter(object):
    """
    Extract short "subject verb description" statements from a block of text.

    As part of key word extraction and sentiment summarization, every verb in
    the parsed text is combined with a subject phrase found to its left and a
    description phrase found to its right. Parsing uses the module-level spaCy
    pipeline ``spacy_nlp``; the extracted statements are printed, not returned.
    """

    # Punctuation characters used as phrase breaks when the caller passes none.
    _DEFAULT_PUNCTUATIONS = '!"#%&\'()*+,./:;<=>?@[\\]^_`{|}~♪'

    # A sentence terminator followed by whitespace or end of text. Requiring the
    # trailing whitespace keeps decimals such as "4.5" from counting.
    _SENTENCE_END = re.compile(r'[.!?](?:\s|$)')

    # (compiled pattern, replacement) pairs, applied in this order. The specific
    # rules must precede the generic "n't" rule, which would otherwise turn
    # "Can't" into "Ca not". Bare-word rules are anchored with \b so they never
    # rewrite the inside of a longer word (e.g. "orthodontist").
    _CONTRACTION_PATTERNS = [
        (re.compile(regex), repl) for (regex, repl) in (
            (r"\bwon't\b", "will not"), (r"\bWon't\b", "Will not"),
            (r"\bcan't\b", "can not"), (r"\bCan't\b", "Can not"),
            (r"\bi'm\b", "i am"), (r"\bI'm\b", "I am"),
            (r"\bain't\b", "is not"), (r"\bAin't\b", "Is not"),
            (r"(\w+)'ll\b", r"\g<1> will"),
            (r"(\w+)n't\b", r"\g<1> not"),
            (r"(\w+)'ve\b", r"\g<1> have"),
            (r"(\w+)'d\b", r"\g<1> would"),
            (r"&", "and"),
            (r"\bdammit\b", "damn it"),
            (r"\bdont\b", "do not"),
            (r"\bwont\b", "will not"),
            (r"\bthey're\b", "they are"), (r"\bThey're\b", "They are"),
            (r"\bit's\b", "it is"), (r"\bIt's\b", "It is"),
        )
    ]

    # Allowed (left POS, left tags, right POS, right tags) per mode, keyed by
    # whether the text is punctuated. Tokens outside the allowed POS *and*
    # outside the allowed tags become phrase breaks.
    _POS_FILTERS = {
        True: (
            ('ADJ', 'DET', 'NOUN', 'NUM', 'PART', 'PROPN', 'PUNCT', 'ADP', 'PRON', 'AUX', 'SYM', 'SCONJ'),
            (),
            ('ADJ', 'DET', 'NOUN', 'NUM', 'PART', 'PROPN', 'ADV', 'PUNCT', 'ADP', 'AUX', 'SYM', 'SCONJ', 'CCONJ', 'PRON'),
            ('VBG', 'VBZ', 'VBP', 'VB', 'VBD', 'VBN'),
        ),
        False: (
            ('ADJ', 'DET', 'NOUN', 'NUM', 'PART', 'PROPN', 'PRON'),
            (),
            ('ADJ', 'DET', 'NOUN', 'NUM', 'PART', 'PROPN', 'ADV', 'ADP', 'CCONJ'),
            ('VB',),
        ),
    }

    def __init__(self, stopwords = None, punctuations = None):
        """
        stopwords: iterable of lower-case words that always break a phrase
            (defaults to none).
        punctuations: iterable of punctuation strings that always break a
            phrase (defaults to a built-in character list).
        """
        self.stopwords = [] if stopwords is None else stopwords
        self.punctuations = (
            list(self._DEFAULT_PUNCTUATIONS) if punctuations is None else punctuations
        )
        # Union of two sets so any iterable (list, tuple, set) is accepted.
        self.phrase_breaks = set(self.stopwords) | set(self.punctuations)

    def replace_contraction(self, text):
        """
        Takes in text and returns it with certain contractions expanded
        (e.g. "won't" -> "will not", "she'll" -> "she will", "&" -> "and").
        """
        for (pattern, repl) in self._CONTRACTION_PATTERNS:
            text = pattern.sub(repl, text)
        return text

    def spacy_POS_phrase_breaks(self, doc, POS_we_want, tag_we_want):
        """
        Inputs an iterable of spaCy tokens, a collection of POS, and a collection
        of spaCy tags that we want to keep in. Returns a new set containing the
        instance's phrase breaks plus the lower-cased text of every token whose
        POS is not in POS_we_want and whose tag is not in tag_we_want.
        The instance's own phrase_breaks set is left untouched.
        """
        temp_phrase_breaks = set(self.phrase_breaks)
        temp_phrase_breaks.update(
            token.text.lower()
            for token in doc
            if token.pos_ not in POS_we_want and token.tag_ not in tag_we_want
        )
        return temp_phrase_breaks

    def visualize_POS(self,text):
        """
        Print one line per token showing text/POS/tag, the dependency label,
        and the head's text/POS/tag. Contractions are not expanded here.
        """
        doc = spacy_nlp(text)
        for token in doc:
            print("{0}/{1}/{2} <--{3}-- {4}/{5}/{6}".format(
                    token.text,token.pos_,token.tag_,token.dep_,token.head.text,token.head.pos_,token.head.tag_))

    def is_statements(self, text):
        """
        Expand contractions in text and print the statements extracted from it,
        picking the POS filters according to whether the text is punctuated.
        """
        # Decide from actual sentence terminators rather than a digit sentinel,
        # so unpunctuated captions that merely contain a number are handled as
        # unpunctuated text.
        with_punctuation = self._SENTENCE_END.search(text) is not None
        text = self.replace_contraction(text)
        self.statementExtraction(text, withPunctuation = with_punctuation)

    @staticmethod
    def _split_phrases(words, phrase_breaks):
        """Return the runs of consecutive words (as tuples) that contain no phrase break."""
        grouped = groupby(words, lambda word: word.text.lower() not in phrase_breaks)
        return [tuple(group) for (keep, group) in grouped if keep]

    @staticmethod
    def _pick_subject(phrase_tuples):
        """Return the phrase holding the preferred 'nsubj'/'expl' word, or () if there is none."""
        subject = ()
        subject_word = None
        for phrase in phrase_tuples:
            for word in phrase:
                if word.dep_ not in ('nsubj', 'expl'):
                    continue
                # An existing noun candidate is never displaced by a non-noun;
                # in that case keep scanning the remaining words of the phrase.
                if (subject_word is not None
                        and subject_word.pos_ == 'NOUN'
                        and word.pos_ != 'NOUN'):
                    continue
                subject_word = word
                subject = phrase
                break
        return subject

    def statementExtraction(self, text, withPunctuation = False):
        """
        Parse text with spaCy and, for every token tagged VERB, print its
        subtree followed by "<subject> <verb> <description>" when both a subject
        phrase (left of the verb) and a description phrase (right of the verb)
        are found.

        withPunctuation: truthy selects the POS/tag filters meant for
            punctuated text, falsy the filters for unpunctuated text.
        """
        left_POS, left_tags, right_POS, right_tags = self._POS_FILTERS[bool(withPunctuation)]

        doc = spacy_nlp(text)
        for token in doc:
            if token.pos_ != "VERB":
                continue

            # Split the verb's subtree into the words before and after it.
            subtree = list(token.subtree)
            left_subtree = [word for word in subtree if word.i < token.i]
            right_subtree = [word for word in subtree if word.i > token.i]

            # Break words are derived from the whole subtree for each side.
            left_phrase_breaks = self.spacy_POS_phrase_breaks(subtree, left_POS, left_tags)
            right_phrase_breaks = self.spacy_POS_phrase_breaks(subtree, right_POS, right_tags)

            left_phrase_tuples = self._split_phrases(left_subtree, left_phrase_breaks)
            right_phrase_tuples = self._split_phrases(right_subtree, right_phrase_breaks)

            subject = self._pick_subject(left_phrase_tuples)
            # The description is simply the first phrase to the right of the verb.
            description = right_phrase_tuples[0] if right_phrase_tuples else ()

            # Create the statements (the subtree is printed for every verb).
            print(subtree)
            if subject and description:
                output = (" ".join(word.text for word in subject)
                          + " " + token.text + " "
                          + " ".join(word.text for word in description))
                print(output)

In [115]:
import pandas as pd

# Read the caption examples from disk and keep the `captions` column as a list.
df = pd.read_csv("videoExamples.csv")
examples = list(df["captions"])

In [129]:
# Default configuration: no stopwords, built-in punctuation list as phrase breaks.
se = StatementExtracter()

In [130]:
# Run statement extraction on the caption at index 3; results are printed, not returned.
se.is_statements(examples[3])

[Last, year, ,, Insta360, came, out, with, the, One, X, ,, and, it, totally, revolutionized, the, idea, of, a, 360, camera, could, be, used, what, for, .]
Insta360 came out with the One X
[it, totally, revolutionized, the, idea, of, a, 360, camera, could, be, used, what, for, .]
it revolutionized the idea of a 360 camera could be used what for
[a, 360, camera, could, be, used, what, for]
[First, off, ,, footage, shot, with, the, One, X]
[First, off, ,, footage, shot, with, the, One, X, was, stitched, in, camera, ,, and, then, you, were, able, to, edit, it, right, on, your, phone, and, upload, to, social, platforms, almost, instantly, .]
[to, edit, it, right, on, your, phone, and, upload, to, social, platforms, almost, instantly]
[upload, to, social, platforms, almost, instantly]
[Secondly, ,, they, market, it, not, as, a, 360-degree, camera, but, instead, a, camera, that, could, shoot, 360, degrees, of, possible, angles, .]
they market it not as a 360-degree camera but instead a camera

In [131]:
# Print one line per token of the same caption: text/POS/tag, dependency label, and head.
se.visualize_POS(examples[3])

 /SPACE/_SP <---- (/PUNCT/-LRB-
(/PUNCT/-LRB- <--punct-- music/NOUN/NN
mellow/ADJ/JJ <--amod-- music/NOUN/NN
piano/NOUN/NN <--compound-- music/NOUN/NN
music/NOUN/NN <--ROOT-- music/NOUN/NN
)/PUNCT/-RRB- <--punct-- music/NOUN/NN
(/PUNCT/-LRB- <--punct-- music/NOUN/NN
upbeat/ADJ/JJ <--amod-- music/NOUN/NN
music/NOUN/NN <--appos-- music/NOUN/NN
)/PUNCT/-RRB- <--punct-- music/NOUN/NN
-/PUNCT/. <--punct-- is/AUX/VBZ
This/DET/DT <--nsubj-- is/AUX/VBZ
is/AUX/VBZ <--ROOT-- is/AUX/VBZ
the/DET/DT <--det-- MAX/PROPN/NNP
GoPro/PROPN/NNP <--compound-- MAX/PROPN/NNP
MAX/PROPN/NNP <--attr-- is/AUX/VBZ
./PUNCT/. <--punct-- is/AUX/VBZ
And/CCONJ/CC <--cc-- 's/AUX/VBZ
it/PRON/PRP <--nsubj-- 's/AUX/VBZ
's/AUX/VBZ <--ROOT-- 's/AUX/VBZ
GoPro/PROPN/NNP <--poss-- attempt/NOUN/NN
's/PART/POS <--case-- GoPro/PROPN/NNP
second/ADJ/JJ <--amod-- attempt/NOUN/NN
attempt/NOUN/NN <--attr-- 's/AUX/VBZ
at/ADP/IN <--prep-- attempt/NOUN/NN
a/DET/DT <--det-- level/NOUN/NN
consumer/NOUN/NN <--compound-- level/NOUN/NN
level/

In [122]:
# Split the caption at index 2 into sentence strings using spaCy's sentence boundaries.
text = examples[2]
doc = spacy_nlp(text)
sentence_tokens = [sentence.text for sentence in doc.sents]

In [123]:
# Display the sentence strings taken from doc.sents in the previous cell.
sentence_tokens

[' Google has been building its own Chromebooks for a while now',
 'first there were two iterations of the Chromebook pixel and then there was 2017 pixel book all three worked great laptops with one huge flaw they cost way too much money',
 "Google's back at it again this year with the new pixel book go and for the first time Google's not trying to make the best Chromebook ever price be damned [Music]",
 'instead Google is trying to build a Chromebook that anyone can use and afford the pixel will go starts at six hundred and fifty dollars a full 350 less than the original pixel book',
 "it's still a lot of money for a Chromebook",
 "but for the first time Google's built something better than the cheap low-end Chromebooks out there that still semi affordable unfortunately in its quest to get the price down Google",
 'also second rest a lot of what made the original pixel looks so intriguing in the first place but for a lot of people',
 "those trade-offs will be worth it all of Google's 