# Theory: Extract multiple phrases surrounding keywords into a dataset. Then, anonymize the dataset extractions by simply replacing the rex expression with nothing. After that, apply positive-negative sentiment analysis to the phrase to remove any potential connotations invoked by the initial phrasing.

## Extracting phrases rather than just sentential analysis allows for patterns of speech to be extracted from speeches and then compared for frequent useage.

## It also helps humans build a phrase anonymizing map to go beyond just single words or a priori judgments about what is in speeches

In [1]:
import pandas as pd
import numpy as np
import re
import os
from collections import defaultdict, OrderedDict
import sys

In [2]:
import textblob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [4]:
clinton = pd.read_csv('')
trump = pd.read_csv('')

In [5]:
clinton['month'] = clinton['month'].apply(lambda x: "{:02d}".format(x))
clinton['date'] = clinton['date'].apply(lambda x: "{:02d}".format(x))
clinton['year'] = clinton['year'].astype(str)

trump['month'] = trump['month'].apply(lambda x: "{:02d}".format(x))
trump['date'] = trump['date'].apply(lambda x: "{:02d}".format(x))
trump['year'] = trump['year'].astype(str)

In [6]:
def rex_phrase_creator(word,before_start,before_stop,after_start,after_stop):
    before_rex = []
    after_rex = []
    before_words = '\S* '
    after_words = ' \S*'
    word_rex = '\s*{}\s*'.format(word)
    phrases = []
    
    for i in range(before_start,before_stop):
        temp_b = before_words*i
        temp_b = temp_b
        before_rex.append(('b_{}'.format(i),temp_b))
    
    for i in range(after_start,after_stop):
        temp_a = after_words*i
        temp_a = temp_a
        after_rex.append(('a_{}'.format(i),temp_a))
    
    for b_col,b_rex in before_rex:
        for a_col,a_rex in after_rex:
            total_phrase = '({}{}{})'.format(b_rex,word,a_rex)
            column_name = word + '_' + b_col + '_' +  a_col
            phrases.append((column_name,total_phrase))

    for a_col,a_rex in after_rex:
        total_phrase = '({}{})'.format(word,a_rex)
        phrases.append((word + '_' + a_col,total_phrase))

    for b_col,b_rex in before_rex:
        total_phrase = '({}{})'.format(b_rex,word)
        phrases.append((word + '_' + b_col,total_phrase))
        
    return phrases

In [7]:
def clean_data_for_processing(text_str):
    clean_txt = text_str.replace('\n','').replace('\\','').replace(',','')\
                    .replace('!','.').replace('?','.')\
                    .rstrip(' ').lstrip('')\
                    .split('.')
    
    clean_txt = [x.lstrip(' ').rstrip(' ').lower() for x in clean_txt if x.lstrip(' ').rstrip(' ') != '']
    return clean_txt

In [8]:
def split_text_to_dataframe(txt,source_id,speaker):
    df = pd.DataFrame(data=txt,columns=['sentence'])
    df['source_id'] = source_id
    df['speaker'] = speaker
    return df

In [9]:
def create_rex_searches(rex_args):
    rexes = []
    for search in rex_args:
        searches = rex_phrase_creator(*search)
        rexes.extend(searches)
    return rexes

def apply_rex_to_df(df,rexes):
    apply_df = df.copy()
    for col, rex in rexes:
        apply_df[col] = apply_df['sentence'].str.lower().str.extract(rex)
    return apply_df

# Building automated functions for duplicate phrase extraction

In [10]:
column_searches = []
def phrase_columns_to_seed_word(df,extraction_seeds):
    rexes = []
    # Modify to multi-word searches
    for seed_word in extraction_seeds:
        search_rex = re.compile(r'{}'.format(seed_word))
        rexes.append(search_rex)
    
    seed_word_to_phrase_extractions = []
    for seed, rex in zip(extraction_seeds,rexes):
        
        # Generate a list of the associated extraction columns to the seed words
        extracted_columns = [col for col in df.columns if rex.search(col)]
        seed_word_to_phrase_extraction.append((seed,extracted_columns))
        
    return seed_word_to_phrase_extractions

def count_of_extracted_phrases_by_seed(df,seed_word_to_phrase_extractions):
    local_df = df.copy()
    count_and_mention_columns = []
    for seed_word, extracted_columns in seed_word_to_phrase_extractions:
        count_col_name = seed_word + '_count'
        mention_name = seed_word + '_mentioned'
        
        # Control logic to allow reruns in a notebook environment
        if count_col_name in local_df.columns:
            local_df = local_df.drop(count_col_name,axis=1)
        if mention_name in df.columns:
            local_df = local_df.drop(mention_name,axis=1)
            
        # recombine the tuples of seeds and extraction columns
        # with the column associated with the count of how many extracted phrases
        # are associated with a sentence
        seed_extracted_cols_w_count.append((seed_word,extracted_columns,count_col_name))
        
        # count_column is a row wise sum of how many phrases the seed pulls out
        # with the associated auto-generated phrases
        local_df[count_col_name] = local_df[extracted_columns].notnull().sum(axis=1)
        
        # mention_name is just a boolean to provide simple access to a mask about
        # whether there is any successful extraction of the seed from the sentence
        local_df[mention_name] = local_df[count_col_name] > 0

    return local_df, seed_extracted_cols_w_count

# Groups duplicates by the speaker and then returns all the indexes for duplicates for easy comparison
def indexes_of_duplicated_phrases(df,phrase_columns,count_col):
    local_df = df.copy()
    
    # defaultdict is used so a column can be easily associated with indicies
    # without a host of control logic to construct the key if it doesn't exist
    # Also allows for phrase columns to be mixed and matched for with aliases without key collisions
    indexes_of_duplicated_phrases = defaultdict(set)
    
    # pull out the speaker to group-by and the repackaged list of
    # the seed based extraction columns
    # this is a sub function that requires another loop to pass the seed based columns to it
    for speaker,group_df in local_df[['speaker'] + phrase_columns].groupby('speaker'):
        
        # while a mask is available, this allows future tunability
        group_df = group_df[group_df[count_col] > 0]
        
        # pass over each phrase in phrase based columns to scan for duplicated values
        # Currently this requires just one duplication of a phrase, indicating a duplication
        # to select that phrase as a potential link between two sentences
        #
        # this does not return the actual phrases, but the indicies of the phrases
        # this allows for cross examination of the indicies to provide further control
        # over conditions to use a phrase
        for col in phrase_columns:
            phrase_duplication_s = local_df[col].dropna()
            
            # construct the mask for duplications
            phrase_duplication_s = phrase_duplication_s.duplicated(keep=False)
            
            # apply the mask to the series
            phrase_duplication_s = phrase_duplication_s.loc[phrase_duplication_s]
            indexes_of_duplicated_phrases[col].update(list(phrase_duplication_s.index))
            
    return indexes_of_duplicated_phrases

def reduce_and_associate_sentences_to_phrases(df,indexes_of_duplicated_phrases,phrase_columns):
    # from the indicies, pull the sentences with viable phrases out of the initial dataframe
    # Returns two different kinds of data. A dictionary of the specific phrase rex to the data
    # and a new dataframe of just the sentence, speaker, and matching phrase
    local_df = df.copy()
    generic_phrase_structure_to_phrases = dict()
    all_sentences_to_seeded_phrase = pd.DataFrame(columns=['sentence','speaker','extracting_phrase'])
    for col in phrase_columns:
        phrase_data = local_df.loc[indexes_of_duplicated_phrases[col],['sentence','speaker',col]]\
                                        .sort_values(by=col)
        generic_phrase_structure_to_phrases[col] = phrase_data
        combinable_data = phrase_data.copy()
        combinable_data.columns = ['sentence','speaker','extracting_phrase']
        all_sentences_to_seeded_phrase = all_sentences_to_seeded_phrase.append(combinable_data)
        
    return generic_phrase_structure_to_phrases, all_sentences_to_seeded_phrase

In [12]:
def common_phrase_with_count(df,blocking_words):
    # simple way to add blocking words to remove words like 'a' or 'by' from calculating
    # the length of the actual phrase. So "by me" is just "me" in counting the complexity of the phrase
    phrase_counted_df = pd.DataFrame()
    temp_df = df.copy()
    for sentence, grouped_df in temp_df.groupby('sentence'):
        # reset index because two different phrases will be associated with the same
        # sentence index
        grouped_df.reset_index(inplace=True)
        
        # drop duplicates to look only at each individual phrase, not all instances
        grouped_df = grouped_df.drop_duplicates(subset=['dupe_phrases'])
        first_words = grouped_df['extracting_phrase'].str.extract(r'(^\w+) ')
        first_words = first_words.isin(blocking_words)
        grouped_df = grouped_df.loc[~first_words.values.flatten()]
        grouped_df['phrase_length'] = grouped_df['extracting_phrase'].str.count(' ')
        phrase_length_df = phrase_length_df.append(grouped_df)
        
    phrase_length_df = phrase_length_df.drop_duplicates(subset=['extracting_phrase'])
    
    # tunable parameter to select phrase complexity
    phrase_length_df = phrase_length_df[phrase_length_df['phrase_length'] > 0]

    splitting_phrases = phrase_length_df.sort_values(by='phrase_length',
                                                     ascending=False)['extracting_phrase'].values

    splitting_phrases = [x.strip() for x in splitting_phrases]
    
    # return the phrases that no longer contain blocking words with a df that allows 
    # both human inspectable data for phrases in notebooks and the phrases sorted by length
    
    return phrase_length_df.sort_values(by='phrase_length',ascending=False), sorted_splitting_phrases


def split_sentences_on_common_phrases(df,splitting_phrases):
    local_df = df.copy()
    local_df = local_df.reset_index()
    for index, row in local_df.iterrows():
        for phrase in splitting_phrases:
            # prevent the phrase from splitting on partial words
            if not re.search(r'{}$|{} '.format(phrase,phrase),row['sentence']):
                continue
            
            # loop through the sorted list of phrases to find the longest phrase
            # that successfully splits the sentence into two parts
            # this is a method to merge all the multiple phrases that successfully
            # can be extracted from a sentence into a single maximal phrase that works
            # and is cross-referenced against other sentences
            split_phrases = row['sentence'].split(phrase)
            if len(split_phrases) > 1:
                local_df.at[index,'left_split'] = split_phrases[0]
                local_df.at[index,'right_split'] = split_phrases[1]
                local_df.at[index,'splitting_phrase'] = phrase
                break
    return local_df

In [13]:
def text_blob_analysis(org_df,column):
    df = org_df.copy()
    df = df.fillna('')
    col_sentiment = column + '_sentiment'
    col_subjective = column + '_subjectivity'
    df[col_sentiment] = df[column].apply(lambda x: textblob.TextBlob(x).sentiment[0])
    df[col_subjective] = df[column].apply(lambda x: textblob.TextBlob(x).sentiment[1])
    return df

def vader_analysis(org_df,column):
    df = org_df.copy()
    df = df.fillna('')
    pos_col = column + '_vader_pos'
    neg_col = column + '_vader_neg'
    df[pos_col] = df[column].apply(lambda x: analyser.polarity_scores(x)['pos'])
    df[neg_col] = df[column].apply(lambda x: analyser.polarity_scores(x)['neg'])
    return df

In [14]:
def apply_speech_patterns(org_df):
    df = org_df.copy()
    for index, row in df.iterrows():
        #### Positive parts of speech

        if row['beginning_eq_end_pos'] and row['beginning_eq_phrase_pos']:
            if row['phrase_eq_end_pos']:
                pos_speech_type = 'flat'
            elif row['phrase_gt_end_pos']:
                pos_speech_type = 'flat_high_to_low'
            else:
                pos_speech_type = 'flat_low_to_high'

        elif not row['beginning_eq_end_pos'] and not row['beginning_eq_phrase_pos'] and row['phrase_eq_end_pos']:
            if row['beginning_gt_end_pos']:
                pos_speech_type = 'high_to_flat'
            else:
                pos_speech_type = 'low_to_flat'   

        elif not row['beginning_eq_end_pos'] and row['beginning_eq_phrase_pos'] and not row['phrase_eq_end_pos']:
            if row['beginning_gt_end_pos']:
                pos_speech_type = 'flat_to_low'
            else:
                pos_speech_type = 'flat_to_high'

        elif row['beginning_gt_end_pos'] and row['beginning_gt_phrase_pos']:
            if row['phrase_gt_end_pos']:
                pos_speech_type = 'monotonic_decline'
            else:
                pos_speech_type = 'high_low_middle'

        elif row['beginning_gt_end_pos'] and not row['beginning_gt_phrase_pos']:
            if row['phrase_gt_end_pos']:
                pos_speech_type = 'middle_high_low'
            else:
                pos_speech_type = 'impossible'

        elif not row['beginning_gt_end_pos'] and row['beginning_gt_phrase_pos']:
            if row['phrase_gt_end_pos']:
                pos_speech_type = 'impossible_2'
            else:
                pos_speech_type = 'middle_low_high'

        elif not row['beginning_gt_end_pos'] and not row['beginning_gt_phrase_pos']:
            if row['phrase_gt_end_pos']:
                pos_speech_type = 'low_high_middle'
            else:
                pos_speech_type = 'monotonic_increase'

        ####### Negative parts of speech  
        if row['beginning_eq_end_neg'] and row['beginning_eq_phrase_neg']:
            if row['phrase_eq_end_neg']:
                neg_speech_type = 'flat'
            elif row['phrase_gt_end_neg']:
                neg_speech_type = 'flat_high_to_low'
            else:
                neg_speech_type = 'flat_low_to_high' 

        elif not row['beginning_eq_end_neg'] and not row['beginning_eq_phrase_neg'] and row['phrase_eq_end_neg']:
            if row['beginning_gt_end_neg']:
                neg_speech_type = 'high_to_flat'
            else:
                neg_speech_type = 'low_to_flat'

        elif not row['beginning_eq_end_neg'] and row['beginning_eq_phrase_neg'] and not row['phrase_eq_end_neg']:
            if row['beginning_gt_end_neg']:
                neg_speech_type = 'flat_to_low'
            else:
                neg_speech_type = 'flat_to_high'

        elif not row['beginning_eq_end_neg'] and not row['beginning_eq_phrase_neg']:
            if row['beginning_gt_end_neg']:
                neg_speech_type = 'high_to_flat'
            else:
                neg_speech_type = 'low_to_flat' 

        elif row['beginning_gt_end_neg'] and row['beginning_gt_phrase_neg']:
            if row['phrase_gt_end_neg']:
                neg_speech_type = 'monotonic_decline'
            else:
                neg_speech_type = 'high_low_middle'

        elif row['beginning_gt_end_neg'] and not row['beginning_gt_phrase_neg']:
            if row['phrase_gt_end_neg']:
                neg_speech_type = 'middle_high_low'
            else:
                neg_speech_type = 'impossible'

        elif not row['beginning_gt_end_neg'] and row['beginning_gt_phrase_neg']:
            if row['phrase_gt_end_neg']:
                neg_speech_type = 'impossible_2'
            else:
                neg_speech_type = 'middle_low_high'

        elif not row['beginning_gt_end_neg'] and not row['beginning_gt_phrase_neg']:
            if row['phrase_gt_end_neg']:
                neg_speech_type = 'low_high_middle'
            else:
                neg_speech_type = 'monotonic_increase'

        df.loc[index,'positive_speech_pattern'] = pos_speech_type
        df.loc[index,'negative_speech_pattern'] = neg_speech_type
    return df

In [15]:
def set_up_phrase_dictionaries(sentence_df,searches,investigation_cols):
    local_df = sentence_df.copy()
    search_rexes = create_rex_searches(searches)
    local_df = apply_rex_to_df(local_df,search_rexes)
    extracted_phrase_cols = phrase_columns_to_seed_word(local_df,investigation_cols)

    local_df,seed_phrase_count_cols = count_of_extracted_phrases_by_seed(local_df,extracted_phrase_cols)

    duplicated_indexes = dict()
    master_cols = dict()
    for seed, phrase_cols, count_col in seed_phrase_count_cols:
        
        # check if the phrase column also has the count_column
        # indexes_of_duplicated_phrases selects this column to do its masking
        if count_col not in phrase_cols:
            phrase_cols.append(count_col)
        indexes_of_duplicated_phrases = indexes_of_duplicated_phrases(local_df,phrase_cols,count_col)
        duplicated_indexes[seed] = indexes_of_duplicated_phrases
        
    seed_to_phrase_data_dict = dict()
    seed_to_sentence_and_phrases_dict = dict()
    for seed, phrase_cols, count_col in seed_phrase_count_cols:
        phrase_cols = [col for col in phrase_cols if '_count' not in col]
        seed_to_phrase_data,seed_to_sentence_and_phrases = reduce_and_associate_sentences_to_phrases(local_df,
                                                                             duplicated_indexes[seed],
                                                                             phrase_cols)
        seed_to_phrase_data_dict[seed] = seed_to_phrase_data
        seed_to_sentence_and_phrases_dict[seed] = seed_to_sentence_and_phrases
    
    return seed_to_sentence_and_phrases_dict, seed_to_phrase_data_dict, local_df

In [16]:
sentence_df = pd.DataFrame(columns=['sentence','source_id','speaker'])
for index, row in clinton.iterrows():
    source_id = row['year'] + row['month'] + row['date']
    source_txt = row['text']
    cleaned_txt = clean_data_for_processing(source_txt)
    cleaned_df = split_text_to_dataframe(cleaned_txt,source_id,'clinton')
    sentence_df = sentence_df.append(cleaned_df)

for index, row in trump.iterrows():
    source_id = row['year'] + row['month'] + row['date']
    source_txt = row['text']
    cleaned_txt = clean_data_for_processing(source_txt)
    cleaned_df = split_text_to_dataframe(cleaned_txt,source_id,'trump')
    sentence_df = sentence_df.append(cleaned_df)

In [17]:
sentence_df = sentence_df.reset_index()

In [18]:
sentence_df.columns = ['sentence_location','sentence','source_id','speaker']

In [19]:
searches = [('god',1,4,1,4),
            ('country',3,6,3,6),('our country',3,5,3,5),
            ('america',3,6,3,6),('our america',3,5,3,5),
            ('illegal',1,5,1,5),('illegal alien',2,5,2,5), 
            ('illegal immigrant',2,5,2,5),('immigrant',1,5,1,5),
            ('democracy',1,4,1,4),('huge',1,3,1,3),('great',1,3,1,3),
            ('amazing',1,3,1,3)]

In [20]:
seeding_columns = ['immigrant','god','america','democracy','huge','great','amazing','great']

In [21]:
seed_to_sentence_and_phrases_dict, seed_to_phrase_data_dict, master_df = set_up_seed_to_phrase_data_dictionaries(sentence_df,
                                                                             searches,
                                                                             seeding_columns)

In [22]:
blocking_words = ['was','is','an','by']

In [23]:
seed_to_sentence_and_phrases_dict.keys()

dict_keys(['immigrant', 'god', 'america', 'democracy', 'huge', 'great', 'amazing'])

In [24]:
split_phrases_dfs = []
for seed in seed_to_sentence_and_phrases_dict.keys():
    phrase_counted_df, sorted_splitting_phrases = common_phrase_with_count(seed_to_sentence_and_phrases_dict[seed],blocking_words)
    split_phrases_df = split_sentences_on_common_phrases(seed_to_sentence_and_phrases_dict[seed]\
                                                   .drop_duplicates(subset=['sentence']),sorted_splitting_phrases)
    split_phrases_dfs.append(split_phrases_df)

split_phrases_df = pd.concat(split_phrases_dfs)

In [26]:
split_phrases_df.sample(10)

Unnamed: 0,index,sentence,speaker,dupe_phrases,left_split,right_split,splitting_phrase
1358,44049,i have great foreign advisers,trump,have great,,foreign advisers,i have great
252,15384,i mean it was such a great thing,trump,a great thing,i mean it was,thing,such a great
1578,27880,we are living through the greatest jobs theft ...,trump,the great,,,
689,210,we're going to invest in america again,clinton,to invest in america,we're going,again,to invest in america
1487,18738,and by the way that's number one from the huma...,trump,that's great,and by the way that's number one from the huma...,,that's great
67,36833,we are going to provide school choice to every...,trump,america and we are,we are going to provide school choice to every...,end common core,income child in america and we are going to
937,11608,and i saw those great beautiful buildings that...,trump,great beautiful,and i saw those,beautiful buildings that were empty and rotti...,great
124,26654,this is -- i think you people are amazing,trump,are amazing,this is -- i think you,,people are amazing
1337,33409,we're doing great ohio and florida we're doing...,trump,doing great,,ohio and florida,we're doing great
1117,45230,one of them there 21 years and 15 years great ...,trump,great people,one of them there 21 years and 15 years,you know it's just a question of time,great people and


In [27]:
# split_phrases_df = text_blob_analysis(split_phrases_df,'left_split')
# split_phrases_df = text_blob_analysis(split_phrases_df,'right_split')
# split_phrases_df = text_blob_analysis(split_phrases_df,'splitting_phrase')
# split_phrases_df = text_blob_analysis(split_phrases_df,'sentence')

split_phrases_df = vader_analysis(split_phrases_df,'left_split')
split_phrases_df = vader_analysis(split_phrases_df,'right_split')
split_phrases_df = vader_analysis(split_phrases_df,'splitting_phrase')
split_phrases_df = vader_analysis(split_phrases_df,'sentence')

In [29]:
cross_comparisons = [('beginning_gt_end_neg',['left_split_vader_neg','right_split_vader_neg']),
                    ('beginning_gt_phrase_neg',['left_split_vader_neg','splitting_phrase_vader_neg']),
                    ('phrase_gt_end_neg',['splitting_phrase_vader_neg','right_split_vader_neg']),
                    ('beginning_gt_end_pos',['left_split_vader_pos','right_split_vader_pos']),
                    ('beginning_gt_phrase_pos',['left_split_vader_pos','splitting_phrase_vader_pos']),
                    ('phrase_gt_end_pos',['splitting_phrase_vader_pos','right_split_vader_pos'])]

eq_cross_comparisons = [('beginning_eq_end_neg',['left_split_vader_neg','right_split_vader_neg']),
                    ('beginning_eq_phrase_neg',['left_split_vader_neg','splitting_phrase_vader_neg']),
                    ('phrase_eq_end_neg',['splitting_phrase_vader_neg','right_split_vader_neg']),
                    ('beginning_eq_end_pos',['left_split_vader_pos','right_split_vader_pos']),
                    ('beginning_eq_phrase_pos',['left_split_vader_pos','splitting_phrase_vader_pos']),
                    ('phrase_eq_end_pos',['splitting_phrase_vader_pos','right_split_vader_pos'])]

In [31]:
for cross_comp in cross_comparisons:
    split_phrases_df[cross_comp[0]] = split_phrases_df[cross_comp[1][0]] > split_phrases_df[cross_comp[1][1]]

for cross_comp in eq_cross_comparisons:
    split_phrases_df[cross_comp[0]] = split_phrases_df[cross_comp[1][0]] == split_phrases_df[cross_comp[1][1]]

In [33]:
split_phrases_df = apply_speech_patterns(split_phrases_df)

split_phrases_df['full_speech_pattern'] = split_phrases_df['positive_speech_pattern'] + '__' + split_phrases_df['negative_speech_pattern']

In [35]:
split_phrases_df.sample(10)

Unnamed: 0,index,sentence,speaker,dupe_phrases,left_split,right_split,splitting_phrase,left_split_vader_pos,left_split_vader_neg,right_split_vader_pos,...,phrase_gt_end_pos,beginning_eq_end_neg,beginning_eq_phrase_neg,phrase_eq_end_neg,beginning_eq_end_pos,beginning_eq_phrase_pos,phrase_eq_end_pos,positive_speech_pattern,negative_speech_pattern,full_speech_pattern
1030,8914,great guys,trump,great guys,,,great guys,0.0,0.0,0.0,...,True,True,True,True,True,False,False,low_high_middle,flat,low_high_middle__flat
386,35544,that is how we will truly make america great a...,trump,america great again,that is how we will truly,,make america great again,0.367,0.0,0.0,...,True,True,True,True,False,False,False,middle_high_low,flat,middle_high_low__flat
1549,15949,we're living through the greatest jobs theft i...,trump,the great,,,,0.0,0.0,0.0,...,False,True,True,True,True,True,True,flat,flat,flat__flat
1007,35754,we have so many endorsements from such great g...,trump,great great,we have so many endorsements from such,winners,great great,0.0,0.0,1.0,...,False,True,True,True,False,False,True,low_to_flat,flat,low_to_flat__flat
608,27613,i want the entire corrupt washington establish...,trump,our great congressman,i want the entire corrupt washington establish...,over here; he's not corrupt; where's our cong...,our great congressman,0.157,0.0,0.0,...,True,True,True,True,False,False,False,middle_high_low,flat,middle_high_low__flat
1097,27726,we we because this is the great movement there...,trump,great movement,we we because this is,movement there's never been anything like thi...,the great,0.0,0.0,0.0,...,True,False,True,False,True,False,False,low_high_middle,flat_to_high,low_high_middle__flat_to_high
1588,3001,in fact i just ran across the story in las veg...,clinton,the great,,,,0.0,0.0,0.0,...,False,True,True,True,True,True,True,flat,flat,flat__flat
1074,16650,we need -- we lost a great justice justice scalia,trump,great justice,we need -- we,justice justice scalia,lost a great,0.0,0.0,0.872,...,False,True,False,False,False,False,False,monotonic_increase,low_high_middle,monotonic_increase__low_high_middle
19,10762,hillary clinton wants to have completely gover...,trump,freedoms of all america this is what,hillary clinton wants to have completely gover...,that's what she's aiming at,liberties and freedoms of all america this is ...,0.0,0.227,0.0,...,True,False,False,True,True,False,False,low_high_middle,flat,low_high_middle__flat
564,33024,so we have to make great deals before we do an...,trump,make great deals,so we have,before we do anything with cuba or anybody el...,to make great deals,0.0,0.0,0.0,...,True,True,True,True,True,False,False,low_high_middle,flat,low_high_middle__flat


In [36]:
pattern_trump = split_phrases_df[split_phrases_df['speaker'] == 'trump']
pattern_clinton = split_phrases_df[split_phrases_df['speaker'] == 'clinton']

In [37]:
pattern_clinton['full_speech_pattern'].value_counts() / pattern_clinton.shape[0]

0.7288801571709234

In [38]:
pattern_trump['full_speech_pattern'].value_counts() / pattern_trump.shape[0]

0.7459309249702263

In [39]:
pattern_trump['positive_speech_pattern'].value_counts() / pattern_trump.shape[0]

low_high_middle       0.682811
middle_high_low       0.187773
flat                  0.086542
monotonic_increase    0.021437
monotonic_decline     0.013894
low_to_flat           0.004764
high_to_flat          0.001588
flat_to_high          0.000794
middle_low_high       0.000397
Name: positive_speech_pattern, dtype: float64

In [40]:
pattern_trump['negative_speech_pattern'].value_counts() / pattern_trump.shape[0]

flat               0.865026
high_to_flat       0.061532
flat_to_high       0.054387
low_high_middle    0.010322
low_to_flat        0.008734
Name: negative_speech_pattern, dtype: float64

In [41]:
pattern_clinton['positive_speech_pattern'].value_counts() / pattern_clinton.shape[0]

low_high_middle       0.664047
middle_high_low       0.204322
flat                  0.078585
monotonic_increase    0.035363
monotonic_decline     0.013752
flat_to_high          0.001965
low_to_flat           0.001965
Name: positive_speech_pattern, dtype: float64

In [42]:
pattern_clinton['negative_speech_pattern'].value_counts() / pattern_clinton.shape[0]

flat               0.846758
high_to_flat       0.080550
flat_to_high       0.043222
low_high_middle    0.019646
low_to_flat        0.009823
Name: negative_speech_pattern, dtype: float64