# Theory: Extract multiple phrases surrounding keywords into a dataset. Then, anonymize the extracted phrases by simply replacing the text matched by the keyword regular expression with nothing. After that, apply positive-negative sentiment analysis to each phrase to remove any potential connotations invoked by the initial phrasing.

## Extracting phrases, rather than analyzing whole sentences only, allows patterns of speech to be extracted from speeches and then compared for frequent usage.

## It also helps humans build a phrase-anonymizing map that goes beyond single words or a priori judgments about what is in the speeches.

In [1]:
import pandas as pd
import numpy as np
import re
import os
from collections import defaultdict, OrderedDict
import sys

In [2]:
# Show the current working directory (the value is displayed as the cell output).
os.getcwd()

'/Users/jameslittiebrant/one_off_python_stuff/speech_comparability/notebooks'

In [3]:
# Load the raw speech CSV files, one DataFrame per speaker.
# NOTE(review): both paths are absolute and machine-specific; the notebook
# cannot run on another machine without editing them.
clinton = pd.read_csv('/Users/jameslittiebrant/one_off_python_stuff/speech_comparability/data/Clinton_Speeches_raw.csv')
trump = pd.read_csv('/Users/jameslittiebrant/one_off_python_stuff/speech_comparability/data/Trump_Speeches_raw.csv')

In [4]:
# Zero-pad month and day to two digits and turn the year into text, so the
# three parts can be joined as strings.
for speeches in (clinton, trump):
    for date_part in ('month', 'date'):
        speeches[date_part] = speeches[date_part].apply(lambda value: "{:02d}".format(value))
    speeches['year'] = speeches['year'].astype(str)

In [5]:
def rex_phrase_creator(word,before_start,before_stop,after_start,after_stop):
    """Build named regex patterns capturing ``word`` plus neighbouring words.

    A "word before" is a run of non-space characters followed by one space;
    a "word after" is one space followed by a run of non-space characters.

    Args:
        word: Keyword (or phrase) placed verbatim in every pattern. It is not
            regex-escaped, so regex metacharacters in it keep their meaning.
        before_start, before_stop: ``range`` bounds for how many words before
            the keyword to capture.
        after_start, after_stop: ``range`` bounds for how many words after
            the keyword to capture.

    Returns:
        A list of ``(column_name, pattern)`` tuples, in this order: every
        before/after combination (``<word>_b_<i>_a_<j>``), then after-only
        patterns (``<word>_a_<j>``), then before-only patterns
        (``<word>_b_<i>``). Each pattern is wrapped in one capture group.
    """
    # Raw strings: '\S' inside a normal literal is an invalid escape sequence.
    one_word_before = r'\S* '
    one_word_after = r' \S*'

    before_rex = [('b_{}'.format(i), one_word_before * i)
                  for i in range(before_start, before_stop)]
    after_rex = [('a_{}'.format(i), one_word_after * i)
                 for i in range(after_start, after_stop)]

    phrases = []

    # Keyword with context on both sides.
    for b_col, b_rex in before_rex:
        for a_col, a_rex in after_rex:
            column_name = word + '_' + b_col + '_' + a_col
            phrases.append((column_name, '({}{}{})'.format(b_rex, word, a_rex)))

    # Keyword followed by context only.
    for a_col, a_rex in after_rex:
        phrases.append((word + '_' + a_col, '({}{})'.format(word, a_rex)))

    # Keyword preceded by context only.
    for b_col, b_rex in before_rex:
        phrases.append((word + '_' + b_col, '({}{})'.format(b_rex, word)))

    return phrases

In [6]:
def clean_data_for_processing(text_str):
    """Split raw speech text into a list of lower-cased sentences.

    Newlines become spaces (so words on adjacent lines are not glued
    together), backslashes and commas are removed, and ``!`` / ``?`` are
    treated like ``.`` as sentence terminators.

    Args:
        text_str: The raw text of one speech.

    Returns:
        A list of sentences, each lower-cased and stripped of surrounding
        spaces; empty fragments are dropped.
    """
    normalized = (text_str.replace('\n', ' ')
                          .replace('\\', '')
                          .replace(',', '')
                          .replace('!', '.')
                          .replace('?', '.'))

    # Only spaces are stripped, matching the original per-sentence cleanup.
    fragments = (piece.strip(' ') for piece in normalized.split('.'))
    return [fragment.lower() for fragment in fragments if fragment]

In [7]:
def split_text_to_dataframe(txt,source_id,speaker):
    """Wrap sentences in a DataFrame tagged with their source and speaker.

    Args:
        txt: Sequence of sentences; becomes the ``sentence`` column.
        source_id: Value repeated in every row of the ``source_id`` column.
        speaker: Value repeated in every row of the ``speaker`` column.

    Returns:
        A DataFrame with columns ``sentence``, ``source_id`` and ``speaker``.
    """
    sentences = pd.DataFrame(txt, columns=['sentence'])
    return sentences.assign(source_id=source_id, speaker=speaker)

In [8]:
def create_rex_searches(rex_args):
    """Expand every search specification into its regex patterns.

    Args:
        rex_args: Iterable of argument tuples, each unpacked into
            ``rex_phrase_creator``.

    Returns:
        One flat list with the results of all ``rex_phrase_creator`` calls,
        in input order.
    """
    return [named_rex
            for search in rex_args
            for named_rex in rex_phrase_creator(*search)]

def apply_rex_to_df(df,rexes):
    """Extract each regex from the lower-cased sentences into its own column.

    Args:
        df: DataFrame with a ``sentence`` text column. It is not modified.
        rexes: Iterable of ``(column_name, pattern)`` pairs; each pattern is
            expected to contain exactly one capture group.

    Returns:
        A copy of ``df`` with one added column per pair, holding the first
        match of the pattern in the sentence or NaN when nothing matches.
    """
    apply_df = df.copy()
    # Lower-casing does not depend on the pattern, so do it once, not per regex.
    lowered = apply_df['sentence'].str.lower()
    for col, rex in rexes:
        # expand=False always yields a Series for a single capture group,
        # independent of the pandas version's default.
        apply_df[col] = lowered.str.extract(rex, expand=False)
    return apply_df

# Building automated functions for duplicate phrase extraction

In [9]:
column_searches = []
def key_to_columns(df,column_searches):
    """Find, for every key, the DataFrame columns whose name matches it.

    Args:
        df: DataFrame whose column names are searched.
        column_searches: Iterable of keys; each key is used as a regular
            expression and searched for anywhere in a column name.

    Returns:
        A list of ``(key, matching_column_names)`` tuples in key order.
    """
    column_selection = []
    for key in column_searches:
        pattern = re.compile(r'{}'.format(key))
        matching = [name for name in df.columns if pattern.search(name)]
        column_selection.append((key, matching))
    return column_selection

def key_cols_to_count(df,column_selection):
    """Count, per row and key, how many extraction columns hold a match.

    For every key two columns are (re)created on a copy of ``df``:
    ``<key>_count`` with the number of non-null extraction columns and
    ``<key>_mentioned``, which is True when that count is above zero.

    Count/mentioned columns left over from an earlier run are dropped and
    are never counted as extraction columns, so re-running the function on
    its own output gives the same counts.

    Args:
        df: DataFrame holding the extraction columns. It is not modified.
        column_selection: Iterable of ``(key, column_names)`` pairs.

    Returns:
        ``(counted_df, count_columns)`` where ``count_columns`` is a list of
        ``(key, extraction_column_names, count_column_name)`` tuples.
    """
    column_selection = list(column_selection)
    local_df = df.copy()

    # Names this function generates; they must never be counted as matches.
    derived_names = set()
    for key, _ in column_selection:
        derived_names.add(key + '_count')
        derived_names.add(key + '_mentioned')

    count_columns = []
    for key, cols in column_selection:
        count_col_name = key + '_count'
        mention_name = key + '_mentioned'

        # Drop stale results from the frame we actually write to.
        stale = [name for name in (count_col_name, mention_name)
                 if name in local_df.columns]
        if stale:
            local_df = local_df.drop(stale, axis=1)

        phrase_cols = [col for col in cols if col not in derived_names]
        count_columns.append((key, phrase_cols, count_col_name))
        local_df[count_col_name] = local_df[phrase_cols].notnull().sum(axis=1)
        local_df[mention_name] = local_df[count_col_name] > 0

    return local_df, count_columns

def duplicate_splitting(df,selection_columns,count_col):
    """Collect row indexes of values that a speaker repeats.

    Rows are grouped by ``speaker``; within each group only rows with
    ``count_col`` above zero are considered. For every selected column, the
    index labels of all non-null values that occur more than once inside
    the same speaker's group are gathered.

    Args:
        df: DataFrame with a ``speaker`` column, the selected columns and
            ``count_col``. It is not modified.
        selection_columns: Names of the columns to check for repeats.
        count_col: Name of the count column used to filter rows.

    Returns:
        A ``defaultdict(set)`` mapping each column name to the set of index
        labels of repeated values, merged over all speakers.
    """
    needed_columns = ['speaker'] + list(selection_columns)
    if count_col not in needed_columns:
        # The filter below needs the count column even when it is not selected.
        needed_columns.append(count_col)

    duplicated_phrases = defaultdict(set)
    for speaker, group_df in df[needed_columns].groupby('speaker'):
        group_df = group_df[group_df[count_col] > 0]
        for col in selection_columns:
            # Look only at this speaker's rows, not at the whole frame.
            values = group_df[col].dropna()
            is_repeated = values.duplicated(keep=False)
            duplicated_phrases[col].update(is_repeated[is_repeated].index)
    return duplicated_phrases

def split_duplicates_by_column(df,duplicated_phrases,selection_columns):
    """Build per-column and combined DataFrames of the duplicated phrases.

    Args:
        df: DataFrame with ``sentence``, ``speaker`` and the selected
            columns. It is not modified.
        duplicated_phrases: Mapping from column name to a collection of
            index labels of ``df`` to keep for that column.
        selection_columns: Names of the columns to process.

    Returns:
        ``(duplicated_splits, master_dupes)``. ``duplicated_splits`` maps
        each column name to the selected rows (``sentence``, ``speaker`` and
        that column) sorted by the column's value. ``master_dupes`` stacks
        all of those frames with the phrase column renamed ``dupe_phrases``.
    """
    master_columns = ['sentence','speaker','dupe_phrases']
    duplicated_splits = dict()
    master_parts = []
    for col in selection_columns:
        # .loc does not accept a set; a sorted list also makes the row order
        # reproducible (the sort below is stable).
        row_labels = sorted(duplicated_phrases[col])
        duplicated_data = df.loc[row_labels,['sentence','speaker',col]]\
                            .sort_values(by=col,kind='mergesort')
        duplicated_splits[col] = duplicated_data
        if not duplicated_data.empty:
            master_data = duplicated_data.copy()
            master_data.columns = master_columns
            master_parts.append(master_data)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    if master_parts:
        master_dupes = pd.concat(master_parts)
    else:
        master_dupes = pd.DataFrame(columns=master_columns)

    return duplicated_splits, master_dupes

In [10]:
def token_anonymizer(df,drop_columns=False,fill_back_columns=False):
    """Remove the search token from every extracted phrase.

    The token of a column is the leading run of letters/spaces in its name
    up to the first underscore (``'god'`` for ``'god_b_1_a_1'``). In such a
    column every occurrence of the token, with an optional space on each
    side, is replaced by one space; values that are missing or end up as an
    empty string become NaN. Columns whose name yields no token are left
    untouched.

    Args:
        df: DataFrame with the extraction columns. It is not modified.
        drop_columns: Optional column names to drop before processing. Any
            remaining column that yields a token must hold text, so
            non-text columns such as counts should be listed here.
        fill_back_columns: Optional column names of ``df`` to put back in
            front of the processed columns.

    Returns:
        The processed copy of the DataFrame.
    """
    if drop_columns:
        local_df = df.drop(drop_columns,axis=1).copy()
    else:
        local_df = df.copy()

    token_search = re.compile(r'([a-zA-Z ]*)_')

    for col in list(local_df.columns):
        found = token_search.search(col)
        # Skip names without an underscore or with an empty token instead of
        # crashing on None / replacing every space.
        if found is None or not found.group(1):
            continue
        token_pattern = r'( ?{} ?)'.format(found.group(1))
        # regex=True is explicit: newer pandas defaults to literal matching.
        local_df[col] = local_df[col].fillna('')\
                                     .str.replace(token_pattern, ' ', regex=True)\
                                     .replace('',np.nan)

    if fill_back_columns:
        local_df = pd.concat([df[fill_back_columns],local_df],axis=1)

    return local_df

In [11]:
# Build one row per sentence for every speech of both speakers. The frames
# are collected in a list and concatenated once, because DataFrame.append
# copies the whole frame on every call and was removed in pandas 2.0.
sentence_frames = []
for speaker_name, speeches in (('clinton', clinton), ('trump', trump)):
    for index, row in speeches.iterrows():
        # year + zero-padded month + zero-padded day identifies the speech.
        source_id = row['year'] + row['month'] + row['date']
        source_txt = row['text']
        cleaned_txt = clean_data_for_processing(source_txt)
        cleaned_df = split_text_to_dataframe(cleaned_txt,source_id,speaker_name)
        if not cleaned_df.empty:
            sentence_frames.append(cleaned_df)

if sentence_frames:
    sentence_df = pd.concat(sentence_frames)
else:
    sentence_df = pd.DataFrame(columns=['sentence','source_id','speaker'])

In [12]:
# Move the current index into a regular column and give the frame a fresh index.
sentence_df = sentence_df.reset_index()

In [13]:
# Rename all four columns positionally; the first one becomes 'sentence_location'.
sentence_df.columns = ['sentence_location','sentence','source_id','speaker']

In [14]:
# Each tuple is (word, before_start, before_stop, after_start, after_stop),
# the argument list that create_rex_searches unpacks into rex_phrase_creator.
# Every entry appears once: a repeated entry only repeats the extraction work
# and writes the same columns again.
searches = [
    ('god',1,4,1,4),
    ('country',3,6,3,6),
    ('our country',3,5,3,5),
    ('america',3,6,3,6),
    ('our america',3,5,3,5),
    ('illegal',1,5,1,5),
    ('illegal alien',2,5,2,5),
    ('illegal immigrant',2,5,2,5),
    ('immigrant',1,5,1,5),
    ('democracy',1,4,1,4),
    ('huge',1,3,1,3),
    ('great',1,3,1,3),
    ('amazing',1,3,1,3),
]

# Use the "searches" to create multiple extraction phrases and then apply to the dataframe

In [15]:
%%time
# Expand every search tuple into its named regex patterns, then extract each
# pattern from the sentences into its own column of master_df.
search_rexes = create_rex_searches(searches)
master_df = apply_rex_to_df(sentence_df,search_rexes)

CPU times: user 1min 20s, sys: 351 ms, total: 1min 20s
Wall time: 1min 20s


# key_to_columns finds all the auto-generated regex columns whose names match each given key and stores them in the column_sel variable. The keys can mirror the searches or be a selected subset of them to investigate.

In [16]:
# Keys to investigate; each key is listed once so that no key is processed
# twice by the steps that iterate over column_sel.
column_sel = key_to_columns(master_df,['immigrant','god','america','democracy','huge','great','amazing'])

# With the columns now found, key_cols_to_count counts how many of the regular-expression columns matched in each spoken sentence, and also creates a boolean _mentioned column.

In [17]:
# Add the <key>_count and <key>_mentioned columns to master_df and keep the
# (key, columns, count column name) tuples for the next steps.
master_df,count_cols = key_cols_to_count(master_df,column_sel)

# duplicate_splitting returns the indexes of the rows in the input dataframe where a specific regex phrase occurs more than once

In [18]:
# For every key, collect the row indexes of repeated values per column.
dupe_selections = dict()
# NOTE(review): master_cols is not used in the visible part of this file.
master_cols = dict()
for capture_col, selection_cols, count_col in count_cols:
    # duplicate_splitting filters rows on count_col, so the column must be part
    # of the selection. NOTE: append mutates the list stored inside count_cols.
    if count_col not in selection_cols:
        selection_cols.append(count_col)
    return_data = duplicate_splitting(master_df,selection_cols,count_col)
    dupe_selections[capture_col] = return_data

# Display the collected indexes for one 'america' extraction column.
dupe_selections['america']['america_b_3_a_3']

# split_duplicates_by_column returns a dictionary of the different duplicate dataframes (one per column) and master_dupes, which is a single dataframe of all the data

In [19]:
# Per key: the per-column duplicate frames and the combined duplicate frame.
dupe_splits_dict = dict()
master_dupes_dict = dict()
for key,selection_columns,count_col in count_cols:
    # Leave out any column whose name contains '_count'; only phrase columns are split.
    selections = [col for col in selection_columns if '_count' not in col]
    dupe_splits,master_dupes = split_duplicates_by_column(master_df,dupe_selections[key],selections)
    dupe_splits_dict[key] = dupe_splits
    master_dupes_dict[key] = master_dupes

In [20]:
# Combined duplicate-phrase frame for the 'huge' key.
speakers_huge = master_dupes_dict['huge']

In [21]:
# Subsets of the duplicated 'immigrant' phrases, one per term that the
# phrase must also contain.
immigrant_dupes = master_dupes_dict['immigrant']

immigrant_deported = immigrant_dupes[immigrant_dupes['dupe_phrases'].str.contains('deported')]

immigrant_gang = immigrant_dupes[immigrant_dupes['dupe_phrases'].str.contains('gang')]

immigrant_murder = immigrant_dupes[immigrant_dupes['dupe_phrases'].str.contains('murdered')]

immigrant_criminal = immigrant_dupes[immigrant_dupes['dupe_phrases'].str.contains('criminal')]

immigrant_illegal = immigrant_dupes[immigrant_dupes['dupe_phrases'].str.contains('illegal')]

In [22]:
# Phrases whose first word is one of these are discarded by common_phrase_with_count.
blocking_words = ['was','is','an','by']

In [23]:
def common_phrase_with_count(df,blocking_words):
    """Select the distinct duplicated phrases, ordered longest first.

    Per sentence, repeated phrases are dropped and phrases whose first word
    is in ``blocking_words`` are removed. Across all sentences each phrase
    is then kept once, and only phrases containing more than one space are
    retained.

    Args:
        df: DataFrame with ``sentence`` and ``dupe_phrases`` columns. It is
            not modified.
        blocking_words: Words that disqualify a phrase when the phrase
            starts with one of them.

    Returns:
        ``(phrase_counted_df, splitting_phrases)``: the retained rows with an
        added ``phrase_length`` column (number of spaces in the phrase),
        sorted by that column in descending order, and the list of their
        phrases in the same order with surrounding whitespace stripped.
        For input without any sentence an empty frame and an empty list are
        returned.
    """
    kept_groups = []
    for sentence, group in df.groupby('sentence'):
        # reset_index keeps the original row label as a regular column.
        group = group.reset_index().drop_duplicates(subset=['dupe_phrases'])
        first_words = group['dupe_phrases'].str.extract(r'(\w+) ', expand=False)
        # .copy() so that adding a column does not write into a slice.
        group = group.loc[~first_words.isin(blocking_words)].copy()
        group['phrase_length'] = group['dupe_phrases'].str.count(' ')
        kept_groups.append(group)

    if not kept_groups:
        return pd.DataFrame(columns=list(df.columns) + ['phrase_length']), []

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    phrase_counted_df = pd.concat(kept_groups)
    phrase_counted_df = phrase_counted_df.drop_duplicates(subset=['dupe_phrases'])
    phrase_counted_df = phrase_counted_df[phrase_counted_df['phrase_length'] > 1]

    # Sort once (stable) and derive both return values from the same order.
    phrase_counted_df = phrase_counted_df.sort_values(by='phrase_length',
                                                      ascending=False,
                                                      kind='mergesort')
    splitting_phrases = [x.strip() for x in phrase_counted_df['dupe_phrases'].values]

    return phrase_counted_df, splitting_phrases

def split_sentences_on_common_phrases(org_df,splitting_phrases):
    """Split every sentence on the first common phrase it contains.

    The phrases are tried in the given order and the first one found in the
    sentence is used. No rows are removed: the caller decides which
    sentences (e.g. de-duplicated ones) are passed in.

    Args:
        org_df: DataFrame with a ``sentence`` column. It is not modified.
        splitting_phrases: Phrases to try, in priority order. Empty strings
            are skipped because ``str.split('')`` raises ``ValueError``.

    Returns:
        A copy of ``org_df`` with three added columns: ``left_split`` (text
        before the phrase), ``right_split`` (text between the first and the
        next occurrence of the phrase, or up to the end of the sentence) and
        ``splitting_phrase``. Rows without any matching phrase hold NaN in
        all three; the columns are present even when no row matches.
    """
    df = org_df.copy()
    usable_phrases = [phrase for phrase in splitting_phrases if phrase]

    left_parts, right_parts, used_phrases = [], [], []
    for sentence in df['sentence']:
        left, right, used = np.nan, np.nan, np.nan
        for phrase in usable_phrases:
            pieces = sentence.split(phrase)
            if len(pieces) > 1:
                left, right, used = pieces[0], pieces[1], phrase
                break
        left_parts.append(left)
        right_parts.append(right)
        used_phrases.append(used)

    # Whole-column assignment works row by row, whatever the index looks like.
    df['left_split'] = left_parts
    df['right_split'] = right_parts
    df['splitting_phrase'] = used_phrases
    return df

## POC of splitting technique

# Since split_sentences_on_common_phrases is dataframe-agnostic, it must be fed only the sentences that you want analyzed. Thus, drop duplicate sentences from the feed dataframe that was used to create the split_phrases before passing it in.

In [30]:
# Derive the splitting phrases from the duplicated 'immigrant' phrases that contain 'deported'.
phrase_counted_df, split_phrases = common_phrase_with_count(immigrant_deported,blocking_words)

In [31]:
# Split each distinct sentence on the first matching phrase; the result is displayed as the cell output.
split_sentences_on_common_phrases(immigrant_deported.drop_duplicates(subset=['sentence']),split_phrases)

Unnamed: 0,sentence,speaker,dupe_phrases,left_split,right_split,splitting_phrase
35207,where kate steinle was murdered by an illegal ...,trump,by an illegal immigrant and deported,where kate steinle was,probably more than five times,murdered by an illegal immigrant and deported
41382,hillary supports total open borders -- that me...,trump,by an illegal immigrant and deported,hillary supports total open borders -- that me...,,murdered by an illegal immigrant and deported
30570,hillary supports totally open borders there go...,trump,by an illegal immigrant deported at,hillary supports totally open borders there go...,times,murdered by an illegal immigrant deported at l...
28085,and strongly supports sanctuary cities like sa...,trump,by an illegal immigrant deported at,and strongly supports sanctuary cities like sa...,times,murdered by an illegal immigrant deported at l...
42777,and strongly supports sanctuary cities like sa...,trump,by an illegal immigrant deported at,and strongly supports sanctuary cities like sa...,times,illegal immigrant deported at least five
19613,there goes your country -- and strongly suppor...,trump,by an illegal immigrant deported at,there goes your country -- and strongly suppor...,times,murdered by an illegal immigrant deported at l...
38886,and strongly supports sanctuary cities like sa...,trump,by an illegal immigrant who was deported,and strongly supports sanctuary cities like sa...,probably more than five times,murdered by an illegal immigrant who was deported
34074,hillary clinton supports totally open borders ...,trump,by an illegal immigrant who was deported,hillary clinton supports totally open borders ...,least five times,murdered by an illegal immigrant who was depor...
27675,hillary supports totally opened borders there ...,trump,by an illegal immigrant who was deported,hillary supports totally opened borders there ...,least five times,murdered by an illegal immigrant who was depor...
26572,where incredible kate steinle was murdered by ...,trump,by an illegal immigrant who had been deported,where incredible kate steinle was,as least five times,murdered by an illegal immigrant who had been ...
