# Identifying collocations

<br>

**Language: Python**

This notebook looks at frequency statistics from COCA to identify potential collocations to change in the original and normalized text versions. See dissertation section 6.1.1.1.

**Notebook contents:**
1. [Initial setup](#1.-Initial-setup)
2. [Find collocations](#2.-Find-collocations)

## 1. Initial setup

In [1]:
# Import necessary modules

import pandas as pd
import pprint
from IPython.core.interactiveshell import InteractiveShell
import joblib

In [2]:
# Set preferred notebook format

%pprint # Turn off pretty printing
InteractiveShell.ast_node_interactivity = "all" # Show all output, not just last item
pd.set_option('display.max_columns', 999) # Allow viewing of all columns

Pretty printing has been turned OFF


**Note:** As described in the [README.md]('../README.md'), The frequency information from COCA referenced here is not freely available but can be purchased at https://corpus.byu.edu/coca. Without this data you will be able to see a few rows of these dataframes, but will not be able to run the code yourself.

In [3]:
# Load COCA collocation dataframe

col_df = joblib.load('../../COCA_data/COCA_2020_collocation_df.pkl')
col_df.head()

Unnamed: 0,freq,MI,collocation,tscore
0,2202969,1.85,"((there, e), (be, v))",1435.82
1,1016384,1.67,"((what, d), (do, v))",973.52
2,714570,1.62,"((know, v), (do, v))",815.36
3,714292,1.62,"((do, v), (know, v))",815.19
4,661399,2.24,"((out, i), (of, i))",793.01


In [4]:
# Load texts for analysis (Original and Normalized)

B1_df = joblib.load('../docs/B1_cols.pkl')
B2_df = joblib.load('../docs/B2_cols.pkl')
C1_df = joblib.load('../docs/C1_cols.pkl')

In [5]:
# Combine the three dfs

texts = pd.concat([B1_df, B2_df, C1_df]).reset_index(drop=True)
texts

Unnamed: 0,text_id,lemmas_CLAWS,possible_cols
0,B1_orig,"[(I, p), (disagree, v), (that, d), (point, n),...","[((i, p), (disagree, v)), ((disagree, v), (tha..."
1,B1_norm,"[(I, p), (disagree, v), (that, d), (point, n),...","[((i, p), (disagree, v)), ((disagree, v), (tha..."
2,B2_orig,"[(I, p), (greatly, r), (support, v), (the, a),...","[((i, p), (greatly, r)), ((greatly, r), (suppo..."
3,B2_norm,"[(I, p), (greatly, r), (support, v), (the, a),...","[((i, p), (greatly, r)), ((greatly, r), (suppo..."
4,C1_orig,"[(I, p), (do, v), (agree, v), (to, i), (the, a...","[((i, p), (do, v)), ((do, v), (agree, v)), ((a..."
5,C1_norm,"[(I, p), (do, v), (agree, v), (to, i), (the, a...","[((i, p), (do, v)), ((do, v), (agree, v)), ((a..."


## 2. Find collocations

In [6]:
# Create set of all collocations in COCA

all_cols = set(col_df.collocation.to_list())
len(all_cols)

1992910

In [7]:
# Create column of cols in each text

texts['cols'] = texts.possible_cols.apply(lambda row: [x for x in row if x in all_cols])
texts

Unnamed: 0,text_id,lemmas_CLAWS,possible_cols,cols
0,B1_orig,"[(I, p), (disagree, v), (that, d), (point, n),...","[((i, p), (disagree, v)), ((disagree, v), (tha...","[((our, a), (country, n)), ((their, a), (child..."
1,B1_norm,"[(I, p), (disagree, v), (that, d), (point, n),...","[((i, p), (disagree, v)), ((disagree, v), (tha...","[((our, a), (country, n)), ((their, a), (child..."
2,B2_orig,"[(I, p), (greatly, r), (support, v), (the, a),...","[((i, p), (greatly, r)), ((greatly, r), (suppo...","[((because, i), (of, i)), ((such, i), (as, i))..."
3,B2_norm,"[(I, p), (greatly, r), (support, v), (the, a),...","[((i, p), (greatly, r)), ((greatly, r), (suppo...","[((such, i), (as, i)), ((hard, j), (work, n)),..."
4,C1_orig,"[(I, p), (do, v), (agree, v), (to, i), (the, a...","[((i, p), (do, v)), ((do, v), (agree, v)), ((a...","[((expose, v), (to, i)), ((life, n), (eg, r)),..."
5,C1_norm,"[(I, p), (do, v), (agree, v), (to, i), (the, a...","[((i, p), (do, v)), ((do, v), (agree, v)), ((a...","[((expose, v), (to, i)), ((life, n), (eg, r)),..."


### Prompt collocations

In [8]:
# Prepare prompt text

prompt = "Children who are brought up in families that do not have large amounts of money are better prepared to deal with the problems of adult life than children brought up by wealthy parents. To what extent do you agree or disagree with this opinion?"
prompt = prompt.lower().replace('.','').replace('?','')
prompt

'children who are brought up in families that do not have large amounts of money are better prepared to deal with the problems of adult life than children brought up by wealthy parents to what extent do you agree or disagree with this opinion'

In [9]:
# Tag with CLAWS7 at http://ucrel-api.lancaster.ac.uk/claws/free.html

prompt_POS = 'children_NN2 who_PNQS are_VBR brought_VVN up_RP in_II families_NN2 that_CST do_VD0 not_XX have_VHI large_JJ amounts_NN2 of_IO money_NN1 are_VBR better_RRR prepared_VVN to_TO deal_VVI with_IW the_AT problems_NN2 of_IO adult_NN1 life_NN1 than_CSN children_NN2 brought_VVN up_RP by_II wealthy_JJ parents_NN2 to_II what_DDQ extent_NN1 do_VD0 you_PPY agree_VVI or_CC disagree_VVI with_IW this_DD1 opinion_NN1'
prompt_POS = [x.split('_') for x in prompt_POS.split(' ')]
prompt_POS = [(x[0],x[1][0].lower()) for x in prompt_POS]
prompt_POS

[('children', 'n'), ('who', 'p'), ('are', 'v'), ('brought', 'v'), ('up', 'r'), ('in', 'i'), ('families', 'n'), ('that', 'c'), ('do', 'v'), ('not', 'x'), ('have', 'v'), ('large', 'j'), ('amounts', 'n'), ('of', 'i'), ('money', 'n'), ('are', 'v'), ('better', 'r'), ('prepared', 'v'), ('to', 't'), ('deal', 'v'), ('with', 'i'), ('the', 'a'), ('problems', 'n'), ('of', 'i'), ('adult', 'n'), ('life', 'n'), ('than', 'c'), ('children', 'n'), ('brought', 'v'), ('up', 'r'), ('by', 'i'), ('wealthy', 'j'), ('parents', 'n'), ('to', 'i'), ('what', 'd'), ('extent', 'n'), ('do', 'v'), ('you', 'p'), ('agree', 'v'), ('or', 'c'), ('disagree', 'v'), ('with', 'i'), ('this', 'd'), ('opinion', 'n')]

In [10]:
# Manually lemmatize due to short length

prompt_POS_lem = [('child', 'n'), ('who', 'p'), ('be', 'v'), ('bring', 'v'), ('up', 'r'), ('in', 'i'), ('family', 'n'), ('that', 'c'), ('do', 'v'), ('not', 'x'), ('have', 'v'), ('large', 'j'), ('amount', 'n'), ('of', 'i'), ('money', 'n'), ('be', 'v'), ('better', 'r'), ('prepare', 'v'), ('to', 't'), ('deal', 'v'), ('with', 'i'), ('the', 'a'), ('problem', 'n'), ('of', 'i'), ('adult', 'n'), ('life', 'n'), ('than', 'c'), ('child', 'n'), ('bring', 'v'), ('up', 'r'), ('by', 'i'), ('wealthy', 'j'), ('parent', 'n'), ('to', 'i'), ('what', 'd'), ('extent', 'n'), ('do', 'v'), ('you', 'p'), ('agree', 'v'), ('or', 'c'), ('disagree', 'v'), ('with', 'i'), ('this', 'd'), ('opinion', 'n')]

In [11]:
# Extract potential collocations in span 4

def find_cols(lemma_list):
    col_list = list(zip(lemma_list,lemma_list[1:]))+list(zip(lemma_list,lemma_list[2:]))\
    +list(zip(lemma_list,lemma_list[3:]))+list(zip(lemma_list,lemma_list[4:]))
    return col_list

prompt_cols = find_cols(prompt_POS_lem)

In [12]:
# Remove prompts cols from dataframe cols

texts.possible_cols = texts.possible_cols.apply(lambda row: [x for x in row if x not in prompt_cols])

### Investigate the automated extracted collocations

In [13]:
%pprint

# B1 original

len(set(texts.iloc[0,3]))
set(texts.iloc[0,3])

Pretty printing has been turned ON


12

{(('age', 'n'), ('child', 'n')),
 (('around', 'i'), ('country', 'n')),
 (('child', 'n'), ('age', 'n')),
 (('child', 'n'), ('behavior', 'n')),
 (('child', 'n'), ('family', 'n')),
 (('money', 'n'), ('buy', 'v')),
 (('money', 'n'), ('money', 'n')),
 (('other', 'j'), ('hand', 'n')),
 (('our', 'a'), ('country', 'n')),
 (('pocket', 'n'), ('money', 'n')),
 (('their', 'a'), ('child', 'n')),
 (('their', 'a'), ('parent', 'n'))}

In [14]:
# And if we consider only ones made up of n, v, adj, adv, or preps

lexical = ['n','v','r','j','i']

set([x for x in texts.iloc[0,3] if x[0][1] in lexical and x[1][1] in lexical])

{(('age', 'n'), ('child', 'n')),
 (('around', 'i'), ('country', 'n')),
 (('child', 'n'), ('age', 'n')),
 (('child', 'n'), ('behavior', 'n')),
 (('child', 'n'), ('family', 'n')),
 (('money', 'n'), ('buy', 'v')),
 (('money', 'n'), ('money', 'n')),
 (('other', 'j'), ('hand', 'n')),
 (('pocket', 'n'), ('money', 'n'))}

In [15]:
# B1 normalized

len(set(texts.iloc[1,3]))
set(texts.iloc[1,3])

12

{(('around', 'i'), ('country', 'n')),
 (('as', 'i'), ('journalist', 'n')),
 (('child', 'n'), ('age', 'n')),
 (('child', 'n'), ('family', 'n')),
 (('disagree', 'v'), ('with', 'i')),
 (('money', 'n'), ('buy', 'v')),
 (('other', 'j'), ('hand', 'n')),
 (('our', 'a'), ('country', 'n')),
 (('pocket', 'n'), ('money', 'n')),
 (('positive', 'j'), ('behavior', 'n')),
 (('their', 'a'), ('child', 'n')),
 (('their', 'a'), ('parent', 'n'))}

In [16]:
# B2 original

len(set(texts.iloc[2,3]))
set([x for x in texts.iloc[2,3] if x[0][1] in lexical and x[1][1] in lexical])

40

{(('authentic', 'j'), ('self', 'n')),
 (('basic', 'j'), ('necessity', 'n')),
 (('because', 'i'), ('of', 'i')),
 (('blind', 'v'), ('by', 'i')),
 (('buy', 'v'), ('car', 'n')),
 (('buy', 'v'), ('clothes', 'n')),
 (('buy', 'v'), ('expensive', 'j')),
 (('car', 'n'), ('expensive', 'j')),
 (('care', 'v'), ('about', 'i')),
 (('certain', 'j'), ('such', 'i')),
 (('child', 'n'), ('child', 'n')),
 (('child', 'n'), ('grow', 'v')),
 (('clothes', 'n'), ('toy', 'n')),
 (('commonly', 'r'), ('express', 'v')),
 (('do', 'v'), ('care', 'v')),
 (('do', 'v'), ('know', 'v')),
 (('expensive', 'j'), ('clothes', 'n')),
 (('expensive', 'j'), ('toy', 'n')),
 (('eye', 'n'), ('blind', 'v')),
 (('face', 'v'), ('problem', 'n')),
 (('family', 'n'), ('wealthy', 'j')),
 (('grow', 'v'), ('up', 'r')),
 (('hard', 'j'), ('work', 'n')),
 (('kind', 'n'), ('of', 'i')),
 (('know', 'v'), ('how', 'r')),
 (('make', 'v'), ('money', 'n')),
 (('money', 'n'), ('money', 'n')),
 (('money', 'n'), ('spending', 'n')),
 (('such', 'i'), ('as'

In [17]:
# C1 original and normalized (the same collocations)

len(set(texts.iloc[4,3]))
set([x for x in texts.iloc[4,3] if x[0][1] in lexical and x[1][1] in lexical])

66

{(('adult', 'n'), ('eg', 'r')),
 (('age', 'n'), ('child', 'n')),
 (('age', 'n'), ('eg', 'r')),
 (('agree', 'v'), ('statement', 'n')),
 (('as', 'i'), ('example', 'n')),
 (('because', 'i'), ('of', 'i')),
 (('bill', 'n'), ('gate', 'n')),
 (('child', 'n'), ('eg', 'r')),
 (('child', 'n'), ('family', 'n')),
 (('child', 'n'), ('impoverished', 'j')),
 (('child', 'n'), ('parent', 'n')),
 (('child', 'n'), ('poor', 'j')),
 (('child', 'n'), ('teach', 'v')),
 (('childhood', 'n'), ('eg', 'r')),
 (('contribute', 'v'), ('to', 'i')),
 (('deal', 'v'), ('problem', 'n')),
 (('deal', 'v'), ('with', 'i')),
 (('direct', 'j'), ('contribution', 'n')),
 (('early', 'j'), ('age', 'n')),
 (('early', 'j'), ('eg', 'r')),
 (('economic', 'j'), ('situation', 'n')),
 (('economic', 'j'), ('social', 'j')),
 (('eg', 'r'), ('in', 'i')),
 (('eg', 'r'), ('learn', 'v')),
 (('eg', 'r'), ('may', 'v')),
 (('eg', 'r'), ('on', 'i')),
 (('eg', 'r'), ('work', 'v')),
 (('expose', 'v'), ('to', 'i')),
 (('family', 'n'), ('income', 'n'))

[Back to top](#Identifying-collocations)