In [4]:
import pandas as pd

In [5]:
pwd

'/home/jupyter/BATCH_03'

In [6]:
import os

In [7]:
import regex as re

In [8]:
from itertools import chain
from collections import Counter

In [9]:
import spacy
nlp = spacy.load("en_core_web_sm")
from spacy.matcher import Matcher

# VERB .*? NOUN Pattern

In [10]:
set_pos = set('ADJ ADP ADV AUX CONJ CCONJ DET INTJ NOUN NUM PART PRON PROPN PUNCT SCONJ SYM VERB X SPACE'.split())
set_pos
list_ignore = [i for i in list(set_pos) if i not in 'VERB NOUN'.split()]
pat_verb_noun = [
    {'POS':'VERB'},
    {'POS': {'IN': list_ignore}, 'OP': '*'},
    {'POS': 'NOUN'}
]

In [11]:
def get_pos_lemma_pattern(doc, pat_name='anything', pat_collection='list_of_dictionaries'):
    '''
    Given a spacy doc object; find the span according to the pattern given.
    ''' 
    matcher =  Matcher(vocab = nlp.vocab)
    matcher.add(f'{pat_name}', pat_collection)
    
    
    doc_match = matcher(doc)
    
    
    list_container = []
    for match in doc_match:
        start = match[1]
        end = match[2]
        result = doc[start:end]
        result = [i.lemma_ for i in result] # new!
        result = ' '.join(result)
        list_container.append(result)

    return list_container

In [12]:
def read_pickle_nlp(folder='Path_to_pkl_files', pos_pat = 'Spacy_Pos_Pattern', pat_name='anything', min_freq=7):
    '''
    Read pickled NLP files; and count the frequency of certain part-of-speech patter.
    Must give folder name in the form of dir/; It must be a local directory!
    Also give a valid spacy pos patter in this form: [{'POS':'VERB'}, {'POS': 'NOUN'}]
    '''
    
    input_pickle = [i for i in os.listdir(f'{folder}') if i.endswith('pkl')]

    pkl_paths = [f'./{folder}{i}' for i in os.listdir(f'{folder}') if i.endswith('pkl')]
    input_nlp = list(zip(pkl_paths, input_pickle))
    counter = 1
    container = []
    for i in input_nlp:
        print(f'''Reading: {i[1]}\t@{i[0]}\t{counter} of {len(input_nlp)}
        ''')
        
        df = pd.read_pickle(i[0])
        df['PAT'] = df['NLP'].apply(lambda x: get_pos_lemma_pattern(x, pat_name, pat_collection=[pos_pat]))
        df_result = list(chain(*list(df['PAT'])))
        container.append(df_result)
        print(f'''Finished: {i[1]}\t@{i[0]}\t{counter} of {len(input_nlp)}
        ''')
        print('-'*80)
        counter += 1
    container = list(chain(*container))
    dict_result = Counter(container).most_common()
    dict_result = dict(dict_result)
    dict_result = {k:v for k,v in dict_result.items() if v>=min_freq}
    df_result = pd.DataFrame(data={
        'PAT_V_NAME': dict_result.keys(),
        'FREQ': dict_result.values()
    })
    return df_result

In [13]:
file_result_name = input('Please enter name of the file')

Please enter name of the file BATCH_03_VERB_NOUN_PAT_THINK_TANKS


In [14]:
print(file_result_name)

BATCH_03_VERB_NOUN_PAT_THINK_TANKS


In [15]:
df = read_pickle_nlp('NLP_RESULT/', pos_pat=pat_verb_noun)

Reading: NLP_THINK_TANKS_10K_df_22.pkl	@./NLP_RESULT/NLP_THINK_TANKS_10K_df_22.pkl	1 of 9
        
Finished: NLP_THINK_TANKS_10K_df_22.pkl	@./NLP_RESULT/NLP_THINK_TANKS_10K_df_22.pkl	1 of 9
        
--------------------------------------------------------------------------------
Reading: NLP_THINK_TANKS_10K_df_23.pkl	@./NLP_RESULT/NLP_THINK_TANKS_10K_df_23.pkl	2 of 9
        
Finished: NLP_THINK_TANKS_10K_df_23.pkl	@./NLP_RESULT/NLP_THINK_TANKS_10K_df_23.pkl	2 of 9
        
--------------------------------------------------------------------------------
Reading: NLP_THINK_TANKS_10K_df_19.pkl	@./NLP_RESULT/NLP_THINK_TANKS_10K_df_19.pkl	3 of 9
        
Finished: NLP_THINK_TANKS_10K_df_19.pkl	@./NLP_RESULT/NLP_THINK_TANKS_10K_df_19.pkl	3 of 9
        
--------------------------------------------------------------------------------
Reading: NLP_THINK_TANKS_10K_df_20.pkl	@./NLP_RESULT/NLP_THINK_TANKS_10K_df_20.pkl	4 of 9
        
Finished: NLP_THINK_TANKS_10K_df_20.pkl	@./NLP_RESULT/NLP_THI

In [16]:
df

Unnamed: 0,PAT_V_NAME,FREQ
0,be a lot,2842
1,take place,1541
2,have a lot,1336
3,develop country,839
4,do a lot,732
...,...,...
8002,increase trust,7
8003,give some thought,7
8004,share my thought,7
8005,become a president,7


In [18]:
print(file_result_name)

BATCH_03_VERB_NOUN_PAT_THINK_TANKS


In [19]:
df.to_pickle(f'{file_result_name}.pkl', protocol=4)