In [1]:
from pathlib import Path

In [2]:
from datetime import datetime
def time_now():
    """Print the current wall-clock time and return it.

    The time is printed as ``HH:MM:SS.ffffff`` after the label
    ``Current Time =``. The returned value is the full ``datetime``
    object that the printed text was formatted from.
    """
    timestamp = datetime.now()
    print("Current Time =", timestamp.strftime("%H:%M:%S.%f"))
    return timestamp


# Announce the run and keep the start timestamp returned by time_now().
print('Starting...')
start = time_now()


import swifter

import pandas as pd

import regex as re

from itertools import chain

from collections import Counter


import os

Starting...
Current Time = 20:16:45.202538


In [3]:
def find_pkl_files(root_dir):
    """Return the sorted paths of every ``.pkl`` file under ``root_dir``.

    The tree is walked recursively with ``os.walk``; each path is the walked
    directory joined with the file name. A missing or empty directory yields
    an empty list.
    """
    # Flatten and sort once, after the whole walk. Sorting/flattening inside
    # the walk loop turns the accumulator into a list of strings after the
    # first directory, so the next directory's chain(*...) splits those paths
    # into single characters.
    return sorted(
        os.path.join(root, name)
        for root, _folders, names in os.walk(root_dir)
        for name in names
        if name.endswith('.pkl')
    )


pkl_files = find_pkl_files('NLP_JS/')

In [4]:
def most_common_item(list_01):
    """Return the most frequent element of ``list_01``.

    When several elements share the highest count, the one encountered
    first in ``list_01`` is returned.

    Raises:
        IndexError: if ``list_01`` is empty.
    """
    # most_common(1) selects the top entry directly instead of sorting
    # every distinct value just to read the first one.
    return Counter(list_01).most_common(1)[0][0]

In [5]:
# One row per discovered pickle path, held in a single FILE column.
df = pd.DataFrame(data={'FILE': pkl_files})

In [6]:
# TYPE is what remains of each path after the last ``BATCH_<digits>_`` marker,
# cut at the first '.pkl'.
# The pattern is a raw string: in a plain literal '\d' is an invalid escape
# sequence, which Python warns about and plans to reject.
df['TYPE'] = df.FILE.apply(lambda x: re.split(r'BATCH_\d+_', x)[-1].split('.pkl')[0])

In [7]:
# Collapse to one row per TYPE, gathering that type's file paths into a list.
df = (
    df
    .groupby(['TYPE'], as_index=False)
    .agg({'FILE': lambda paths: paths.tolist()})
)

In [8]:
# Display the frame: one row per TYPE with its list of pickle files.
df

Unnamed: 0,TYPE,FILE
0,PAT_NOUN,"[NLP_JS/BATCH_01_PAT_NOUN.pkl, NLP_JS/BATCH_02..."
1,PAT_NOUN_VERB_PHRASE,"[NLP_JS/BATCH_01_PAT_NOUN_VERB_PHRASE.pkl, NLP..."
2,PAT_N_N,"[NLP_JS/BATCH_01_PAT_N_N.pkl, NLP_JS/BATCH_02_..."
3,PAT_N_P_V,"[NLP_JS/BATCH_01_PAT_N_P_V.pkl, NLP_JS/BATCH_0..."
4,PAT_VERB_NOUN_PHRASE,"[NLP_JS/BATCH_01_PAT_VERB_NOUN_PHRASE.pkl, NLP..."
5,POS_ADJECTIVES,"[NLP_JS/BATCH_01_POS_ADJECTIVES.pkl, NLP_JS/BA..."
6,POS_ADVERBS,"[NLP_JS/BATCH_01_POS_ADVERBS.pkl, NLP_JS/BATCH..."
7,POS_NOUNS,"[NLP_JS/BATCH_01_POS_NOUNS.pkl, NLP_JS/BATCH_0..."
8,POS_VERBS,"[NLP_JS/BATCH_01_POS_VERBS.pkl, NLP_JS/BATCH_0..."


In [9]:
# Map each TYPE to its list of pickle file paths.
dict_patterns_and_pos = {
    type_name: paths
    for type_name, paths in zip(df['TYPE'], df['FILE'])
}

In [10]:
# Spot-check the file list collected for one type.
dict_patterns_and_pos['PAT_NOUN']

['NLP_JS/BATCH_01_PAT_NOUN.pkl',
 'NLP_JS/BATCH_02_PAT_NOUN.pkl',
 'NLP_JS/BATCH_03_PAT_NOUN.pkl',
 'NLP_JS/BATCH_04_PAT_NOUN.pkl',
 'NLP_JS/BATCH_05_PAT_NOUN.pkl',
 'NLP_JS/BATCH_06_PAT_NOUN.pkl',
 'NLP_JS/BATCH_07_PAT_NOUN.pkl',
 'NLP_JS/BATCH_08_PAT_NOUN.pkl',
 'NLP_JS/BATCH_09_PAT_NOUN.pkl',
 'NLP_JS/BATCH_10_PAT_NOUN.pkl',
 'NLP_JS/BATCH_11_PAT_NOUN.pkl',
 'NLP_JS/BATCH_12_PAT_NOUN.pkl',
 'NLP_JS/BATCH_13_PAT_NOUN.pkl',
 'NLP_JS/BATCH_14_PAT_NOUN.pkl']

In [11]:
# Keep only the entries whose type name contains 'PAT_'.
dict_patterns = {
    type_name: paths
    for type_name, paths in dict_patterns_and_pos.items()
    if 'PAT_' in type_name
}

In [12]:
# Number of pattern types selected.
len(dict_patterns)

5

In [13]:
# Keep only the entries whose type name contains 'POS_'.
dict_pos = {
    type_name: paths
    for type_name, paths in dict_patterns_and_pos.items()
    if 'POS_' in type_name
}

In [14]:
# Number of POS types selected.
len(dict_pos)

4

In [15]:
# Ask for the output directory name. Slashes and dots are stripped from the
# answer BEFORE the directory is created, so the directory that exists on disk
# is the same one the later cells write to via f'./{agg_dir}/...'.
# (Creating it from the raw answer first would leave e.g. 'out/run1' on disk
# while agg_dir became 'outrun1'.)
agg_dir = input('''Where to save results?
Give a name for the directory!
''')
agg_dir = re.sub(r'[\/\.]+', '', agg_dir)
Path(agg_dir).mkdir(parents=True, exist_ok=True)
print(f'Results are in {agg_dir} folder')

Where to save results?
Give a name for the directory!
 AGG_DFS


Results are in AGG_DFS folder


In [16]:
# Confirm the sanitised directory name that later cells will use.
agg_dir

'AGG_DFS'

In [17]:
# Load and display the first file listed for POS_ADJECTIVES to see what one batch looks like.
pd.read_pickle(dict_pos['POS_ADJECTIVES'][0])

Unnamed: 0,TEXT,FREQ
0,same,3808
1,different,3671
2,sure,2945
3,new,2925
4,little,2858
...,...,...
2362,j,1
2363,import.meta.vt,1
2364,swept,1
2365,emojis,1


In [18]:
# Column names used for the pattern frames, plus an empty frame carrying them.
pat_columns = 'TEXT	LEMMA	POS'.split()
df_pat_main = pd.DataFrame(data={column: [] for column in pat_columns})


# SPEECH PATTERNS AGGREGATION

In [19]:
# Aggregate every speech-pattern type across its batch files.
# For each type: read all batches (dropping rows with missing values), group
# by LEMMA, count occurrences (FREQ), keep the most common TEXT and POS for
# each lemma, keep rows at or above the 95th FREQ percentile, and save the
# result as AGG_<type>.pkl inside agg_dir.
for k, v in dict_patterns.items():
    print(f'Processing: {k}')
    file_name = f'AGG_{k}.pkl'

    # Collect the batches and concatenate once. DataFrame.append was removed
    # in pandas 2.0, and appending inside the loop re-copies the growing frame
    # on every batch. The empty frame goes first so the TEXT/LEMMA/POS columns
    # are present exactly as they were with the append-based version.
    batch_frames = [pd.DataFrame(data={i: [] for i in pat_columns})]
    for i in v:
        print(f'Reading: {i}')
        df_batch = pd.read_pickle(i).dropna()
        batch_frames.append(df_batch)
        print(f'Finished appending: {i}')
    df_pat_main = pd.concat(batch_frames, ignore_index=True)
    del batch_frames  # do not keep a second reference to every batch alive
    print(f'Finished: {k}')

    # One row per lemma, with all of its surface forms and POS strings in lists.
    df_pat_main = df_pat_main.groupby(['LEMMA'], as_index=False).agg(
        {'TEXT': lambda x: x.tolist(), 'POS': lambda x: x.tolist()}
    )
    # FREQ must be computed while TEXT is still a list (its length is the count).
    df_pat_main['FREQ'] = df_pat_main.TEXT.apply(len)

    # Reduce the lists to their single most common value.
    df_pat_main['POS'] = df_pat_main.POS.apply(most_common_item)
    df_pat_main['TEXT'] = df_pat_main.TEXT.apply(most_common_item)

    df_pat_main = df_pat_main.sort_values(['FREQ'], ascending=False).reset_index(drop=True)

    # Keep only rows whose frequency reaches the 95th percentile.
    quantile = df_pat_main.FREQ.quantile(0.95)
    print(f'Quantile 95 = {quantile}')
    df_pat_main = df_pat_main[df_pat_main.FREQ >= quantile].reset_index(drop=True)

    df_pat_main.to_pickle(f'./{agg_dir}/{file_name}', protocol=4)
    print('-------------------------------------------------------')

Processing: PAT_NOUN
Reading: NLP_JS/BATCH_01_PAT_NOUN.pkl
Finished appending: NLP_JS/BATCH_01_PAT_NOUN.pkl
Reading: NLP_JS/BATCH_02_PAT_NOUN.pkl
Finished appending: NLP_JS/BATCH_02_PAT_NOUN.pkl
Reading: NLP_JS/BATCH_03_PAT_NOUN.pkl
Finished appending: NLP_JS/BATCH_03_PAT_NOUN.pkl
Reading: NLP_JS/BATCH_04_PAT_NOUN.pkl
Finished appending: NLP_JS/BATCH_04_PAT_NOUN.pkl
Reading: NLP_JS/BATCH_05_PAT_NOUN.pkl
Finished appending: NLP_JS/BATCH_05_PAT_NOUN.pkl
Reading: NLP_JS/BATCH_06_PAT_NOUN.pkl
Finished appending: NLP_JS/BATCH_06_PAT_NOUN.pkl
Reading: NLP_JS/BATCH_07_PAT_NOUN.pkl
Finished appending: NLP_JS/BATCH_07_PAT_NOUN.pkl
Reading: NLP_JS/BATCH_08_PAT_NOUN.pkl
Finished appending: NLP_JS/BATCH_08_PAT_NOUN.pkl
Reading: NLP_JS/BATCH_09_PAT_NOUN.pkl
Finished appending: NLP_JS/BATCH_09_PAT_NOUN.pkl
Reading: NLP_JS/BATCH_10_PAT_NOUN.pkl
Finished appending: NLP_JS/BATCH_10_PAT_NOUN.pkl
Reading: NLP_JS/BATCH_11_PAT_NOUN.pkl
Finished appending: NLP_JS/BATCH_11_PAT_NOUN.pkl
Reading: NLP_JS/BATCH_

In [20]:
# Inspect one saved aggregate. Read it from the directory the results were
# written to (agg_dir) instead of a hard-coded 'AGG_DFS', so the cell still
# works when a different directory name was entered.
pd.read_pickle(f'./{agg_dir}/AGG_PAT_N_P_V.pkl')

Unnamed: 0,LEMMA,TEXT,POS,FREQ
0,width and height,width and height,NOUN CCONJ NOUN,550
1,true or false,true or false,ADJ CCONJ ADJ,463
2,top and bottom,top and bottom,NOUN CCONJ NOUN,396
3,x and y,x and y,PROPN CCONJ PROPN,342
4,bit and piece,bits and pieces,NOUN CCONJ NOUN,312
...,...,...,...,...
74,git and github,git and github,NOUN CCONJ PROPN,41
75,left and right hand side,left and right hand sides,ADJ CCONJ ADJ NOUN NOUN,40
76,back-end development,back-end development,ADJ PUNCT NOUN NOUN,40
77,program and see,program and see,NOUN CCONJ VERB,38


In [21]:
#del df_batch, df_pat_main

# SPEECH POS AGGREGATION

In [22]:
# Remove the names left over from the pattern aggregation loop.
del df_pat_main, df_batch

In [23]:
# Column names used for the POS frames, plus an empty frame carrying them.
pos_columns = 'TEXT	FREQ'.split()
df_pos_main = pd.DataFrame(data={column: [] for column in pos_columns})

In [24]:
# Aggregate every part-of-speech type across its batch files.
# For each type: read all batches, sum FREQ per TEXT, keep rows at or above
# the 95th FREQ percentile, and save the result as AGG_<type>.pkl inside
# agg_dir.
for k, v in dict_pos.items():
    print(f'Processing: {k}')
    file_name = f'AGG_{k}.pkl'

    # The empty template uses pos_columns (TEXT, FREQ). Building it from
    # pat_columns dragged unused LEMMA/POS columns through the whole
    # concatenation.
    # Batches are collected and concatenated once: DataFrame.append was
    # removed in pandas 2.0, and appending inside the loop re-copies the
    # growing frame on every batch.
    batch_frames = [pd.DataFrame(data={i: [] for i in pos_columns})]
    for i in v:
        print(f'Reading: {i}')
        df_batch = pd.read_pickle(i)
        batch_frames.append(df_batch)
        print(f'Finished appending: {i}')
    df_pos_main = pd.concat(batch_frames, ignore_index=True)
    del batch_frames  # do not keep a second reference to every batch alive
    print(f'Finished: {k}')

    # Total frequency of each token across all batches.
    df_pos_main = df_pos_main.groupby(['TEXT'], as_index=False).agg({'FREQ': 'sum'})

    df_pos_main = df_pos_main.sort_values(['FREQ'], ascending=False).reset_index(drop=True)

    # Keep only rows whose frequency reaches the 95th percentile.
    quantile = df_pos_main.FREQ.quantile(0.95)
    print(f'Quantile 95 = {quantile}')
    df_pos_main = df_pos_main[df_pos_main.FREQ >= quantile].reset_index(drop=True)
    df_pos_main.to_pickle(f'./{agg_dir}/{file_name}', protocol=4)
    print('-------------------------------------------------------')

Processing: POS_ADJECTIVES
Reading: NLP_JS/BATCH_01_POS_ADJECTIVES.pkl
Finished appending: NLP_JS/BATCH_01_POS_ADJECTIVES.pkl
Reading: NLP_JS/BATCH_02_POS_ADJECTIVES.pkl
Finished appending: NLP_JS/BATCH_02_POS_ADJECTIVES.pkl
Reading: NLP_JS/BATCH_03_POS_ADJECTIVES.pkl
Finished appending: NLP_JS/BATCH_03_POS_ADJECTIVES.pkl
Reading: NLP_JS/BATCH_04_POS_ADJECTIVES.pkl
Finished appending: NLP_JS/BATCH_04_POS_ADJECTIVES.pkl
Reading: NLP_JS/BATCH_05_POS_ADJECTIVES.pkl
Finished appending: NLP_JS/BATCH_05_POS_ADJECTIVES.pkl
Reading: NLP_JS/BATCH_06_POS_ADJECTIVES.pkl
Finished appending: NLP_JS/BATCH_06_POS_ADJECTIVES.pkl
Reading: NLP_JS/BATCH_07_POS_ADJECTIVES.pkl
Finished appending: NLP_JS/BATCH_07_POS_ADJECTIVES.pkl
Reading: NLP_JS/BATCH_08_POS_ADJECTIVES.pkl
Finished appending: NLP_JS/BATCH_08_POS_ADJECTIVES.pkl
Reading: NLP_JS/BATCH_09_POS_ADJECTIVES.pkl
Finished appending: NLP_JS/BATCH_09_POS_ADJECTIVES.pkl
Reading: NLP_JS/BATCH_10_POS_ADJECTIVES.pkl
Finished appending: NLP_JS/BATCH_10_PO

In [25]:
# Inspect the saved noun aggregate. Read it from the directory the results
# were written to (agg_dir) instead of a hard-coded 'AGG_DFS', so the cell
# still works when a different directory name was entered.
pd.read_pickle(f'./{agg_dir}/AGG_POS_NOUNS.pkl')

Unnamed: 0,TEXT,FREQ
0,thing,97811.0
1,function,63915.0
2,time,54732.0
3,value,50993.0
4,code,50263.0
...,...,...
1564,flexibility,276.0
1565,ingredient,276.0
1566,tier,276.0
1567,tonight,276.0


In [31]:
# Inspect the saved verb-noun-phrase aggregate. Read it from the directory
# the results were written to (agg_dir) instead of a hard-coded 'AGG_DFS', so
# the cell still works when a different directory name was entered.
pd.read_pickle(f'./{agg_dir}/AGG_PAT_VERB_NOUN_PHRASE.pkl')

Unnamed: 0,LEMMA,TEXT,POS,FREQ
0,make sure,make sure,VERB ADJ,13199
1,take a look,take a look,VERB DET NOUN,3910
2,make sense,makes sense,VERB NOUN,2883
3,be a lot,'s a lot,VERB DET NOUN,2740
4,do the same thing,do the same thing,VERB DET ADJ NOUN,2162
...,...,...,...,...
1612,make our code,make our code,VERB PRON NOUN,53
1613,kind of neat,kind of neat,ADV ADV ADJ,53
1614,be different way,are different ways,VERB ADJ NOUN,53
1615,happen behind the scene,happening behind the scenes,VERB ADP DET NOUN,53
