# Re-arranging matrices

In [9]:
import os
import pickle

import numpy as np
import pandas as pd
import scipy.sparse as sp
from tqdm import tqdm

In [2]:
# Input folder (original co-occurrence matrices) and output folder (row-filtered copies).
# The defaults are the original machine-specific locations; set LSCD_COOC_DIR and/or
# LSCD_COOC_DIR_V2 in the environment to run the notebook on another machine.
# os.path.join(path, '') guarantees a trailing separator, which the later
# `folder + filename` concatenations rely on.
storage_folder = os.path.join(
    os.environ.get('LSCD_COOC_DIR', '/home/bastien/lscd/cooc_matrices_GNgrams/'), ''
)
new_storage_folder = os.path.join(
    os.environ.get('LSCD_COOC_DIR_V2', '/home/bastien/lscd/cooc_matrices_GNgrams_V2/'), ''
)

In [3]:
# Decade identifiers used in the matrix file names: 1890, 1900, ..., 1990.
DECADES = [1890 + 10 * step for step in range(11)]

# Part-of-speech tags of the target words ('A', 'N', 'V').
POS_TAGS = [
    'A',
    'N',
    'V',
]

# Side suffixes of the matrix file names (presumably left / right context -- TODO confirm).
SIDES = [
    'L',
    'R',
]

In [4]:
# Load the tab-separated list of target words; its 'ind' column becomes the row index.
final_targets_df = pd.read_csv(
    'targets_list.csv',
    sep='\t',
    index_col='ind',
)
final_targets_df  # last expression of the cell: displayed as its output

Unnamed: 0_level_0,word,POS,nb_cooc
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,aa,N,3454
6,ab,N,2866
7,abalone,N,480
8,abandon,V,417
9,abandoned,V,959
...,...,...,...
38569,zoologist,N,453
38570,zoologists,N,480
38571,zoology,N,711
38572,zulu,A,249


In [5]:
# Split the targets by part of speech and keep, for each tag, the index labels of its rows.
pos_groups = final_targets_df.groupby('POS')
inds = {tag: pos_groups.get_group(tag).index for tag in ('N', 'A', 'V')}

# Per-tag aliases of the same Index objects stored in `inds`.
nouns_ind = inds['N']
adjs_ind = inds['A']
verbs_ind = inds['V']

In [11]:
# 'word' column of each POS group.
nouns, adjs, verbs = (pos_groups.get_group(tag)['word'] for tag in ('N', 'A', 'V'))

# Persist each group as a plain Python list, one pickle file per POS tag.
for pickle_path, words in (
    ('./nouns_list.pickle', nouns),
    ('./adjs_list.pickle', adjs),
    ('./verbs_list.pickle', verbs),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(words.to_list(), f)

In [28]:
# For every decade / POS tag / side, keep only the rows of the stored co-occurrence
# matrix selected by inds[pos] and write the result, under the same file name, into
# new_storage_folder.
#
# NOTE(review): inds[pos] holds the 'ind' index labels of the targets table and is used
# here as row positions -- this assumes 'ind' is the row number in the original
# matrices; confirm.

# sp.save_npz does not create missing directories, so make sure the destination exists
# before the (long) loop starts.
os.makedirs(new_storage_folder, exist_ok=True)

for decade in tqdm(DECADES,desc='Rearranging '):
    for pos in POS_TAGS:
        rows = inds[pos]  # same row selector for both sides of this POS tag
        for side in SIDES:
            filename = f'cooc-matrix_{decade}_{pos}_{side}.npz'
            matrix = sp.load_npz(storage_folder + filename)[rows, :]
            sp.save_npz(new_storage_folder + filename, matrix)

Rearranging : 100%|██████████| 11/11 [05:22<00:00, 29.35s/it]


# Checking that no matrix row is empty

In [31]:
# Column count of the accumulators. Sparse addition requires equal shapes, so this has
# to match the column count of the matrices loaded below.
N_CONTEXT_COLUMNS = 5000


def aggregate_cooc(pos, n_rows, n_cols=N_CONTEXT_COLUMNS):
    """Sum the rearranged co-occurrence matrices of one POS tag.

    Every decade in DECADES and every side in SIDES is loaded from
    new_storage_folder and added to a running total.

    Parameters
    ----------
    pos : str
        POS tag used in the file names (e.g. 'A', 'N' or 'V').
    n_rows : int
        Number of rows of the stored matrices for this tag.
    n_cols : int, optional
        Number of columns of the stored matrices.

    Returns
    -------
    Sparse array of shape (n_rows, n_cols) holding the element-wise sum.
    """
    total = sp.csr_array((n_rows, n_cols))
    for decade in tqdm(DECADES, desc=f'Aggregating {pos}'):
        for side in SIDES:
            total = total + sp.load_npz(
                new_storage_folder + f'cooc-matrix_{decade}_{pos}_{side}.npz'
            )
    return total


aggreg_A_matrix = aggregate_cooc('A', len(adjs_ind))
aggreg_N_matrix = aggregate_cooc('N', len(nouns_ind))
aggreg_V_matrix = aggregate_cooc('V', len(verbs_ind))
        


Aggregating A: 100%|██████████| 11/11 [00:04<00:00,  2.47it/s]
Aggregating N: 100%|██████████| 11/11 [00:26<00:00,  2.42s/it]
Aggregating V: 100%|██████████| 11/11 [00:04<00:00,  2.64it/s]


In [34]:
def _count_nonempty_rows(aggregated):
    """Return how many rows of `aggregated` have a strictly positive sum."""
    row_totals = aggregated.sum(axis=1)
    return np.sum(row_totals > 0)


nzA = _count_nonempty_rows(aggreg_A_matrix)
nzN = _count_nonempty_rows(aggreg_N_matrix)
nzV = _count_nonempty_rows(aggreg_V_matrix)

# Per tag: number of targets, number of non-empty rows, number of empty rows.
for label, n_targets, n_nonempty in (
    ('A :', len(adjs_ind), nzA),
    ('N :', len(nouns_ind), nzN),
    ('V :', len(verbs_ind), nzV),
):
    print(label, n_targets, n_nonempty, n_targets - n_nonempty)

# Total number of non-empty rows over the three tags.
print(nzA + nzN + nzV)

A : 6475 6475 0
N : 20650 20650 0
V : 8357 8357 0
35482
