In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
# Load the tab-separated derivational pairs. The file is read with
# header=None, so the six column labels are attached after reading.
raw_pairs_path = '../data/fra.derivational.v1.tsv'
pair_columns = ['form1', 'form2', 'form1_pos', 'form2_pos', 'morpheme', 'type']
deriv_data = pd.read_csv(raw_pairs_path, header=None, sep='\t')
deriv_data.columns = pair_columns

In [3]:
# Keep only the pairs whose part of speech differs between form1 and form2.
# .copy() detaches the filtered frame from the unfiltered one, so the later
# in-place column assignment (deriv_data['transition'] = ...) does not raise
# pandas' SettingWithCopyWarning.
deriv_data = deriv_data.query('form1_pos != form2_pos').copy()

In [4]:
# Count the pairs for every (form1_pos, form2_pos) combination.
pair_counts = deriv_data.groupby(['form1_pos', 'form2_pos'])['form1'].count()
# Total number of pairs per source POS, aligned row-for-row with pair_counts.
# transform() keeps the two-level (form1_pos, form2_pos) index as is, whereas
# groupby().apply() treats group keys differently across pandas versions and
# needed a follow-up reset_index() to repair the index.
source_totals = pair_counts.groupby(level='form1_pos').transform('sum')
# Percentage (two decimals) of each target POS within its source POS group.
transitions = (100 * pair_counts / source_totals).round(2)

In [5]:
# Show the percentage of each target POS within every source POS group
# (values are rounded to two decimals, so each group sums to roughly 100).
transitions

form1_pos  form2_pos
J          N            51.91
           R            35.70
           U             0.01
           V            12.38
N          J            72.24
           R             1.10
           U             0.17
           V            26.50
R          J            14.71
           N            55.88
           U             2.94
           V            26.47
U          J            14.63
           N            63.41
           R             4.88
           V            17.07
V          J            22.48
           N            77.37
           R             0.13
           U             0.01
Name: form1, dtype: float64

In [6]:
# Build a readable "<source POS> -> <target POS>" label for every pair;
# the labels are tallied in the next cell.
source_pos = deriv_data['form1_pos']
target_pos = deriv_data['form2_pos']
deriv_data['transition'] = source_pos + ' -> ' + target_pos

In [7]:
# Absolute number of pairs per transition label (value_counts sorts by
# descending frequency by default).
deriv_data['transition'].value_counts()

transition
V -> N    10971
N -> J    10404
J -> N     3942
N -> V     3816
V -> J     3188
J -> R     2711
J -> V      940
N -> R      158
U -> N       26
N -> U       24
R -> N       19
V -> R       18
R -> V        9
U -> V        7
U -> J        6
R -> J        5
U -> R        2
V -> U        2
R -> U        1
J -> U        1
Name: count, dtype: int64

In [10]:
# Tally how often each POS tag occurs on the source side (form1_pos) and on
# the target side (form2_pos) of the pairs in deriv_data.
source_counts, target_counts = (
    deriv_data[pos_column].value_counts()
    for pos_column in ('form1_pos', 'form2_pos')
)



In [11]:
# Number of pairs in which each POS tag is the source (form1_pos).
source_counts

form1_pos
N    14402
V    14179
J     7594
U       41
R       34
Name: count, dtype: int64

In [12]:
# Number of pairs in which each POS tag is the target (form2_pos).
target_counts

form2_pos
N    14958
J    13603
V     4772
R     2889
U       28
Name: count, dtype: int64