# Longman Communication 3000

In [78]:
import re
import pandas as pd
import numpy as np
from pprint import pprint
from IPython.display import display

## Read file

In [58]:
# Load the raw word list: one entry per line, with surrounding whitespace
# removed and blank lines discarded.
filename = 'comm3000.txt'
# The context manager closes the file handle as soon as the list is built,
# instead of leaving it open until the garbage collector gets to it.
with open(filename) as fh:
    lines = [stripped for stripped in (raw.strip() for raw in fh) if stripped]
print(len(lines))
pprint(lines[-20:])

3583
['writing n S2, W3',
 'wrong adj S1, W1',
 'wrong adv S2',
 'yard n S2, W2',
 'yeah adv S1',
 'year n S1, W1',
 'yellow adj S2, W3',
 'yep adv S1',
 'yes adv S1, W1',
 'yesterday adv S1, W1',
 'yet adv S1, W1',
 'yet conjunction W2',
 'you pron S1, W1',
 'young adj S1, W1',
 'youngster n S3',
 'your determiner S1, W1',
 'yours pron S1, W3',
 'yourself pron S1, W2',
 'youth n S2, W2',
 'zone n W3']


In [59]:
# Tokenise every entry: remove the commas, then split on whitespace so each
# entry becomes a list of tokens.
data = list(map(lambda entry: entry.replace(',', '').split(), lines))
pprint(data[-20:])

[['writing', 'n', 'S2', 'W3'],
 ['wrong', 'adj', 'S1', 'W1'],
 ['wrong', 'adv', 'S2'],
 ['yard', 'n', 'S2', 'W2'],
 ['yeah', 'adv', 'S1'],
 ['year', 'n', 'S1', 'W1'],
 ['yellow', 'adj', 'S2', 'W3'],
 ['yep', 'adv', 'S1'],
 ['yes', 'adv', 'S1', 'W1'],
 ['yesterday', 'adv', 'S1', 'W1'],
 ['yet', 'adv', 'S1', 'W1'],
 ['yet', 'conjunction', 'W2'],
 ['you', 'pron', 'S1', 'W1'],
 ['young', 'adj', 'S1', 'W1'],
 ['youngster', 'n', 'S3'],
 ['your', 'determiner', 'S1', 'W1'],
 ['yours', 'pron', 'S1', 'W3'],
 ['yourself', 'pron', 'S1', 'W2'],
 ['youth', 'n', 'S2', 'W2'],
 ['zone', 'n', 'W3']]


In [60]:
# Number of tokens in each entry, in entry order; show the last ten counts
# followed by the largest and the smallest count.
lni = list(map(len, data))
print(lni[-10:], max(lni), min(lni), sep='\n')

[4, 3, 4, 4, 3, 4, 4, 4, 4, 3]
5
3


In [61]:
# Histogram of entry sizes: token count -> number of entries with that count.
lni = {}
for tokens in data:
    size = len(tokens)
    lni[size] = lni.get(size, 0) + 1

print(lni)

{3: 1173, 4: 2399, 5: 11}


In [62]:
# Inspect the entries that have exactly five tokens.
target = list(filter(lambda tokens: len(tokens) == 5, data))
pprint(target)

[['according', 'to', 'prep', 'S2', 'W1'],
 ['all', 'right', 'adj', 'S1', 'W2'],
 ['credit', 'card', 'n', 'S3', 'W3'],
 ['each', 'other', 'pron', 'S1', 'W1'],
 ['mobile', 'phone', 'n', 'S2', 'W3'],
 ['no', 'one', 'pron', 'S1', 'W2'],
 ['of', 'course', 'adv', 'S1', 'W1'],
 ['one', 'another', 'pron', 'S3', 'W3'],
 ['ought', 'to', 'modv', 'S1', 'W1'],
 ['round', 'adv', 'prep', 'S2', 'W2'],
 ['used', 'to', 'modv', 'S1', 'W2']]


In [68]:
# Part-of-speech tags recognised when parsing an entry.
# Legend for the abbreviated tags:
#   adj = adjective, adv = adverb, auxv = auxiliary verb, modv = modal verb,
#   n = noun, phrv = phrasal verb, prep = preposition, pron = pronoun, v = verb.
# The remaining tags are spelled out in full.
pos = [
    'adj', 'adv', 'auxv', 'conjunction',
    'determiner', 'interjection', 'modv', 'n',
    'number', 'phrv', 'predeterminer', 'prefix',
    'prep', 'pron', 'suffix', 'v',
]

In [82]:
def _parse_entry(tokens, pos_tags):
    """Turn one tokenised entry into a ``(word, pos, spoken, written)`` tuple.

    Args:
        tokens: the tokens of one entry, e.g. ``['of', 'course', 'adv', 'S1', 'W1']``.
        pos_tags: collection of known part-of-speech tags.

    Returns:
        A 4-tuple ``(word, pos, spoken, written)``. ``word`` is the headword
        (tokens joined by a space), ``pos`` is the part-of-speech tag (several
        tags joined by ``', '`` when the entry lists more than one), and
        ``spoken`` / ``written`` are the numbers taken from the ``S<n>`` /
        ``W<n>`` tokens, or 0 when the entry has no such token.
    """
    # The tag follows the headword: position 1 for a one-word headword,
    # otherwise position 2 is assumed (two-word headwords).
    start = 1 if tokens[1] in pos_tags else 2

    # An entry may list several tags in a row (e.g. 'round adv prep S2 W2').
    # Consume all of them so the extra tags are not silently dropped.
    stop = start + 1
    while stop < len(tokens) and tokens[stop] in pos_tags:
        stop += 1

    word = ' '.join(tokens[:start])
    tag = ', '.join(tokens[start:stop])

    spoken, written = 0, 0
    for band in tokens[stop:]:
        # Read everything after the letter so multi-digit bands also parse.
        if band[0] == 'S':
            spoken = int(band[1:])
        elif band[0] == 'W':
            written = int(band[1:])

    return (word, tag, spoken, written)


res = [_parse_entry(x, pos) for x in data]

# NOTE: this rebinds `data` from token lists to parsed tuples, so the cell
# must run exactly once after the tokenising cell; running it again on its own
# output fails because the tuples hold integers, not 'S<n>'/'W<n>' strings.
data = res
pprint(data[-20:])

[('writing', 'n', 2, 3),
 ('wrong', 'adj', 1, 1),
 ('wrong', 'adv', 2, 0),
 ('yard', 'n', 2, 2),
 ('yeah', 'adv', 1, 0),
 ('year', 'n', 1, 1),
 ('yellow', 'adj', 2, 3),
 ('yep', 'adv', 1, 0),
 ('yes', 'adv', 1, 1),
 ('yesterday', 'adv', 1, 1),
 ('yet', 'adv', 1, 1),
 ('yet', 'conjunction', 0, 2),
 ('you', 'pron', 1, 1),
 ('young', 'adj', 1, 1),
 ('youngster', 'n', 3, 0),
 ('your', 'determiner', 1, 1),
 ('yours', 'pron', 1, 3),
 ('yourself', 'pron', 1, 2),
 ('youth', 'n', 2, 2),
 ('zone', 'n', 0, 3)]


In [83]:
# Tabulate the parsed entries, labelling the rows 1..N instead of 0..N-1.
column_names = ['Word', 'POS', 'Spoken', 'Written']
row_labels = np.arange(1, len(data) + 1)
df = pd.DataFrame(data, index=row_labels, columns=column_names)
display(df)

Unnamed: 0,Word,POS,Spoken,Written
1,a,determiner,1,1
2,abandon,v,0,3
3,ability,n,2,1
4,able,adj,1,1
5,about,prep,1,1
6,about,adv,1,1
7,above,adv,2,1
8,above,adj,0,3
9,abroad,adv,2,3
10,absence,n,3,2


In [81]:
# Export the table as CSV (relative path, so it lands in the working
# directory); the row labels are written as the first column.
df.to_csv(path_or_buf='comm3000.csv')

In [97]:
# Distribution of the Spoken value (tuple field 2): value -> number of entries.
# The total is printed as a check that every entry was counted.
res = {}
for row in data:
    band = row[2]
    res[band] = res.get(band, 0) + 1

print(res)
print(sum(res.values()))


{0: 595, 1: 993, 2: 1000, 3: 995}
3583


In [98]:
# Distribution of the Written value (tuple field 3): value -> number of entries.
# The total is printed as a check that every entry was counted.
res = {}
for row in data:
    band = row[3]
    res[band] = res.get(band, 0) + 1

print(res)
print(sum(res.values()))

{0: 588, 1: 998, 2: 1001, 3: 996}
3583


In [94]:
# Count the entries whose Spoken (field 2) and Written (field 3) values are
# both non-zero.
res = sum(1 for row in data if not (row[2] == 0 or row[3] == 0))

print(res)

2403
