# Longman Communication 3000

In [169]:
import nltk
import re
import pandas as pd
import numpy as np
import csv
from pprint import pprint
from IPython.display import display

## Read file

In [119]:
# Load the raw word list: one entry per line, e.g. 'virtually adv S2, W2'.
filename = 'academic.txt'
# A context manager closes the file handle deterministically; the bare
# open() call used before left it open until garbage collection.
with open(filename) as f:
    # Strip surrounding whitespace and keep only lines that are non-empty afterwards.
    lines = [stripped for stripped in (raw.strip() for raw in f) if stripped]
print(len(lines))
pprint(lines[-20:])

1434
['violate v',
 'violation n',
 'virtual adj',
 'virtually adv S2, W2',
 'visibility n',
 'visible adj W3',
 'visibly adv',
 'vision n S3, W2',
 'visual adj W3',
 'visualize v',
 'visually adv',
 'volume n W2',
 'voluntarily adv',
 'voluntary adj W3',
 'volunteer n',
 'volunteer v',
 'welfare n S3, W2',
 'whereas conjunction S2, W2',
 'whereby adv S3',
 'widespread adj W3']


In [120]:
# Tokenise every entry: remove the commas between the frequency codes, then
# split on whitespace, e.g. 'vision n S3, W2' -> ['vision', 'n', 'S3', 'W2'].
data = []
for entry in lines:
    tokens = entry.replace(',', '').split()
    data.append(tokens)
pprint(data[-20:])

[['violate', 'v'],
 ['violation', 'n'],
 ['virtual', 'adj'],
 ['virtually', 'adv', 'S2', 'W2'],
 ['visibility', 'n'],
 ['visible', 'adj', 'W3'],
 ['visibly', 'adv'],
 ['vision', 'n', 'S3', 'W2'],
 ['visual', 'adj', 'W3'],
 ['visualize', 'v'],
 ['visually', 'adv'],
 ['volume', 'n', 'W2'],
 ['voluntarily', 'adv'],
 ['voluntary', 'adj', 'W3'],
 ['volunteer', 'n'],
 ['volunteer', 'v'],
 ['welfare', 'n', 'S3', 'W2'],
 ['whereas', 'conjunction', 'S2', 'W2'],
 ['whereby', 'adv', 'S3'],
 ['widespread', 'adj', 'W3']]


In [121]:
# Number of tokens in every entry, followed by the largest and smallest count.
lni = list(map(len, data))
print(lni[-10:])
print(max(lni))
print(min(lni))

[2, 3, 2, 3, 2, 2, 4, 4, 3, 3]
4
2


In [122]:
# Histogram of entry shapes: token count -> number of entries with that count.
lni = {}
for tokens in data:
    size = len(tokens)
    lni[size] = lni.get(size, 0) + 1

print(lni)

{2: 873, 3: 263, 4: 298}


In [123]:
# Sanity check: collect any entries that consist of exactly one token.
target = list(filter(lambda tokens: len(tokens) == 1, data))
pprint(target)

[]


In [124]:
# Recognised part-of-speech tags. Abbreviated tags:
#   adj = adjective, adv = adverb, auxv = auxiliary verb, modv = modal verb,
#   n = noun, phrv = phrasal verb, prep = preposition, pron = pronoun, v = verb.
# The remaining tags are spelled out in full.
pos = [
    'adj', 'adv', 'auxv', 'conjunction',
    'determiner', 'interjection', 'modv', 'n',
    'number', 'phrv', 'predeterminer', 'prefix',
    'prep', 'pron', 'suffix', 'v',
]

In [125]:
# Convert each token list into a (word, POS, spoken, written) record.
# NOTE(review): `data` is rebound to the records at the end, so re-running this
# cell without first re-running the tokenising cell would feed it tuples
# instead of token lists.
res = []
for x in data:
    # The POS tag is normally the second token; when it is not, the headword
    # is taken to span two tokens and the tag is read from the third.
    index = 1 if x[1] in pos else 2
    word = ' '.join(x[:index])

    # Codes after the tag look like 'S2' / 'W3'; a band of 0 means "no code".
    bands = {'S': 0, 'W': 0}
    for p in x[index + 1:]:
        if p[0] in bands:
            bands[p[0]] = int(p[1])

    res.append((word, x[index], bands['S'], bands['W']))

data = res
pprint(data[-20:])

[('violate', 'v', 0, 0),
 ('violation', 'n', 0, 0),
 ('virtual', 'adj', 0, 0),
 ('virtually', 'adv', 2, 2),
 ('visibility', 'n', 0, 0),
 ('visible', 'adj', 0, 3),
 ('visibly', 'adv', 0, 0),
 ('vision', 'n', 3, 2),
 ('visual', 'adj', 0, 3),
 ('visualize', 'v', 0, 0),
 ('visually', 'adv', 0, 0),
 ('volume', 'n', 0, 2),
 ('voluntarily', 'adv', 0, 0),
 ('voluntary', 'adj', 0, 3),
 ('volunteer', 'n', 0, 0),
 ('volunteer', 'v', 0, 0),
 ('welfare', 'n', 3, 2),
 ('whereas', 'conjunction', 2, 2),
 ('whereby', 'adv', 3, 0),
 ('widespread', 'adj', 0, 3)]


In [126]:
# Tabulate the records, numbering the rows from 1.
column_names = ['Word', 'POS', 'Spoken', 'Written']
row_numbers = np.arange(1, len(data) + 1)
df = pd.DataFrame(data, index=row_numbers, columns=column_names)
display(df)

Unnamed: 0,Word,POS,Spoken,Written
1,abandon,v,0,3
2,abandoned,adj,0,0
3,abnormal,adj,0,0
4,abstract,adj,0,0
5,abstract,n,0,0
6,abstract,v,0,0
7,abstraction,n,0,0
8,academic,adj,0,2
9,academic,n,0,0
10,academy,n,0,0


In [127]:
# Persist the parsed table to CSV with pandas' default settings.
df.to_csv('academic.csv')

In [128]:
# Distribution of the spoken band (record field 2), then the total as a check.
res = {}
for record in data:
    band = record[2]
    res[band] = res.get(band, 0) + 1

print(res)
print(sum(res.values()))


{0: 1105, 1: 45, 2: 121, 3: 163}
1434


In [129]:
# Distribution of the written band (record field 3), then the total as a check.
res = {}
for record in data:
    band = record[3]
    res[band] = res.get(band, 0) + 1

print(res)
print(sum(res.values()))

{0: 904, 1: 95, 2: 212, 3: 223}
1434


In [130]:
# Number of records whose spoken and written bands are both 0.
res = sum(1 for record in data if record[2] == 0 and record[3] == 0)

print(res)

873


In [134]:
# Read the whole CSV into a list of rows; every field stays a string.
# The file is opened with newline='' as the csv module requires, and inside a
# context manager so the handle is closed (the bare open() call leaked it).
with open('comm3000.csv', newline='') as f:
    data = list(csv.reader(f))
pprint(data[:10])

[['', 'Word', 'POS', 'Spoken', 'Written'],
 ['1', 'a', 'determiner', '1', '1'],
 ['2', 'abandon', 'v', '0', '3'],
 ['3', 'ability', 'n', '2', '1'],
 ['4', 'able', 'adj', '1', '1'],
 ['5', 'about', 'prep', '1', '1'],
 ['6', 'about', 'adv', '1', '1'],
 ['7', 'above', 'adv', '2', '1'],
 ['8', 'above', 'adj', '0', '3'],
 ['9', 'abroad', 'adv', '2', '3']]


In [158]:
# Load the header-less CSV; columns get integer labels and rows a default index.
# pd.DataFrame.from_csv was deprecated and later removed from pandas;
# pd.read_csv is the supported replacement and takes the same two options.
df = pd.read_csv('communication.csv', header=None, index_col=None)
display(df)

Unnamed: 0,0,1,2
0,a,1,1
1,abandon,0,3
2,ability,2,1
3,able,1,1
4,about,1,1
5,about,1,1
6,above,2,1
7,above,0,3
8,abroad,2,3
9,absence,3,2


In [148]:
# Write only the 'Word' column, one value per line (no index, no header row).
# NOTE(review): this cell's execution count (148) is lower than that of the
# cell above it (158), whose frame is read with header=None and is displayed
# with integer column labels rather than 'Word'. On a top-to-bottom run the
# column selection would presumably fail; confirm which frame this was meant for.
# NOTE(review): the target path is the same 'academic.csv' that the full table
# is written to earlier in the notebook, so that file is replaced; confirm intended.
df.to_csv('academic.csv', columns=['Word'], index=False, header=False)

In [165]:
# Take the first whitespace-delimited token of each line of the word list.
# The context manager closes the file handle (the bare open() call leaked it),
# and blank lines are skipped instead of raising IndexError on split()[0].
with open('defining.txt') as f:
    lines = [raw.split()[0] for raw in f if raw.strip()]
# Drop single-character tokens and anything containing a full stop.
# NOTE(review): tokens keep trailing punctuation (the output shows 'actor,');
# confirm whether that should be stripped.
lines = [word for word in lines if len(word) > 1 and '.' not in word]
pprint(lines[:])

['abbreviation',
 'ability',
 'able',
 'about',
 'above',
 'abroad',
 'absence',
 'absent',
 'accept',
 'acceptable',
 'accident',
 'according',
 'account',
 'achieve',
 'acid',
 'across',
 'act',
 'action',
 'active',
 'activity',
 'actor,',
 'actual',
 'actually',
 'add',
 'addition',
 'address',
 'adjective',
 'admiration',
 'admire',
 'admit',
 'adult',
 'advanced',
 'advantage',
 'adventure',
 'adverb',
 'advertise',
 'advertisement',
 'advice',
 'advise',
 'affair',
 'affect',
 'afford',
 'afraid',
 'after',
 'afternoon',
 'afterward',
 'again',
 'against',
 'age',
 'ago',
 'agree',
 'agreement',
 'ahead',
 'aim',
 'air',
 'airplane',
 'airport',
 'alcohol',
 'alive',
 'all',
 'determiner',
 'allow',
 'almost',
 'alone',
 'along',
 'alphabet',
 'already',
 'also',
 'although',
 'always',
 'among',
 'amount',
 'amuse',
 'amusement',
 'amusing',
 'an',
 'ancient',
 'and',
 'anger',
 'angle',
 'angry',
 'animal',
 'announce',
 'announcement',
 'annoy',
 'another',
 'answer',
 'anxie

In [168]:
# Save the filtered tokens, one per line.
with open('defining2.txt', 'w') as out_file:
    out_file.writelines(word + '\n' for word in lines)

'violate v',
 'violation n',
 'virtual adj',
 'virtually adv S2, W2',
 'visibility n',

In [174]:
# Tag a one-token list with NLTK's tagger; the result is a list of (token, tag) pairs.
nltk.pos_tag(['virtually'])

[('virtually', 'RB')]

In [183]:
# Print the description and example words for every tag matching the
# pattern 'NN.*' (the noun tags, as the output shows).
nltk.help.upenn_tagset('NN.*')

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
NNPS: noun, proper, plural
    Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists
    Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques
    Apache Apaches Apocrypha ...
NNS: noun, common, plural
    undergraduates scotches bric-a-brac products bodyguards facets coasts
    divestitures storehouses designs clubs fragrances averages
    subjectivists apprehensions muses factory-jobs ...
