## Data
* https://archive.org/download/stackexchange

In [1]:
# postFile = "../StackExchangedData/3dprinting.stackexchange.com/Posts.xml"
# postFile = "../StackExchangedData/ai.stackexchange.com/Posts.xml"
# postFile = "../StackExchangedData/anime.stackexchange.com/Posts.xml"
postFile = "../StackExchangedData/askubuntu.com/Posts.xml"

In [2]:
import xml.etree.ElementTree
e = xml.etree.ElementTree.parse(postFile).getroot()

In [3]:
titles = []
for atype in e.findall('row'):
    title = atype.get('Title')
    if title is not None:
        titles.append(title)
        
print(len(titles))

248641


In [4]:
alltitles = " ".join(titles).lower()

In [5]:
print(alltitles[:1000])

how to get the "your battery is broken" message to go away? how can i set the software center to install software for non-root users? what are some alternatives to upgrading without using the standard upgrade system? how to set up a headless server? how do i run a successful ubuntu hour? how do i go back to kde splash / login after installing xfce? how do i enable automatic updates? how do i install adobe flash player? how can i make ubuntu check for updates less often? what might prevent mouse movements between xrandr screens? where should i install sagemath? remove online status menu, but keep the logout menu? sane path to distribution upgrades what is the easiest way to strip a desktop edition to a server edition? what's the easiest way to set up a lamp stack? what is the performance loss if you run an ubuntu desktop edition for a server machine? when installing i'm given the option of encrypting my home folder -- what does this do? how to configure mail server to report a hostname 

In [6]:
import nltk
#nltk.download()

In [7]:
# POS tagger
tokens = nltk.word_tokenize(alltitles)

In [8]:
# Tokenized Text
print(tokens[:10])

['how', 'to', 'get', 'the', '``', 'your', 'battery', 'is', 'broken', "''"]


In [9]:
# Tagged Text
tagged_text = nltk.pos_tag(tokens)
print(tagged_text[:15])

[('how', 'WRB'), ('to', 'TO'), ('get', 'VB'), ('the', 'DT'), ('``', '``'), ('your', 'PRP$'), ('battery', 'NN'), ('is', 'VBZ'), ('broken', 'VBN'), ("''", "''"), ('message', 'NN'), ('to', 'TO'), ('go', 'VB'), ('away', 'RB'), ('?', '.')]


In [10]:
#Documentation
nltk.help.upenn_tagset('NN.*')

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
NNPS: noun, proper, plural
    Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists
    Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques
    Apache Apaches Apocrypha ...
NNS: noun, common, plural
    undergraduates scotches bric-a-brac products bodyguards facets coasts
    divestitures storehouses designs clubs fragrances averages
    subjectivists apprehensions muses factory-jobs ...


Homonyms: each of two or more words having the same spelling but different meanings and origins (e.g., pole1 and pole2); a homograph.

* Over - Preposition
* The - Determiner
* A - Determiner
* And - Coordinating Conjunction


In [11]:
# Convert tokens to NLTK Text
text = nltk.Text(tokens)

In [12]:
# Find similar words
text.similar("network")

ubuntu wifi windows file internet server usb wireless system desktop
partition connection the terminal folder boot update files install
directory


In [13]:
text.similar("the")

a ubuntu my windows an to in on install unity no this terminal and any
all linux one gnome new


In [14]:
text.similar("artificial")

about


In [15]:
sample_tagged_token = nltk.tag.str2tuple('fly/NN')
print(sample_tagged_token)

('fly', 'NN')


In [16]:
# Most common tags in the text
tag_fd = nltk.FreqDist(tag for (word, tag) in tagged_text)
tag_fd.keys()

dict_keys(['JJ', 'CD', 'VBP', '$', 'WRB', 'RBR', 'JJS', 'FW', 'RBS', 'NN', 'EX', ')', '``', 'PDT', 'RB', 'VBN', 'WP$', '#', '(', 'NNP', ':', ',', 'IN', 'TO', 'UH', 'VBG', 'VBD', 'NNS', '.', 'JJR', 'WP', 'LS', 'SYM', 'RP', 'NNPS', 'POS', 'CC', 'VBZ', 'VB', 'PRP', 'MD', 'WDT', 'DT', "''", 'PRP$'])

In [17]:
#tag_fd.plot()

* Adjectives as modifiers (The large pizza)
* As predicates (The pizza is large)

In [18]:
#nltk.app.concordance()

In [19]:
cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text if tag.startswith('NN'))
print(cfd.conditions())
nounConditions = cfd.conditions()

['NNPS', 'NN', 'NNP', 'NNS']


In [20]:
cfd['NNS'].keys()
print(list(cfd['NNS'].keys())[:10])

['jails', 'e-mails', 'fglrx-updates', 'rsa', 'extra_cflags', 'comboboxes', '/redmine/files', 'applicaitons', 'drives/folders', 'guests']


In [21]:
# Find the words with the given tag
def findtags(tag_prefix, tagged_text, retCount):
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text if tag.startswith(tag_prefix))
    return dict((tag, list(cfd[tag].keys())[:retCount]) for tag in cfd.conditions())

In [22]:
tagdict = findtags('NN', tagged_text, 10)
print(tagdict)

{'NNPS': ["'requires", 'broadcom', 'communications', "'servers", '.acsm', 'x64', '||', "'volumes", '/usr/share/themes'], 'NN': ['p900', 'desktop-amd64+mac', 'assumption', 'pulseaudio-equaliser', 'hid', 'vera', 'pear', 'gtx660', 'sitespeed.io', 'pixelation'], 'NNP': ["'ip", '/dev/sda3', 'x770', '/proc/stat', "'export", '-lxt', '`/var/lib/dpkg/updates/', "'adduser", '\\efi\\boot\\grubx64.efi', '/etc/.initramfs'], 'NNS': ['jails', 'e-mails', 'fglrx-updates', 'rsa', 'extra_cflags', 'comboboxes', '/redmine/files', 'applicaitons', 'drives/folders', 'guests']}


In [23]:
# Three word phrases
def process(sentence):
    retVal = []
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
        if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
            retVal.append((w1, w2, w3))
    return retVal

print(process(tagged_text)[:10])

[('rollback', 'to', 'php'), ('guaranteed', 'to', 'work'), ('left', 'to', 'right'), ('attached', 'to', 'clock'), ('prompted', 'to', 'unlock'), ('needed', 'to', 'use'), ('connect', 'to', 'reliance'), ('connect', 'to', 'exchange'), ('supposed', 'to', 'read'), ('seems', 'to', 'block')]


In [24]:
# Find the most common Nouns and proper Nouns
word_tag_fd = nltk.FreqDist(tagged_text)

# http://www.nltk.org/book/ch05.html
for cond in nounConditions:
    print(cond, [wt[0] for (wt, _) in word_tag_fd.most_common() if wt[1] == cond][:150])

NNPS ['x64', '/usr/share/themes', 'broadcom', "'requires", "'servers", "'volumes", '||', '.acsm', 'communications']
NN ['error', 'file', 'ubuntu', 'boot', 'problem', 'i', 'screen', 'server', 'system', 'installation', 'command', 'drive', 'partition', 'desktop', 'network', 'install', 'software', 'unity', 'package', 'access', 'driver', 'version', 'connection', 'script', 'grub', 'disk', 'directory', 'wifi', 'keyboard', 'folder', 'default', 'wireless', 'login', 'application', 'way', 'password', 'menu', 'time', 'manager', 'terminal', 'user', 'computer', 'monitor', 'card', 'sound', 'line', 'window', 'upgrade', 'help', 'video', 'program', 'mouse', 'mode', 'internet', 'issue', 'device', 'machine', 'gnome', 'root', 'dell', 'display', 'shell', 'use', 'home', 'linux', 'intel', 'virtualbox', 'icon', 'space', 'update', 'change', 'work', 'output', 'resolution', 'center', 'pc', 'laptop', 'image', 'cd', 'kernel', 'startup', 'text', 'option', 'firefox', 'usb', 'os', 'wine', 'list', 'size', 'name', 'pane

In [25]:
fd = nltk.FreqDist(tagged_text)
fd.most_common(5)

[(('to', 'TO'), 84648),
 (('?', '.'), 80203),
 (('ubuntu', 'JJ'), 56792),
 (('how', 'WRB'), 53649),
 (('in', 'IN'), 41035)]

In [26]:
# Adjectives
cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text if tag.startswith("JJ"))
adjConditions = cfd.conditions()

In [27]:
for adjCond in adjConditions:
    print(adjCond, [wt[0] for (wt, _) in word_tag_fd.most_common() if wt[1] == adjCond][:10])

JJ ['ubuntu', 'usb', 'unable', 'dual', 'new', 'install', 'i', 'terminal', 'hard', 'update']
JJR ['more', 'older', 'launcher', 'better', 'newer', 'less', 'slower', 'higher', 'smaller', 'larger']
JJS ['latest', 'best', 'guest', 'newest', 'most', 'easiest', 'least', 'fastest', 'lastest', 'simplest']


In [28]:
word_tag_pairs = nltk.bigrams(tagged_text)
noun_preceders = [a[1] for (a, b) in word_tag_pairs if b[1] == 'NN']
fdist = nltk.FreqDist(noun_preceders)
tags1 = [tag for (tag, _) in fdist.most_common()]
print(tags1)

['NN', 'JJ', 'IN', 'DT', 'VB', 'CD', '.', 'VBG', 'CC', 'PRP$', ':', 'NNS', 'VBN', 'VBP', 'NNP', '(', ')', ',', "''", 'WRB', '``', 'RB', 'VBZ', 'TO', 'POS', 'VBD', 'JJS', 'JJR', 'RP', 'WDT', 'WP', 'FW', 'MD', '$', 'PRP', 'RBR', 'EX', 'RBS', '#', 'UH', 'WP$', 'NNPS']


In [29]:
for k,v in fdist.most_common():
    print(k,v)

NN 139336
JJ 125298
IN 64074
DT 53558
VB 28964
CD 22498
. 17663
VBG 16906
CC 9527
PRP$ 9172
: 8446
NNS 5763
VBN 4189
VBP 4063
NNP 3757
( 3752
) 3552
, 3366
'' 3350
WRB 3250
`` 2968
RB 2907
VBZ 2333
TO 2119
POS 2105
VBD 1958
JJS 1595
JJR 1136
RP 1059
WDT 835
WP 811
FW 612
MD 261
$ 163
PRP 143
RBR 127
EX 36
RBS 30
# 25
UH 14
WP$ 10
NNPS 4


In [30]:
# Most common JJ+NN pairs
word_tag_pairs = nltk.bigrams(tagged_text)
JJbeforeNN = [(a,b) for (a, b) in word_tag_pairs if b[1] == 'NN' and a[1] == 'JJ']
fdist = nltk.FreqDist(JJbeforeNN)
print(fdist.most_common(10))

[((('dual', 'JJ'), ('boot', 'NN')), 2133), ((('ubuntu', 'JJ'), ('server', 'NN')), 1574), ((('hard', 'JJ'), ('drive', 'NN')), 1427), ((('black', 'JJ'), ('screen', 'NN')), 1145), ((('ubuntu', 'JJ'), ('touch', 'NN')), 728), ((('ubuntu', 'JJ'), ('software', 'NN')), 681), ((('ubuntu', 'JJ'), ('installation', 'NN')), 560), ((('usb', 'JJ'), ('drive', 'NN')), 521), ((('hard', 'JJ'), ('disk', 'NN')), 499), ((('ubuntu', 'JJ'), ('desktop', 'NN')), 476)]
