In [6]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords
import string
from ftfy import fix_text

In [67]:
class Analyze(object):
    """Object that aids in the analysis of text from different sources"""
    
    def __init__(self, text_file_path):
        self.stop_words = self._get_stop_words()
        self.raw_text = self._get_raw_text(text_file_path)
        self.text = self.process(self.raw_text)
        
    def _get_raw_text(self, text_file_path):
        with open(text_file_path, 'r') as f:
            raw_text = [line.decode('utf-8').strip() for line in f.readlines()]
        return raw_text
        
    def _get_stop_words(self):
        stop_words = stopwords.words('english')
        stop_words.extend(list(string.punctuation))
        stop_words.extend(list(string.whitespace))
        stop_words.append('...')
        return stop_words
    
    def process(self, text):
        raw_text = map(fix_text, text)
        return raw_text
    
    def get_words(self):
        words_nested = [word_tokenize(x) for x in self.text]
        words = [word.lower() for li in words_nested for word in li]
        words = filter(lambda x: x not in self.stop_words, words)
        words = set(words)
        return words
    
    def get_sentences(self):
        pass
    
    def get_stems(self):
        words = self.get_words()
        ps = PorterStemmer()
        stems = set([ps.stem(word) for word in words])
        return stems
            
    def get_parts_of_speech(self):
        sent_tokenizer = PunktSentenceTokenizer()
        tokenized = sent_tokenizer.tokenize(str(self.raw_text))
        tagged_words = []
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                tagged_words.append(tagged)
        except:
            pass

In [68]:
a = Analyze('speeches.txt')

In [46]:
words = a.get_words()

In [47]:
stems = a.get_stems()

In [66]:
type(a.raw_text)

list

In [None]:
a.get_parts_of_speech()

In [48]:
len(stems), len(words)

(4088, 5671)

In [23]:
words

[u'thank',
 u'much',
 u"'s",
 u'nice',
 u"n't",
 u'great',
 u'guy',
 u"n't",
 u'get',
 u'fair',
 u'press',
 u"n't",
 u'get',
 u"'s",
 u'fair',
 u'tell',
 u"'m",
 u'strongly',
 u'great',
 u'respect',
 u'steve',
 u'king',
 u'great',
 u'respect',
 u'likewise',
 u'citizens',
 u'united',
 u'david',
 u'everybody',
 u'tremendous',
 u'resect',
 u'tea',
 u'party',
 u'also',
 u'also',
 u'people',
 u'iowa',
 u'something',
 u'common',
 u'hard-working',
 u'people',
 u'want',
 u'work',
 u'want',
 u'make',
 u'country',
 u'great',
 u'love',
 u'people',
 u'iowa',
 u"'s",
 u'way',
 u'simple',
 u'said',
 u'country',
 u'really',
 u'headed',
 u'wrong',
 u'direction',
 u'president',
 u'absolutely',
 u'terrible',
 u'job',
 u'world',
 u'collapsing',
 u'around',
 u'us',
 u'many',
 u'problems',
 u"'ve",
 u'caused',
 u'president',
 u'either',
 u'grossly',
 u'incompetent',
 u'word',
 u'people',
 u'using',
 u'think',
 u'first',
 u'use',
 u'completely',
 u'different',
 u'agenda',
 u'want',
 u'know',
 u'could',
 u'p

In [30]:
from nltk.stem import PorterStemmer

In [32]:
ps = PorterStemmer()

In [33]:
example_words = ['python', 'pythoner', 'pythoning','pythonly']

In [34]:
for w in example_words:
    print ps.stem(w)

python
python
python
pythonli


In [35]:
new_text = "it is very important to be pythonly while pythoning. all phytoners have pythonevd at some point wrong"

In [None]:
words = word_tokenize(new_text)

In [49]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [53]:
train_text = state_union.raw("2006-GWBush.txt")
sample_text = state_union.raw("2005-GWBush.txt")

In [59]:
custom_sent_tokenizer = PunktSentenceTokenizer()

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            print tagged
            
    except Exception as e:
        print str(e)

In [62]:
process_content()

[(u'PRESIDENT', 'NNP'), (u'GEORGE', 'NNP'), (u'W.', 'NNP'), (u'BUSH', 'NNP'), (u"'S", 'POS'), (u'ADDRESS', 'NNP'), (u'BEFORE', 'NNP'), (u'A', 'NNP'), (u'JOINT', 'NNP'), (u'SESSION', 'NNP'), (u'OF', 'NNP'), (u'THE', 'NNP'), (u'CONGRESS', 'NNP'), (u'ON', 'NNP'), (u'THE', 'NNP'), (u'STATE', 'NNP'), (u'OF', 'NNP'), (u'THE', 'NNP'), (u'UNION', 'NNP'), (u'February', 'NNP'), (u'2', 'CD'), (u',', ','), (u'2005', 'CD'), (u'9:10', 'CD'), (u'P.M', 'JJ'), (u'.', '.')]
[(u'EST', 'JJS'), (u'THE', 'DT'), (u'PRESIDENT', 'NNP'), (u':', ':'), (u'Mr.', 'NNP'), (u'Speaker', 'NNP'), (u',', ','), (u'Vice', 'NNP'), (u'President', 'NNP'), (u'Cheney', 'NNP'), (u',', ','), (u'members', 'NNS'), (u'of', 'IN'), (u'Congress', 'NNP'), (u',', ','), (u'fellow', 'JJ'), (u'citizens', 'NNS'), (u':', ':'), (u'As', 'IN'), (u'a', 'DT'), (u'new', 'JJ'), (u'Congress', 'NNP'), (u'gathers', 'NNS'), (u',', ','), (u'all', 'DT'), (u'of', 'IN'), (u'us', 'PRP'), (u'in', 'IN'), (u'the', 'DT'), (u'elected', 'VBN'), (u'branches', 'NNS'