Questions:

- For POS tagging, should I use the filtered list or the raw tokens? (They give different results.)
        #self.pos_tags = self.parts_of_speech_tagging(self.filtered_text)
- Do I keep ANP? The text file is pretty bad. 

In [472]:
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk import pos_tag

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt


class Manifesto(object):
    '''
    Text-analysis wrapper around a single manifesto text file.

    On construction the file is read and these attributes are built:
        text: raw file contents (string)
        tokens: result of nltk.word_tokenize on the text
        filtered_text: lower-cased alphabetic tokens with English
            stop words removed (list of string)
        stemmed_list: stems of the filtered tokens (set of string)
        pos_tags: result of nltk.pos_tag on the raw tokens
        word_frequency: nltk.FreqDist over the filtered tokens

    Can also be used to list the most common words and to draw a
        word cloud.
    '''

    def __init__(self, file_path):
        '''
        Reads the file and runs the whole analysis pipeline once.

        Input:
            file_path: string, location of the manifesto text file
        '''
        self.text = self.reading_file(file_path)
        self.tokens = self.tokenize(self.text)
        self.filtered_text = self.preprocessing(self.tokens)
        self.stemmed_list = self.stemmer(self.filtered_text)
        # Tagging runs on the raw tokens, not on the filtered list, so the
        # tagger still sees stop words and punctuation around each word.
        self.pos_tags = self.parts_of_speech_tagging(self.tokens)
        self.word_frequency = self.finding_word_frequency(self.filtered_text)

    def reading_file(self, file_path):
        '''
        Given a file path, checks that something exists there, reads it,
            closes it, and returns the text as a string.

        Input:
            file_path: string

        Return:
            text: string

        Raises:
            FileNotFoundError: if nothing exists at file_path
        '''
        # An explicit raise (rather than assert) keeps the check alive when
        # Python runs with -O, which strips assert statements.
        if not os.path.exists(file_path):
            raise FileNotFoundError("File not found at: "+str(file_path))
        # The with-block closes the handle even if read() raises.
        with open(file_path, 'r') as f:
            return f.read()

    def tokenize(self, text):
        '''
        Given some text, returns the tokens of that text.

        Input:
            text: string
        Output:
            tokens: list of string
        '''
        return nltk.word_tokenize(text)

    def finding_word_frequency(self, filtered_text):
        '''
        Given a list of words, returns how often each one occurs.

        Input:
            filtered_text: list of strings
        Output:
            word frequency: nltk.probability.FreqDist
        '''
        return nltk.FreqDist(filtered_text)

    def parts_of_speech_tagging(self, tokens):
        '''
        Given tokens, assigns a part of speech to each token.

        Input:
            tokens: list of string
        Output:
            tagged: whatever nltk.pos_tag returns for those tokens
        '''
        return nltk.pos_tag(tokens)

    def stemmer(self, filtered_text):
        '''
        Reduces every word of a given list to its stem.

        Input:
            filtered_text: list of string

        Output:
            a set of stemmed words
        '''
        # The Porter stemmer targets English, matching the English stop-word
        # list used in preprocessing. (RSLPStemmer, used previously, is
        # NLTK's stemmer for Portuguese.)
        st = nltk.stem.PorterStemmer()
        return {st.stem(token) for token in filtered_text}

    def preprocessing(self, text):
        '''
        Keeps purely alphabetic tokens, converts them to lower case and
            removes English stop words.

        Input:
            text: list of string (tokens); __init__ passes self.tokens

        Output:
            filtered_text: list of string
        '''
        stop_words = set(stopwords.words('english'))
        # isalpha() drops punctuation, numbers and mixed tokens.
        words = [word.lower() for word in text if word.isalpha()]
        filtered_text = [w for w in words if w not in stop_words]
        return filtered_text

    def find_most_frequent_words(self, number):
        '''
        For a given manifesto object, returns the most common X number of
            words used along with the count.

        Input:
            number: integer

        Output:
            mostcommon: list of (word, count) pairs, most frequent first
        '''
        # Counted from the current filtered_text through the shared helper
        # so both frequency code paths stay in agreement.
        wordfreqdist = self.finding_word_frequency(self.filtered_text)
        return wordfreqdist.most_common(number)

    def create_wordcloud(self, title=None):
        '''
        Creates and shows a word cloud based on the raw text of the file.
        Removes stop words (which consists of conventional stop words
                and words from my own list).

        Input:
            title: optional string drawn above the cloud

        Special thanks to the community at stackoverflow
        (https://stackoverflow.com/questions/16645799/how-to-create-a-word-cloud-from-a-corpus-in-python)
        for this one!
        '''
        stop_words = list(STOPWORDS)
        personal_list = ['pakistan', 'people', 'party', 'manifesto', 'government', 'per',
                        'cent', 'will', 'Parliamentarians', 'ANP', 'MQM', 'iii', 'i', 'ii', 'iv', 'v', 'vi', 'PML', 'PTI' ]
        stop_words_2 = set(stop_words + personal_list)

        # The cloud is generated from self.text, not from filtered_text, so
        # stop-word removal here relies on the list handed to WordCloud.
        wordcloud = WordCloud(
            background_color='white',
            stopwords=stop_words_2,
            max_words=100,
            scale=3,
            max_font_size=40
        ).generate(str(self.text))

        # Figure number 1 is requested on every call.
        fig = plt.figure(1, figsize=(20, 20), dpi=400)
        plt.axis('off')
        if title:
            fig.suptitle(title, fontsize=30)
            # NOTE(review): top=2.3 is outside the usual 0-1 range; it
            # presumably comes from the Stack Overflow answer linked in the
            # docstring - confirm it is intended.
            fig.subplots_adjust(top=2.3)

        plt.imshow(wordcloud)
        plt.show()

    def __repr__(self):
        '''
        Returns a fixed string; it does not describe the instance.
        '''
        return 'You have stumbled onto a gold mine!'

In [473]:
# CREATING MANIFESTO OBJECTS OF THE POLITICAL PARTIES

# Directory holding the 2013 manifesto text files. Kept in one place so
# moving the files only requires changing this line.
MANIFESTO_DIR = '/Users/kazi/Desktop/Manifesto Text Files'

# One Manifesto per party; each constructor call reads and analyses its file.
ppp = Manifesto(os.path.join(MANIFESTO_DIR, 'PPP_2013.txt'))
pmln = Manifesto(os.path.join(MANIFESTO_DIR, 'PMLN_2013.txt'))
mqm = Manifesto(os.path.join(MANIFESTO_DIR, 'MQM_2013.txt'))
pti = Manifesto(os.path.join(MANIFESTO_DIR, 'PTI_2013.txt'))
anp = Manifesto(os.path.join(MANIFESTO_DIR, 'ANP_2013.txt'))
ji = Manifesto(os.path.join(MANIFESTO_DIR, 'JI_2013.txt'))

"\npmln = Manifesto ('/Users/kazi/Desktop/Manifesto Text Files/PMLN_2013.txt')\nmqm = Manifesto ('/Users/kazi/Desktop/Manifesto Text Files/MQM_2013.txt')\npti = Manifesto ('/Users/kazi/Desktop/Manifesto Text Files/PTI_2013.txt')\nanp = Manifesto ('/Users/kazi/Desktop/Manifesto Text Files/ANP_2013.txt')\nji = Manifesto ('/Users/kazi/Desktop/Manifesto Text Files/JI_2013.txt')\n"