In [52]:
import re
from difflib import SequenceMatcher

class TextAnalyzer:
    __text_list = []
    __raw_text_list = []
    __stopwords = []
    __frequency_list = []
    
    def __init__(self, text):
        self.__text_list = self.__cleanText(text)
        self.__raw_text_list = text.split(" ")
        self.__stopwords = self.__getStopWords()
    
    def __cleanText(self, text):
        clean_text = text.replace('\n', ' ')
        formated_text = " ".join(re.sub(u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', '', clean_text).split())
        return formated_text.split(" ")
    
    def __getStopWords(self):
        stopWords = list(open("stopwords.txt").readlines())
        return [word.replace('\n', '') for word in stopWords]
    
    def returnStringList(self):
        return self.__text_list
            
    def getFrequency(self):
        if len(self.__frequency_list) == 0:
            lower_list = [word.lower() for word in self.__text_list[:1000]]
            for i in set(lower_list):
                frequency = lower_list.count(i)
                self.__frequency_list.append( (i.upper(),frequency) )
                self.__frequency_list.sort()
        return self.__frequency_list
    
    def getTop10Frequency(self):
        frequencylist = sorted(self.getFrequency(), reverse=True, key=lambda x: x[1])
        top10 = frequencylist[0:10]
        return top10
    
    def frequencyMeanAndStdDeviation(self):
        count = len(self.getFrequency())
        listValues = [freq[1] for freq in self.__frequency_list]
        total = sum(listValues)
        mean = total/count
        listValues.sort()

        if count % 2 == 0:
            ind1 = int(count / 2 - 1)
            ind2 = int(count / 2 + 1)
            mediana = sum(listValues[ind1:ind2])/2
        else:
            ind = int(count / 2)
            mediana = listValues[ind]
        
        print("Média: %.2f, Desvio padrão: %.2f" %(mean,mediana))
        
    def showStopWords(self):
        return self.__stopwords
    
    def createFileWithoutStopWords(self):
        arquivo = open("filewithoutstopwords.txt", "w")
#         for word in self.__stopwords:
#             text = text.replace(word,'')
        newfile = [word for word in self.__raw_text_list if word.lower() not in self.__stopwords]
        text = ' '.join(newfile)
        
        arquivo.write(text)
        arquivo.close()
        
    def similarityWords(self, word1, word2):
        result = SequenceMatcher(None, word1, word2).ratio()
        return result
    
    def ownSimilarityWords(self, word1, word2):
        simi = 0
        for i in word1:
            if word2.count(i) > 0:
                simi += 1 
        leng = len(word1) + len(word2)    
        result = (2*simi)/leng
        return result
        

In [53]:
arquivo = open("shakespeare.txt", "r")
conteudo = arquivo.read()
arquivo.close()
text1 = TextAnalyzer(conteudo)

In [3]:
#Retorne individualmente cada palavra do texto
text1.returnStringList()

['1609',
 'THE',
 'SONNETS',
 'by',
 'William',
 'Shakespeare',
 '1',
 'From',
 'fairest',
 'creatures',
 'we',
 'desire',
 'increase',
 'That',
 'thereby',
 'beautys',
 'rose',
 'might',
 'never',
 'die',
 'But',
 'as',
 'the',
 'riper',
 'should',
 'by',
 'time',
 'decease',
 'His',
 'tender',
 'heir',
 'might',
 'bear',
 'his',
 'memory:',
 'But',
 'thou',
 'contracted',
 'to',
 'thine',
 'own',
 'bright',
 'eyes',
 'Feedst',
 'thy',
 'lights',
 'flame',
 'with',
 'selfsubstantial',
 'fuel',
 'Making',
 'a',
 'famine',
 'where',
 'abundance',
 'lies',
 'Thy',
 'self',
 'thy',
 'foe',
 'to',
 'thy',
 'sweet',
 'self',
 'too',
 'cruel:',
 'Thou',
 'that',
 'art',
 'now',
 'the',
 'worlds',
 'fresh',
 'ornament',
 'And',
 'only',
 'herald',
 'to',
 'the',
 'gaudy',
 'spring',
 'Within',
 'thine',
 'own',
 'bud',
 'buriest',
 'thy',
 'content',
 'And',
 'tender',
 'churl',
 'makst',
 'waste',
 'in',
 'niggarding:',
 'Pity',
 'the',
 'world',
 'or',
 'else',
 'this',
 'glutton',
 'be',
 

In [4]:
#Conte a quantidade de ocorrências de cada palavra do texto
text1.getFrequency()

[('1', 1),
 ('10', 1),
 ('1609', 1),
 ('2', 1),
 ('3', 1),
 ('4', 1),
 ('5', 1),
 ('6', 1),
 ('7', 1),
 ('8', 1),
 ('9', 1),
 ('A', 7),
 ('ABUNDANCE', 1),
 ('ABUSE', 1),
 ('ACCEPTABLE', 1),
 ('ADORE', 1),
 ('AGE', 3),
 ('AH', 1),
 ('ALL', 3),
 ('ALLEATING', 1),
 ('ALONE', 1),
 ('AN', 3),
 ('AND', 22),
 ('ANNOY', 1),
 ('ANOTHER', 4),
 ('ANSWER', 1),
 ('APRIL', 1),
 ('ARE', 2),
 ('ART', 5),
 ('AS', 1),
 ('ASKED', 1),
 ('ATTENDING', 1),
 ('AUDIT', 1),
 ('BACK', 1),
 ('BARENESS', 1),
 ('BE', 14),
 ('BEAR', 1),
 ('BEAR:', 1),
 ('BEAUTEOUS', 1),
 ('BEAUTY', 6),
 ('BEAUTYS', 7),
 ('BEGUILE', 1),
 ('BEHIND', 1),
 ('BEING', 3),
 ('BEQUEST', 1),
 ('BEREFT', 1),
 ('BESIEGE', 1),
 ('BLOOD', 1),
 ('BOSOM', 1),
 ('BOUNTEOUS', 1),
 ('BREED', 1),
 ('BRIGHT', 1),
 ('BROW', 1),
 ('BUD', 1),
 ('BURIEST', 1),
 ('BURNING', 1),
 ('BUT', 10),
 ('BY', 7),
 ('CALLS', 2),
 ('CANST', 2),
 ('CAR', 1),
 ('CHECKED', 1),
 ('CHIDE', 1),
 ('CHILD', 2),
 ('CHILDRENS', 1),
 ('CHURL', 1),
 ('CLIMBED', 1),
 ('COLD', 1),
 

In [5]:
#Retorne as 10 palavras mais frequentes
text1.getTop10Frequency()

[('THE', 32),
 ('THOU', 31),
 ('THY', 29),
 ('AND', 22),
 ('TO', 22),
 ('IN', 19),
 ('BE', 14),
 ('OF', 14),
 ('THAT', 13),
 ('THEE', 13)]

In [6]:
#Mostrando as stopwords
text1.showStopWords()

['a',
 'able',
 'about',
 'above',
 'according',
 'accordingly',
 'across',
 'actually',
 'after',
 'afterwards',
 'again',
 'against',
 "ain't",
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'appear',
 'appreciate',
 'appropriate',
 'are',
 "aren't",
 'around',
 'as',
 "a's",
 'aside',
 'ask',
 'asking',
 'associated',
 'at',
 'available',
 'away',
 'awfully',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'believe',
 'below',
 'beside',
 'besides',
 'best',
 'better',
 'between',
 'beyond',
 'both',
 'brief',
 'but',
 'by',
 'came',
 'can',
 'cannot',
 'cant',
 "can't",
 'cause',
 'causes',
 'certain',
 'certainly',
 'changes',
 'clearly',
 "c'mon",
 'co',
 'com',
 'come',
 'comes',
 'concerning'

In [7]:
#Retorne a média e desvio padrão da quantidade de ocorrências
text1.frequencyMeanAndStdDeviation()

Média: 2.06, Desvio padrão: 1.00


In [13]:
#Retorne um novo arquivo eliminando todas as StopWords do texto
text1.createFileWithoutStopWords()

In [56]:
#Inclua um método que retorne a distância entre duas palavras
print(text1.similarityWords('teste','etset'))
print(text1.ownSimilarityWords('teste','etset'))

0.6
1.0
