In [1]:
import re
import io
import collections
import copy
import os, errno
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.font_manager import FontProperties
from __future__ import division
from nltk.book import *
import random
from scipy.sparse import *
import nltk
from nltk.tokenize import RegexpTokenizer
import operator
import pandas
from wordcloud import WordCloud
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


# TF-IDF 

We are going to find the most important words in the books according to the TFIDF criterion. This criterion consists of 2 parts:

  * TF: Term frequency, in each book see how much time each word appears
  * IDF: Inverse document frequency, see how important this word is compared to its apparition in other books.
  
The result is a TFIDF matrix, in our case the rows are corresponding to the books and the colmuns to the words.


In [4]:
#We compute the TFIDF matrix on our books
filenames=['./books/'+f for f in os.listdir('./books/')]
vect = TfidfVectorizer(sublinear_tf=True, max_df=0.5, analyzer='word', 
           stop_words='english',input='filename')
book_tfidf = vect.fit_transform(filenames)

Now that we have the TFIDF matrix we can visualise it in a word cloud. We have done a word cloud for the 7 different books:

In [5]:

booknames=[f[:-4] for f in listdir('./books/')]

#we will remove the text that is in the title of the books from the TFIDF count
textbooknames=''
for bookname in booknames:
    textbooknames+=bookname.lower()+' '
booktitles=nltk.regexp_tokenize(textbooknames,r'[a-zA-Z]+')

#list of words corresponding to the colums of the matrix
word_list=vect.get_feature_names()

N_wordcloud=book_tfidf.multiply(5000)

#picture locations for the wordcloud
listpicture=[loc+f for f in os.listdir('./pictures_WC/')]

#we first create a long string to feed to the wordcloud
for counterbook, book in enumerate(booknames):
    print book
    s=''

    for  counterword,word in enumerate(word_list):

        if word not in booktitles:
            n=int(round(N_wordcloud[counterbook,counterword]))
            s+=(word+' ')*n


    # plot and save the wordcloud
    mask_HP = np.array(Image.open(listpicture[np.mod(counterbook,4)]))
    wordcloud = WordCloud(background_color="white",max_font_size=40,collocations = False,mask=mask_HP).generate(s)
    fig=plt.figure()
    fig.set_size_inches(15,10)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    fig.savefig('Wordcloud_'+book)    
    plt.show()

NameError: name 'listdir' is not defined

# Spell count

In [None]:
#import text of the Books    
f = open("books/Book 1 - The Philosopher_s Stone.txt")
text1 = f.read()

f = open("books/Book 2 - The Chamber of Secrets.txt")
text2 = f.read()

f = open("books/Book 3 - The Prisoner of Azkaban.txt")
text3 = f.read()

f = open("books/Book 4 - The Goblet of Fire.txt")
text4 = f.read()

f = open("books/Book 5 - The Order of the Phoenix.txt")
text5 = f.read()

f = open("books/Book 6 - The Half Blood Prince.txt")
text6 = f.read()

f = open("books/Book 7 - The Deathly Hallows.txt")
text7 = f.read()

texts=[text1,text2,text3,text4,text5,text6,text7]

for n,i in enumerate(texts):
    texts[n]=i.replace('"'," ")

In [None]:
df = pandas.read_excel('dataset spells.xlsx')
values = df['Incantation'].values

In [None]:
#list of things to exclude
stopwords = nltk.corpus.stopwords.words('english')
stopwords+=[".",",","-","?",';',":","'","'","--","``","''"]
stopwords = set(stopwords)
exclude_words=stopwords
tokenizer = RegexpTokenizer(r'\w+')

#%%%%%%%%%%%%%%%%%%%%%%%%%%

texts=[text1,text2,text3,text4,text5,text6,text7]

tokens=[[],[],[],[],[],[],[]]

for n,text in enumerate(texts):

    tokens[n]= nltk.regexp_tokenize(text,r'[a-zA-Z]+')
    tokens[n]=[t.lower() for t in tokens[n] if t.lower() not in exclude_words if len(t)>3]

In [None]:
freq=FreqDist(tokens[6])

#fig= plt.figure()
#fig.set_size_inches(15,4)
#freq.plot(75,cumulative=True, figure=fig)

In [None]:
val_list=[item.lower() for val in values for item in val.split()]
val_list=set(val_list)


In [None]:
dirt=['skin','spell' ,'ears','curse','flame','cheering'
,'sticking','point','flying','line','pack','head','switching'
,'freezing','gripping','babbling','stinging','avis','unbreakable'
,'extinguishing','conjunctivitis','aqua','bubble','prior','cave','drought','jinx']

for d in dirt:
    val_list.remove(d)

In [None]:
resultName=[]
resultValue=[]
result=[]
for val in val_list:
    if val in freq:
        
        #print "%s : %s"%(val,freq[val])
        result.extend([val,freq[val]])
        resultName.append(val)
        resultValue.append(freq[val])


In [None]:
type(freq)

#fig= plt.figure()
#fig.set_size_inches(15,4)
#result.plot(75,cumulative=True, figure=fig)