In [86]:
# general
import pandas as pd
import numpy as np
import os
import re
import csv
from collections import defaultdict
import pprint

# visualizations
import matplotlib.pyplot as plt

# NLP
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob

# modeling
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

In [3]:
# define houses
houses = ['gryffindor', 'ravenclaw', 'hufflepuff', 'slytherin']
primary_colors = ['#ae0001','#222f5b','#ecb939','#2a623d']
secondary_colors = ['#eeba30','#5d5d5d','#726255','#aaaaaa']

In [4]:
""" 
    make a dataframe that contains the chapters, textblobs of the chapters,
    houses, frequency per chapter and polarity per chapter
"""
files_dir = '/home/brendanfitzpatrick/Metis/projects/05-kojak/.gitignore/files/books/'
column_names = ['chapter','text','book_name','book_number']
df = pd.DataFrame(columns = column_names)

# import fanastic beasts and add to dataframe
fantasticbeasts = '/home/brendanfitzpatrick/Metis/projects/05-kojak/.gitignore/files/superseded/fantasticbeast.txt'
fantastic_beasts_text = []
with open(fantasticbeasts) as f:
    scene = ''
    for line in f:
        if re.match("SCENE [\d]+", line):
            fantastic_beasts_text+=[scene]
            scene = ''
        else:
            scene+=line.replace("\n"," ")
    fantastic_beasts_text = fantastic_beasts_text[1:124]+[scene]
df_fanastic_beasts = pd.DataFrame(
    {
        'chapter': [x for x in range(1,125)],
        'text': fantastic_beasts_text,
        'book_name': ['Fantastic Beasts']*124,
        'book_number': [8]*124,
    }
)
df = pd.concat([df_fanastic_beasts, df])

# textblob, house counts and polarity
for filename in sorted(os.listdir(files_dir)):
    temp = pd.read_csv(files_dir+filename,header=None,names=['chapter','text'])
    temp['book_name'] = str(filename[2:-4])
    temp['book_number'] = int(filename[0:1])
    df = pd.concat([df, temp])
df = df[df.chapter.isnull()==False]
df['textblob'] = df.text.apply(lambda x: TextBlob(x))
df['house_counts_GRHS'] = df.textblob.apply(lambda x: [x.words.count(y) for y in houses])
df['chapter_polarity'] = df.textblob.apply(lambda x: x.sentiment.polarity)
df.sort_values(['book_number','chapter'],ascending=True,inplace=True)
df.reset_index(drop=True,inplace=True)

In [5]:
documents = list(df.text)
print('\n')





In [6]:
# display topics functions:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [7]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical maodel
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

In [8]:
no_topics = 20
# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

In [9]:
no_top_words = 20
print('NMF Topics: ')
display_topics(nmf, tfidf_feature_names, no_top_words)
print('LDA Topics: ')
display_topics(lda, tf_feature_names, no_top_words)

NMF Topics: 
Topic 0:
harry said ron hermione looked know ve just like got malfoy professor did don ll looking think time right dumbledore
Topic 1:
newt niffler jacob angle obscurus pickett demiguise bank case tina moves gnarlak customs watches looks occamy erumpent park scamander swooping
Topic 2:
graves credence obscurus subway child moves tunnel begins face tracks mass night continues alleyway modesty stands stares aurors int train
Topic 3:
vernon dudley uncle harry petunia aunt dursleys marge letter dursley car kitchen figg house birthday room living owl piers drive
Topic 4:
jacob bingley newt case int bank kowalski occamy room egg looks bakery mr enters grabs day hey collateral factory canning
Topic 5:
dumbledore harry voldemort said did riddle death fudge sirius wand lord yes tom slughorn albus grindelwald prophecy kreacher know man
Topic 6:
church modesty lou mary salem credence chastity leaflets int momma second stands sits children night stairs belt glances bags ext
Topic 7:
w

In [56]:
test_hermione = ['Minister Hermione Jean Granger was a Muggle-born witch born to Mr and Mrs Granger, both dentists. Hermione was raised as a Muggle girl until, at age eleven, when she learned that she was a witch and had been accepted into Hogwarts School of Witchcraft and Wizardry. She began attending the school on 1 September, 1991, where she was subsequently sorted into Gryffindor House, despite being considered for Ravenclaw. She possessed a brilliant academic mind, and proved to be a gifted student in almost every subject that she studied. She was very studious and bookish']

In [75]:
test_hermione

['Minister Hermione Jean Granger was a Muggle-born witch born to Mr and Mrs Granger, both dentists. Hermione was raised as a Muggle girl until, at age eleven, when she learned that she was a witch and had been accepted into Hogwarts School of Witchcraft and Wizardry. She began attending the school on 1 September, 1991, where she was subsequently sorted into Gryffindor House, despite being considered for Ravenclaw. She possessed a brilliant academic mind, and proved to be a gifted student in almost every subject that she studied. She was very studious and bookish']

In [51]:
test_hermione = tf_vectorizer.transform(test_hermione)

In [52]:
test_hermione=nmf.transform(test_hermione)

In [53]:
print(test_hermione)

[[ 0.2166643   0.          0.          0.          0.          0.
   0.00213997  0.43863143  0.          0.          0.          0.          0.
   0.          0.          0.          0.28088187  0.          0.          0.        ]]


In [15]:
nmf.components_.shape

(20, 14335)

In [18]:
model_houses = ['GRYFFINDOR',
'HUFFLEPUFF',
'SLYTHERIN',
'MUGGLE',
'UNDETERMINED',
'GRYFFINDOR / SLYTHERIN',
'SLYTHERIN',
'GRYFFINDOR / RAVENCLAW',
'UNDETERMINED',
'MUGGLE',
'GRYFFINDOR',
'SLYTHERIN',
'UNDETERMINED',
'UNDETERMINED',
'UNDETERMINED',
'GRYFFINDOR / SLYTHERIN',
'RAVENCLAW',
'GRYFFINDOR',
'UNDETERMINED',
'RAVENCLAW / SLYTHERIN']

In [35]:
model_houses[-1].find(' ')
house1, blackslash, house2 = model_houses[-1].split(' ')
print(house1, house2)

RAVENCLAW SLYTHERIN


In [92]:
def find_score(bio):
    scores = tf_vectorizer.transform(bio)
    scores = nmf.transform(scores)[0]
    house_scores = defaultdict(int)
    for i in range(len(scores)):
        if model_houses[i].find(' ')!=-1:
            house1, blackslash, house2 = model_houses[i].split(' ')
            house_scores[house1] += .5*scores[i]
            house_scores[house1] += .5*scores[i]
        else:
            house_scores[model_houses[i]] += scores[i]
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(dict(house_scores))
    return max(house_scores, key=house_scores.get)

In [93]:
hermione_wiki = ['Minister Hermione Jean Granger was a Muggle-born witch born to Mr and Mrs Granger, both dentists.'+
                 'Hermione was raised as a Muggle girl until, at age eleven, when she learned that she was a witch and '+
                 'had been accepted into Hogwarts School of Witchcraft and Wizardry. She began attending the school on 1 September,'+
                 ' 1991, where she was subsequently sorted into Gryffindor House, despite being considered for Ravenclaw. '+
                 'She possessed a brilliant academic mind, and proved to be a gifted student in almost every subject that she '+
                 'studied. She was very studious and bookish']

In [94]:
find_score(hermione_wiki)

{   'GRYFFINDOR': 0.65529573521546636,
    'HUFFLEPUFF': 0.0,
    'MUGGLE': 0.0,
    'RAVENCLAW': 0.2808818721699165,
    'SLYTHERIN': 0.0021399749341124213,
    'UNDETERMINED': 0.0}


'GRYFFINDOR'

In [95]:
malfoy_wiki = ['Draco Lucius Malfoy was a pure-blood wizard and the only son of Lucius and Narcissa Malfoy '+
               '(née Black). The son of a Death Eater, Draco was raised to believe strongly in the importance '+
               'of blood purity. He attended Hogwarts School of Witchcraft and Wizardry from 1991-1998 and was '+
               'sorted into Slytherin House. During his years at Hogwarts, he became friends with Vincent Crabbe, '+
               'Gregory Goyle, Pansy Parkinson, and other fellow Slytherins, but he quickly developed a rivalry '+
               'with Harry Potter. He was made a prefect of his house and was a member of the Inquisitorial Squad'+
               ' during his fifth year, at the end of which his father was imprisoned in Azkaban following the '+
               'Battle of the Department of Mysteries. Lord Voldemort charged Draco with making up for Lucius\'s'+
               ' failure, and he became a Death Eater at age sixteen, but was quickly disillusioned with the '+
               'lifestyle. Draco was unable to complete his task — murdering Albus Dumbledore, which was taken '+
               'over by Severus Snape — and only performed his other duties fearfully and reluctantly. He and his '+
               'family defected hours before the end of the Second Wizarding War in order to avoid imprisonment '+
               'in Azkaban following Voldemort\'s defeat. By 2017, Draco had married Astoria Greengrass and '+
               'had one child, Scorpius Hyperion Malfoy.']

In [96]:
find_score(malfoy_wiki)

{   'GRYFFINDOR': 1.5752482502785261,
    'HUFFLEPUFF': 0.0,
    'MUGGLE': 0.0071466730865528093,
    'RAVENCLAW': 0.032161392116326716,
    'SLYTHERIN': 1.039554026312238,
    'UNDETERMINED': 0.026133179366236062}


'GRYFFINDOR'

In [97]:
kanye_wiki = ['Kanye Omari West is an American rapper, singer, songwriter, record producer, fashion designer, '+
               'and entrepreneur. Born in Atlanta and raised in Chicago, West briefly attended art school before '+
               'becoming known as a producer for Roc-A-Fella Records in the early 2000s, producing hit singles '+
               'for artists such as Jay-Z and Alicia Keys. Intent on pursuing a solo career as a rapper, West '+
               'released his debut album The College Dropout in 2004 to widespread critical and commercial success,'+
               ' and founded the record label GOOD Music. He went on to pursue a variety of styles on subsequent'+
               ' albums Late Registration (2005), Graduation (2007), and 808s & Heartbreak (2008). In 2010, he '+
               'released his fifth album My Beautiful Dark Twisted Fantasy to rave reviews from critics, and '+
               'the following year he released the collaborative album Watch the Throne with Jay-Z. West'+
               ' released his abrasive sixth album, Yeezus, to further critical praise in 2013. His seventh album,'+
               ' The Life of Pablo, was released in 2016.']

In [98]:
find_score(kanye_wiki)

{   'GRYFFINDOR': 0.079559523500513624,
    'HUFFLEPUFF': 0.0,
    'MUGGLE': 0.0,
    'RAVENCLAW': 0.0,
    'SLYTHERIN': 0.013292260671051909,
    'UNDETERMINED': 0.0}


'GRYFFINDOR'