In [129]:
# !pip install SpaCy
# !pip install pickleshare
# !pip install PyPDF2


In [130]:
import os
import json
import re
import numpy as np
import pandas as pd
from pprint import pprint

import PyPDF2

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michelle/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/michelle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/michelle/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [131]:
pdf = PyPDF2.PdfReader('play.pdf')
text = pdf.pages[0].extract_text()


In [132]:
#Function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation and non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens

In [133]:
#Function to preprocess input document
def preprocess_document(text):
    processed_text = preprocess_text(text)

    processed_document = {}

    processed_document["Original_text"] = text
    processed_document["Processed_text"] = processed_text

    # print(f"Original Text: {processed_document['Original_text']}")
    # print(f"Processed Text: {processed_document['Processed_text']}")

    return processed_document['Processed_text']

processed = preprocess_document(text)

In [134]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(processed, min_count=5, threshold=100) # higher threshold fewer phrases.

bigram_mod = gensim.models.phrases.Phraser(bigram)


In [135]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]


In [136]:
bigrams = make_bigrams([processed])
bigrams

[['5',
  'c',
  'reating',
  'stealth',
  'game',
  'intervention',
  'attitude',
  'behavior',
  'change',
  'n',
  'embedded',
  'design',
  'model',
  'geoff',
  'kaufman',
  'mary',
  'flanagan',
  'max',
  'seidman',
  'abstract',
  'chapter',
  'open',
  'example',
  'transformational',
  'game',
  'utilize',
  'overt',
  'explicit',
  'approach',
  'attitude',
  'behavior',
  'change',
  'acknowledging',
  'worthwhile',
  'intention',
  'game',
  'poten',
  'tial',
  'utility',
  'triggering',
  'reflection',
  'action',
  'overview',
  'present',
  'central',
  'premise',
  'chapter',
  'number',
  'fundamental',
  'reason',
  'explicit',
  'approach',
  'backfire',
  'limited',
  'utility',
  'persuasion',
  'use',
  'implicit',
  'covert',
  'approach',
  'persua',
  'sion',
  'effective',
  'embedded',
  'design',
  'model',
  'presented',
  'chapter',
  'particularly',
  'relevant',
  'game',
  'attempting',
  'engage',
  'player',
  'sensitive',
  'potentially',
  'threate

In [137]:
id2word = corpora.Dictionary(bigrams)

# Create Corpus
texts = bigrams

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 3), (16, 1), (17, 5), (18, 1), (19, 1), (20, 1), (21, 4), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 4), (29, 3), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 4), (38, 1), (39, 1), (40, 1), (41, 1), (42, 3), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 2), (49, 1), (50, 1), (51, 7), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 2), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 2), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 2), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 3), (85, 3), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1)

In [138]:
id2word[0]
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]


[[('1051179789463728805ch05', 1),
  ('2007', 1),
  ('2013', 1),
  ('2021', 1),
  ('5', 1),
  ('abstract', 1),
  ('accuracy', 1),
  ('acknowledging', 1),
  ('action', 1),
  ('activity', 1),
  ('address', 1),
  ('advanced', 1),
  ('aim', 1),
  ('al', 1),
  ('amsterdam', 2),
  ('approach', 3),
  ('attempting', 1),
  ('attitude', 5),
  ('awareness', 1),
  ('b', 1),
  ('backfire', 1),
  ('behavior', 4),
  ('behaviorsthrough', 1),
  ('benefit', 1),
  ('bias', 1),
  ('bogost', 1),
  ('c', 1),
  ('central', 1),
  ('change', 4),
  ('chapter', 3),
  ('cognitive', 1),
  ('context', 1),
  ('covert', 1),
  ('critical', 1),
  ('decade', 1),
  ('decisionmaking', 1),
  ('dela', 1),
  ('design', 4),
  ('doi', 1),
  ('dunbar', 1),
  ('effective', 1),
  ('eg', 1),
  ('embedded', 3),
  ('emergence', 1),
  ('encourage', 1),
  ('engage', 1),
  ('et', 1),
  ('example', 1),
  ('explicit', 2),
  ('flanagan', 1),
  ('fundamental', 1),
  ('game', 7),
  ('gameplay', 1),
  ('gaming', 1),
  ('gamut', 1),
  ('geoff'

In [149]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [147]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.008*"game" + 0.008*"attitude" + 0.008*"design" + 0.008*"persuasive" + '
  '0.008*"approach" + 0.008*"behavior" + 0.008*"change" + 0.008*"player" + '
  '0.008*"chapter" + 0.008*"embedded"'),
 (1,
  '0.038*"game" + 0.028*"attitude" + 0.022*"change" + 0.022*"design" + '
  '0.022*"behavior" + 0.017*"embedded" + 0.017*"chapter" + 0.017*"approach" + '
  '0.017*"persuasive" + 0.017*"player"'),
 (2,
  '0.008*"change" + 0.008*"game" + 0.008*"attitude" + 0.008*"behavior" + '
  '0.008*"design" + 0.008*"chapter" + 0.008*"persuasive" + 0.008*"player" + '
  '0.008*"approach" + 0.008*"stealth"'),
 (3,
  '0.008*"game" + 0.008*"design" + 0.008*"behavior" + 0.008*"change" + '
  '0.008*"player" + 0.008*"attitude" + 0.008*"approach" + 0.008*"persuasive" + '
  '0.008*"chapter" + 0.008*"stealth"'),
 (4,
  '0.009*"game" + 0.008*"attitude" + 0.008*"behavior" + 0.008*"design" + '
  '0.008*"player" + 0.008*"change" + 0.008*"embedded" + 0.008*"persuasive" + '
  '0.008*"approach" + 0.008*"stealth"')]


In [148]:
pyLDAvis.enable_notebook()

vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds')   
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis