# Downloading of Disney fandom data

The code in this notebook is used for downloading all the Disney-fandom wiki pages and making lists of the characters.

In [4]:
# Import packages
import urllib.request 
import json 
import re 
import math
import os
from urllib.parse import quote
import nltk, pprint
import numpy as np
import matplotlib.pyplot as plt

In [7]:
def getpage(title,fandom="disney"):
    """Fetch the latest revision content of wiki pages through the fandom API.

    Sends an ``action=query`` / ``prop=revisions`` request to
    ``https://{fandom}.fandom.com/api.php`` and returns the parsed response.

    Args:
        title: Title of the page to get. Several titles can be requested at
            once by separating them with ``|``. The value is percent-encoded
            before it is placed in the URL.
        fandom: Sub-domain of the fandom wiki to query.

    Returns:
        The JSON response decoded into Python objects.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    baseurl = f"https://{fandom}.fandom.com/api.php?"
    params = [
        "action=query",
        "prop=revisions&rvprop=content",
        f"titles={quote(title)}",
        "format=json",
        "rvslots=main",
    ]
    query = baseurl + "&".join(params)
    # The context manager closes the HTTP response even when reading fails.
    with urllib.request.urlopen(query) as wikiresponse:
        wikitext = wikiresponse.read().decode('utf-8')
    return json.loads(wikitext)

def getpagetext(title,fandom="disney"):
    """Return the wiki text of a page, or the string "0" if none was found.

    Args:
        title: Title of the page to fetch (passed on to ``getpage``).
        fandom: Sub-domain of the fandom wiki to query.

    Returns:
        The main-slot content of the first revision of the page. When the
        response holds several pages, the last one that has revisions wins.
        If no page has revisions, the sentinel string "0" is returned and an
        error line is printed for each such page.
    """
    pages = getpage(title, fandom)["query"]["pages"]
    text = "0"  # sentinel kept when no page in the response carries a revision
    for page in pages.values():
        if 'revisions' not in page:
            print(f"error with {title} ")
            continue
        text = page['revisions'][0]['slots']['main']['*']
    return text
def write_to_file(filename,text,place="",min_length=1000):
    """Write a string to ``<place><filename>.txt`` if it is long enough.

    Example: ``write_to_file("testfile", "testtext")`` (writes nothing,
    because the text is shorter than the default threshold).

    Args:
        filename: Name of the file without extension. Every ``:`` is replaced
            by ``SEMICOLON`` because ``:`` can't be part of a filename.
        text: The string to store.
        place: Optional path prefix that is concatenated in front of the
            filename as-is (include a trailing separator for a directory).
        min_length: Texts whose length is not greater than this value are
            silently skipped. Pass ``0`` to store any non-empty text.

    Returns:
        None.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    if len(text) <= min_length:
        return
    target = place + filename.replace(":", "SEMICOLON") + ".txt"
    # Create the target directory on demand so a fresh checkout does not fail.
    directory = os.path.dirname(target)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # `with` guarantees the handle is closed even if the write raises.
    with open(target, "w", encoding='utf-8') as text_file:
        text_file.write(text)
def open_file(filename,place=""):
    """Read ``<place><filename>.txt`` and return its lines.

    Args:
        filename: Name of the file without extension. Every ``:`` is replaced
            by ``SEMICOLON``, mirroring ``write_to_file``.
        place: Optional path prefix concatenated in front of the filename.

    Returns:
        A list with the lines of the file, line endings included.

    Raises:
        OSError: If the file cannot be opened.
    """
    target = place + filename.replace(":", "SEMICOLON") + ".txt"
    # `with` closes the handle; the original left it open until garbage collection.
    with open(target, "r", encoding='utf-8') as text_file:
        return text_file.readlines()
    
def create_character_file(character):
    """Download one page from the fandom and save it locally.

    Example: ``create_character_file("Goofy")``

    Args:
        character: Title of the page to download; also used as the filename.

    Returns:
        None.
    """
    write_to_file(character, getpagetext(character))

def get_all_names_from_category(category_in,fandom="disney"):
    """List all page titles in a category, excluding sub-categories.

    Uses the ``list=categorymembers`` API of
    ``https://{fandom}.fandom.com`` and follows the ``cmcontinue`` token until
    every result page has been read.

    Args:
        category_in: Category name without the ``Category:`` prefix. Pass the
            raw name; it is percent-encoded before being placed in the URL.
        fandom: Sub-domain of the fandom wiki to query.

    Returns:
        A list of the titles found in the category. Titles in the
        ``Category:`` namespace (sub-categories) are left out.

    Raises:
        urllib.error.URLError: If a request cannot be completed.
        json.JSONDecodeError: If a response body is not valid JSON.
    """
    baseurl = f"https://{fandom}.fandom.com/api.php?"
    base_params = [
        "action=query",
        "format=json",
        "formatversion=2",
        "list=categorymembers",
        "cmlimit=500",
        f"cmtitle=Category%3A{quote(category_in)}",
    ]
    names = []
    continues = ""
    while True:
        params = list(base_params)
        if continues:
            # Continuation tokens may contain '|', so they are encoded as well.
            params.append(f"cmcontinue={quote(continues)}")
        query = baseurl + "&".join(params)
        with urllib.request.urlopen(query) as wikiresponse:
            jsontext = json.loads(wikiresponse.read().decode('utf-8'))
        names.extend(member['title'] for member in jsontext['query']['categorymembers'])
        # A response without a continuation token is the last result page.
        continues = jsontext.get('continue', {}).get('cmcontinue', "")
        if not continues:
            break
    # Match the namespace prefix including the colon, so an ordinary page whose
    # title merely starts with the word "Category" is kept.
    cleaned = [name for name in names if not name.startswith('Category:')]
    print(f"found {len(cleaned)} in category:{category_in}")
    return cleaned
def save_characters_from_category_to_file(category,path="",fandom="disney"):
    """Fetch all names in a category and hand them to ``write_to_file``.

    The names are joined with ``|||`` and stored under the category name.
    ``write_to_file`` only writes texts longer than 1000 characters by
    default, so a short list is returned but not stored.

    Example: ``save_characters_from_category_to_file("Villains", path)``

    Args:
        category: Category to get names from (without ``Category:`` prefix).
        path: Optional path prefix passed to ``write_to_file``.
        fandom: Sub-domain of the fandom wiki to query.

    Returns:
        The list of names found in the category.
    """
    names = get_all_names_from_category(category, fandom)
    joined = "|||".join(names)
    write_to_file(category, joined, path)
    return names

def get_pages_from_list(character_list,path="",fandom="disney"):
    """Download every page in a list of titles and save each one locally.

    The titles are requested in batches through ``getpage`` and each page is
    stored with ``write_to_file`` (``/`` is removed from the title to form the
    filename).

    Example:
        ``get_pages_from_list(['22', 'A-Li', 'Abby Park', 'Princess Abigail', 'Ada'])``

    Args:
        character_list: List of page titles to download.
        path: Optional path prefix passed to ``write_to_file``.
        fandom: Sub-domain of the fandom wiki to query.

    Returns:
        A list of titles that gave errors: pages the API reported with a
        non-positive page id, pages without revision content, and pages that
        could not be written to disk.
    """
    # Same batch size as before; kept below the per-request title limit of the API.
    batch_size = 49
    error_titles = []
    length = len(character_list)
    print(length)
    for start in range(0, length, batch_size):
        print(f"\rcalculating {(start / length) * 100:5.2f}% completed, please wait", end=' ')
        # Each batch is built from scratch, so titles from an earlier batch can
        # never leak into the next request, and no empty trailing title is sent.
        batch = character_list[start:start + batch_size]
        result = getpage('|'.join(batch), fandom)
        pages = result.get('query', {}).get('pages', {})
        if not pages:
            # The API returned nothing usable for this request.
            error_titles.extend(batch)
            continue
        for key, page in pages.items():
            title = page.get('title', key)
            if int(key) <= 0:
                # Non-positive ids mark pages that were not found on the wiki.
                error_titles.append(title)
                continue
            try:
                content = page['revisions'][0]['slots']['main']['*']
                write_to_file(title.replace("/", ""), content, path)
            except (KeyError, IndexError, OSError, ValueError):
                # No revision content, or the file could not be written.
                error_titles.append(title)

    print(f"\rcalculating {100:5.2f}% completed, done!               ")
    return error_titles

def get_films_from_character_file(filename):
    """Extract the films from a character's text file.

    Looks for the first line that starts with ``|film`` and returns the
    targets of the ``[[...]]`` links on that line.

    Example: ``get_films_from_character_file("Goofy.txt")``

    Args:
        filename: Path of the saved character page.

    Returns:
        A list of the films the character is present in; an empty list when
        the file has no ``|film`` line.

    Raises:
        OSError: If the file cannot be opened.
    """
    link_pattern = r'\[\[([\w]*[\s\w*]*)\]\]'
    with open(filename, "r", encoding="utf-8") as character_file:
        lines = character_file.read().split('\n')
    for line in lines:
        if line.startswith('|film'):
            return re.findall(link_pattern, line)
    # No film line found: return an empty list so callers can always iterate.
    return []


## Generate a list of all characters

In [12]:
# Fetch every page title in the "Characters" category and keep the list in
# `Characters`, which the download cell at the end of the notebook uses.
# The second argument is a path prefix that write_to_file concatenates with the
# category name, so the target file is "Universes//CharactersCharacters.txt".
Category="Characters"
Characters=save_characters_from_category_to_file(Category,"Universes//"+Category)

found 11483 in category:Characters


## Generate gender lists

In [11]:
# Fetch the member lists of the "Males" and "Females" categories. The lists
# are used further down to split the villains by gender.
# As above, the path prefix and the category name are concatenated into the
# filename (e.g. "Universes//MalesMales.txt").
Category="Males"
Males=save_characters_from_category_to_file(Category,"Universes//"+Category)
Category="Females"
Females=save_characters_from_category_to_file(Category,"Universes//"+Category)

found 9726 in category:Males
found 5341 in category:Females


## Generating universe lists

In [None]:
# Fetch the member lists of the four universe categories.
# NOTE(review): the Star Wars call passes `Category` alone as the path prefix,
# unlike the three calls below, which prepend "Universes//". Its file is
# therefore written relative to the current working directory rather than under
# "Universes". This looks like an oversight; confirm before changing it, since
# other code may read the file from its current location.
Category="Star_Wars_characters"
Star_Wars_characters=save_characters_from_category_to_file(Category,Category)
Category="Marvel_Comics_characters"
Marvel_Comics_characters=save_characters_from_category_to_file(Category,"Universes//"+Category)
Category="Pixar_characters"
Pixar_characters=save_characters_from_category_to_file(Category,"Universes//"+Category)
Category="Disney_characters"
Disney_characters=save_characters_from_category_to_file(Category,"Universes//"+Category)

## Generate villains and heroes/heroines

In [None]:
# Fetch the villains and split them by gender using the `Females` and `Males`
# lists from the gender cell above.
Category="Villains"
Villains=save_characters_from_category_to_file(Category,"Allignment//"+Category)
# Sets make each membership test O(1). Testing against the full lists (and a
# concatenated list rebuilt for every villain) was quadratic in the list sizes.
# The resulting lists and their order are unchanged.
_female_names=set(Females)
_male_names=set(Males)
Villains_female=[Villain for Villain in Villains if Villain in _female_names]
Villains_male=[Villain for Villain in Villains if Villain in _male_names]
# Villains that appear in neither gender category.
Villains_undefined=[Villain for Villain in Villains
                    if Villain not in _female_names and Villain not in _male_names]

# Fetch the heroes and heroines lists.
Category="Heroes"
Heroes=save_characters_from_category_to_file(Category,"Allignment//"+Category)
Category="Heroines"
Heroines=save_characters_from_category_to_file(Category,"Allignment//"+Category)

## Downloading all character pages and saving locally

In [None]:
# Download the wiki text of every title in `Characters` (built in the
# "Generate a list of all characters" cell) and store each page as a text file
# whose path starts with "CharacterPages//".
# `Error_pages` holds the titles that get_pages_from_list reports as failed.
Error_pages=get_pages_from_list(Characters,"CharacterPages//")
print(f"Done downloading and saving, {len(Error_pages)} downloads had an error")