totally works, generates txt files so you don't have to scrape the Wikipedia API all the time
wrought committed Nov 13, 2011
1 parent f4da552 commit c0e82a6
Showing 1 changed file with 49 additions and 18 deletions.
67 changes: 49 additions & 18 deletions gender/sources/wikipedia_api.py
@@ -11,23 +11,54 @@
 
 # create variable to store all the requested names
 
-_namesFdump= []
+# _namesFdump = []
+_namesList = []
 
-# pass values for API request to get XML response
+# pass values for API request to get json response
 # But need to loop through all the categories
-for row in _catsF[0:2]:
-    # store a connection
-    caturl = "http://en.wikipedia.org/w/api.php?format=json&action=query&list=categorymembers"+\
-        "&cmnamespace=0&cmlimit=100&cmtitle=Category:" + url.quote(row[0])
-    postsParsed = json.load(url.urlopen(caturl))
-    print json.dumps(postsParsed,indent=2)
-
-
-def captions(posts):
-    '''Return "captions" from python-readable data.
-    '''
-    allCaptions = '' # Var to store the full list
-    justPosts = postsParsed[u'response'][u'posts'] # Var to grab just the post objects
-    for r in justPosts:
-        allCaptions = allCaptions + r[u'caption']
-    return allCaptions
+def getNamesFromCat(categoryList):
+    for row in categoryList:
+        # store a connection
+        caturl = "http://en.wikipedia.org/w/api.php?format=json&action=query&list=categorymembers"+\
+            "&cmnamespace=0&cmlimit=100&cmtitle=Category:" + url.quote(row[0])
+        postsParsed = json.load(url.urlopen(caturl))
+        # print json.dumps(postsParsed,indent=2)
+        # _namesFdump = _namesFdump + [postsParsed]
+        for n in xrange(len(postsParsed[u'query'][u'categorymembers'])):
+            nameParens = postsParsed[u'query'][u'categorymembers'][n][u'title']
+            # print nameParens
+
+            if nameParens.find('(') != -1:
+                nameParens = nameParens[0:nameParens.find('(')]
+            # print nameParens
+            _namesList.append(nameParens)
+
+def getAndWriteNames(categoryList, fileName):
+    getNamesFromCat(categoryList)
+    for n in xrange(len(_namesList)):
+        _namesList[n] = _namesList[n].encode('ascii', 'ignore') # convert unicode to ascii
+    f = file('data/'+fileName+'.txt', 'w')
+    text = '\n'.join(_namesList)
+    f.write(text)
+
+getAndWriteNames(_catsF, 'wikiFemaleNames')
+getAndWriteNames(_catsM, 'wikiMaleNames')
+getAndWriteNames(_catsU, 'wikiUniNames')
+
+# save to csv
+'''testfile = open('data/testfile.csv', 'wb')
+wr = csv.writer(testfile, quoting=csv.QUOTE_ALL)
+wr.writerow(_namesFlist)
+'''
+
+# change to ascii from unicode
+#for n in xrange(len(_namesFlist)):
+#    _namesFlist[n] = _namesFlist[n].encode('ascii', 'ignore')
+
+# save to a text file
+#f = file('data/wikiFemaleNames.txt', 'w')
+#text = '\n'.join(_namesFlist)
+#f.write(text)
+
+# print _namesFlist # Debugging
+
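
For context on what the new getNamesFromCat does: the list=categorymembers module returns a JSON object whose query.categorymembers array holds entries with pageid, ns, and title fields, capped at cmlimit results per request, with a continue/cmcontinue token when more members remain. The commit grabs a single page of up to 100 titles per category. A rough present-day Python 3 sketch of the same idea, with continuation handling added, could look like the following; it is not part of the commit, and the function name, User-Agent string, and example title are illustrative only.

import json
import urllib.parse
import urllib.request

API = "https://en.wikipedia.org/w/api.php"

def get_names_from_category(category):
    """Return article titles in Category:<category>, stripping '(...)' disambiguation."""
    names = []
    params = {
        "format": "json",
        "action": "query",
        "list": "categorymembers",
        "cmnamespace": "0",      # articles only, as in the original query
        "cmlimit": "100",
        "cmtitle": "Category:" + category,
    }
    while True:
        url = API + "?" + urllib.parse.urlencode(params)
        # the Wikimedia API asks clients to send a descriptive User-Agent
        req = urllib.request.Request(url, headers={"User-Agent": "name-list-builder/0.1"})
        with urllib.request.urlopen(req) as resp:
            data = json.load(resp)
        for member in data["query"]["categorymembers"]:
            title = member["title"]
            # drop parenthetical disambiguation, e.g. "Jane Doe (author)" -> "Jane Doe"
            if "(" in title:
                title = title[:title.find("(")]
            names.append(title.strip())
        if "continue" in data:
            # follow the paging token until the category is exhausted
            params.update(data["continue"])
        else:
            return names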

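A matching sketch of the getAndWriteNames step, reusing the get_names_from_category helper above. In the committed code _namesList is module-level and never cleared, so each successive output file also carries the names gathered for earlier categories, and encode('ascii', 'ignore') silently drops accented characters; the sketch keeps a fresh list per call and writes UTF-8 instead. The category name is a placeholder, not a row from _catsF/_catsM/_catsU.

def write_names(categories, file_name):
    # one fresh list per call, so each output file only holds its own categories
    names = []
    for category in categories:
        names.extend(get_names_from_category(category))
    with open("data/" + file_name + ".txt", "w", encoding="utf-8") as f:
        f.write("\n".join(names))

if __name__ == "__main__":
    # placeholder category list, standing in for the rows of _catsF
    write_names(["21st-century_American_women_writers"], "wikiFemaleNames")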