Skip to content

Commit

Permalink
Include the java processor used by searchco.de
Browse files Browse the repository at this point in the history
  • Loading branch information
boyter committed Apr 4, 2012
1 parent 4ff9bbe commit eff98d3
Show file tree
Hide file tree
Showing 5 changed files with 154 additions and 0 deletions.
9 changes: 9 additions & 0 deletions java/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
This is not 100% correct for the FatHead project but was used to generate
the sources for http://searchco.de/ which were shared with DDG. I believe
the TSV format used here is the basis for that project though so it should be
pretty close.

Dependency is BeautifulSoup for Python. http://www.crummy.com/software/BeautifulSoup/

Note that the fetch might not work correctly. This is due to Oracle requiring
you to accept terms and conditions.
1 change: 1 addition & 0 deletions java/data.url
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http://download.oracle.com/otn-pub/java/jdk/6u30-b12/jdk-6u30-apidocs.zip
3 changes: 3 additions & 0 deletions java/fetch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Download the JDK 6 API documentation archive.
# Oracle requires its terms and conditions to be accepted before serving the
# file, so visit the page below and accept them first; otherwise this download
# will probably not work.
# http://www.oracle.com/technetwork/java/javase/downloads/jdk-6u25-doc-download-355137.html
wget http://download.oracle.com/otn-pub/java/jdk/6u30-b12/jdk-6u30-apidocs.zip
5 changes: 5 additions & 0 deletions java/meta.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Name: JDK 1.6
Domain: docs.oracle.com
Type: Java
MediaWiki: 0
Keywords: Java
136 changes: 136 additions & 0 deletions java/parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import os
import re
from BeautifulSoup import BeautifulSoup
import sys
import string

def findindex(haystack, needle):
    """Return the position of the first entry of haystack that contains needle.

    haystack is iterated in order and each entry is tested with ``in``.
    Returns None when no entry contains needle.
    """
    for position, entry in enumerate(haystack):
        if needle in entry:
            return position
    return None

def getsection(file, start, end):
    """Return the lines file[start:end] joined into a single string.

    Every line is prefixed with "\\r\\n", so a non-empty result starts with a
    line break; an empty slice yields ''. start and end follow normal slice
    rules, so None means "from the beginning" / "to the end".

    The string is built with one join instead of re-formatting the whole
    accumulated text for every line, which was quadratic in the section size.
    """
    return ''.join("\r\n%s" % (line,) for line in file[start:end])

def getall(file):
    """Return every line of file joined into a single string.

    Every line is prefixed with "\\r\\n", so a non-empty result starts with a
    line break; an empty input yields ''.

    The string is built with one join instead of re-formatting the whole
    accumulated text for every line, which was quadratic in the page size.
    """
    return ''.join("\r\n%s" % (line,) for line in file)

# Patterns used to pick apart the javadoc HTML. All of them are non-greedy and
# compiled with DOTALL so that a match may span several lines.

# Any single tag; used to strip markup.
r1 = re.compile(r'<.*?>', re.DOTALL)

# Table structure.
findtr = re.compile(r'<TR .*?>.*?</TR>', re.DOTALL)
findtd = re.compile(r'<TD>.*?</TD>', re.DOTALL)
findtable = re.compile(r'<TABLE .*?</TABLE>', re.DOTALL)

# Text between one <P> and the next <P>.
findp = re.compile(r'<P>.*?<P>', re.DOTALL)

# Paired elements, matched including their opening and closing tags.
findpre = re.compile(r'<PRE>.*?</PRE>', re.DOTALL)
findh2 = re.compile(r'<H2>.*?</H2>', re.DOTALL)
findh3 = re.compile(r'<H3>.*?</H3>', re.DOTALL)
findcode = re.compile(r'<code>.*?</code>', re.DOTALL)
findcodeupper = re.compile(r'<CODE>.*?</CODE>', re.DOTALL)

# From a named anchor up to the next <HR>.
findmethoddetail = re.compile(r'<A NAME.*?<HR>', re.DOTALL)
# The opening tag of a named anchor.
finda = re.compile(r'<A NAME.*?>', re.DOTALL)
findb = re.compile(r'<B>.*?</B>', re.DOTALL)
# From a <DD up to the next <P>.
findddtop = re.compile(r'<DD.*?<P>', re.DOTALL)
# The "Methods inherited from ..." heading through the end of its table.
findinherit = re.compile(r'<B>Methods inherited from.*?</TABLE>', re.DOTALL)

# Opening or closing tags, and runs of whitespace.
findopenclosetags = re.compile(r'<.*?>|</.*?>', re.DOTALL)
spaces = re.compile(r'\s+', re.DOTALL)

# Collect the path of every file below the java and javax API trees.
# (Only these two roots are walked; there is no walk of an org tree here.)
dirList = []

for root in ("./docs/java/en/api/java/", "./docs/java/en/api/javax/"):
    for (path, dirs, files) in os.walk(root):
        # Anything under a class-use directory is left out.
        if 'class-use' in path:
            continue
        dirList.extend("%s/%s" % (path, f) for f in files)



# Walk every collected javadoc page and print tab-separated records:
#   - one for the class itself,
#   - one for each inherited method listed on the page,
#   - one for each entry in the method detail section.
# Columns: name, namespace, url, description, synopsis, (empty), 'java', 'en'.
# Each record is passed to print as a single pre-formatted string, so the
# output is the same whether print is a statement or a function.
for fname in dirList:
    # Only class pages are processed; package-* pages and doc-files are not.
    if not fname.endswith('.html') or 'package-' in fname or 'doc-files' in fname:
        continue

    # Read the page with single quotes and non-printable characters removed.
    # The with-block closes each handle as soon as the page has been read
    # instead of leaving that to the garbage collector.
    lines = []
    with open(fname) as handle:
        for line in handle:
            line = line.strip().replace("'", '')
            line = ''.join(filter(lambda x: x in string.printable, line))
            lines.append(line)

    # Either marker may be missing, in which case findindex returns None and
    # the slice taken below simply runs from the start / to the end.
    methdet = findindex(lines, "METHOD DETAIL")
    end = findindex(lines, "END OF CLASS DATA")

    # The whole page as one string; built once and reused for every search.
    page = getall(lines)

    # Path of the page relative to the docs root, as used in the URLs.
    relpath = fname.replace('./docs/java/en/', '')

    # The first <H2> holds "<namespace><BR><class type and name>".
    headings = findh2.findall(page)
    if not headings:
        sys.stderr.write("skipping %s: no <H2> heading found\n" % (fname,))
        continue
    np = headings[0].split('<BR>')
    if len(np) < 2:
        sys.stderr.write("skipping %s: <H2> heading has no <BR>\n" % (fname,))
        continue
    namespace = r1.sub('', np[0]).strip()
    classtype = r1.sub('', np[1]).strip()

    # if its an interface skip it
    if 'interface' in classtype.lower():
        continue

    # The description is the first <P>...<P> span on the page.
    paragraphs = findp.findall(page)
    if not paragraphs:
        sys.stderr.write("skipping %s: no <P> description found\n" % (fname,))
        continue
    desc = paragraphs[0]

    # print the object
    name = fname.split('/')[-1].replace('.html', '')
    url = "http://download.oracle.com/javase/6/docs/%s" % (relpath)
    description = spaces.sub(' ', findopenclosetags.sub('', desc).strip())

    print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (name, namespace, url, description, '', '', 'java', 'en'))

    # Inherited and detailed methods are both filed under namespace.ClassName.
    namespaceinherited = "%s.%s" % (namespace, name)

    # finds all inherited methods
    for i in findinherit.findall(page):
        # "Methods inherited from ..." becomes "Method inherited from ...".
        heading = findb.findall(i)[0].replace('Methods', 'Method').replace('<B>', '').replace('</B>', '')
        description = spaces.sub(' ', findopenclosetags.sub('', heading))

        # The first <CODE> element lists the method links separated by '>, '.
        links = findcodeupper.findall(i)[0].replace('<CODE>', '').replace('</CODE>', '')
        for j in links.split('>, '):
            synopsis = ''
            methodname = r1.sub('', j).replace('</A', '').strip()
            url = 'http://download.oracle.com/javase/6/docs/%s#%s' % (relpath, methodname)

            print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (methodname, namespaceinherited, url, description, synopsis, '', 'java', 'en'))

    # finds all methoddetailinfo: take the method detail section without its
    # tables and section anchor, then split it into "<A NAME ... <HR>" chunks.
    detail = findtable.sub('', getsection(lines, methdet, end))
    detail = detail.replace('<A NAME="method_detail"><!-- --></A>', '')
    for meth in findmethoddetail.findall("%s<HR>" % (detail)):
        try:
            methodname = r1.sub('', findh3.findall(meth)[0]).strip()
            methodurl = finda.findall(meth)[0]
            methodurl = methodurl.replace('<A NAME="', '').replace('">', '')
            url = 'http://download.oracle.com/javase/6/docs/%s#%s' % (relpath, methodurl)
            synopsis = findopenclosetags.sub('', findpre.findall(meth)[0].replace('<PRE>', '').replace('</PRE>', '').replace("\r\n", '').strip())
            description = spaces.sub(' ', findopenclosetags.sub('', findddtop.findall(meth)[0].replace('<DD>', '').replace('<P>', '')))
        except IndexError:
            # A chunk lacking one of the expected elements is not a method
            # entry; skip it. Only this case is ignored, so interrupts and
            # output errors are no longer swallowed.
            continue

        print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (methodname, namespaceinherited, url, description, synopsis, '', 'java', 'en'))

0 comments on commit eff98d3

Please sign in to comment.