From f663af056700d73c7479f64aea63dab0e1814a4b Mon Sep 17 00:00:00 2001 From: Sujen Shah Date: Mon, 1 Aug 2016 11:46:39 -0400 Subject: [PATCH] Fix for Nutch-2246: Refactor /seed end point --- src/java/org/apache/nutch/crawl/Injector.java | 33 +++++++---- src/java/org/apache/nutch/metadata/Nutch.java | 2 + .../org/apache/nutch/service/NutchServer.java | 7 +++ .../org/apache/nutch/service/SeedManager.java | 33 +++++++++++ .../nutch/service/impl/SeedManagerImpl.java | 58 +++++++++++++++++++ .../nutch/service/model/request/SeedList.java | 10 ++++ .../nutch/service/resources/SeedResource.java | 27 ++++++++- 7 files changed, 156 insertions(+), 14 deletions(-) create mode 100644 src/java/org/apache/nutch/service/SeedManager.java create mode 100644 src/java/org/apache/nutch/service/impl/SeedManagerImpl.java diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index 383aaf154c..65757823f8 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -41,6 +41,7 @@ import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.LockUtil; +import org.apache.nutch.service.NutchServer; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchTool; import org.apache.nutch.util.TimingUtil; @@ -477,11 +478,28 @@ public int run(String[] args) throws Exception { */ public Map run(Map args, String crawlId) throws Exception { - if (args.size() < 1) { - throw new IllegalArgumentException("Required arguments <url_dir>"); + if(args.size()<1){ + throw new IllegalArgumentException("Required arguments <url_dir> or <seedName>"); + } + Path input; + Object path = null; + if(args.containsKey(Nutch.ARG_SEEDDIR)) { + path = args.get(Nutch.ARG_SEEDDIR); + } + else if(args.containsKey(Nutch.ARG_SEEDNAME)) { + path = NutchServer.getInstance().getSeedManager(). 
+ getSeedList((String)args.get(Nutch.ARG_SEEDNAME)).getSeedFilePath(); + } + else { + throw new IllegalArgumentException("Required arguments or "); + } + if(path instanceof Path) { + input = (Path) path; + } + else { + input = new Path(path.toString()); } Map results = new HashMap(); - Path crawlDb; if (args.containsKey(Nutch.ARG_CRAWLDB)) { Object crawldbPath = args.get(Nutch.ARG_CRAWLDB); @@ -493,15 +511,6 @@ public Map run(Map args, String crawlId) } else { crawlDb = new Path(crawlId + "/crawldb"); } - - Path input; - Object path = args.get(Nutch.ARG_SEEDDIR); - if (path instanceof Path) { - input = (Path) path; - } else { - input = new Path(path.toString()); - } - inject(crawlDb, input); results.put(Nutch.VAL_RESULT, Integer.toString(0)); return results; diff --git a/src/java/org/apache/nutch/metadata/Nutch.java b/src/java/org/apache/nutch/metadata/Nutch.java index de80399cdd..cbc3317c74 100644 --- a/src/java/org/apache/nutch/metadata/Nutch.java +++ b/src/java/org/apache/nutch/metadata/Nutch.java @@ -84,6 +84,8 @@ public interface Nutch { public static final String CRAWL_ID_KEY = "storage.crawl.id"; /** Argument key to specify location of the seed url dir for the REST endpoints **/ public static final String ARG_SEEDDIR = "url_dir"; + /** Argument key to specify name of a seed list for the REST endpoints **/ + public static final String ARG_SEEDNAME = "seedName"; /** Argument key to specify the location of crawldb for the REST endpoints **/ public static final String ARG_CRAWLDB = "crawldb"; /** Argument key to specify the location of linkdb for the REST endpoints **/ diff --git a/src/java/org/apache/nutch/service/NutchServer.java b/src/java/org/apache/nutch/service/NutchServer.java index e20670785f..6d531e0fcd 100644 --- a/src/java/org/apache/nutch/service/NutchServer.java +++ b/src/java/org/apache/nutch/service/NutchServer.java @@ -41,6 +41,7 @@ import org.apache.nutch.service.impl.ConfManagerImpl; import org.apache.nutch.service.impl.JobFactory; import 
org.apache.nutch.service.impl.JobManagerImpl; +import org.apache.nutch.service.impl.SeedManagerImpl; import org.apache.nutch.service.impl.NutchServerPoolExecutor; import org.apache.nutch.service.model.response.JobInfo; import org.apache.nutch.service.model.response.JobInfo.State; @@ -74,6 +75,7 @@ public class NutchServer { private boolean running; private ConfManager configManager; private JobManager jobManager; + private SeedManager seedManager; private JAXRSServerFactoryBean sf; private static FetchNodeDb fetchNodeDb; @@ -86,6 +88,7 @@ public class NutchServer { private NutchServer() { configManager = new ConfManagerImpl(); + seedManager = new SeedManagerImpl(); BlockingQueue runnables = Queues.newArrayBlockingQueue(JOB_CAPACITY); NutchServerPoolExecutor executor = new NutchServerPoolExecutor(10, JOB_CAPACITY, 1, TimeUnit.HOURS, runnables); jobManager = new JobManagerImpl(new JobFactory(), configManager, executor); @@ -149,6 +152,10 @@ public ConfManager getConfManager() { public JobManager getJobManager() { return jobManager; } + + public SeedManager getSeedManager() { + return seedManager; + } public FetchNodeDb getFetchNodeDb(){ return fetchNodeDb; diff --git a/src/java/org/apache/nutch/service/SeedManager.java b/src/java/org/apache/nutch/service/SeedManager.java new file mode 100644 index 0000000000..a96c4ac884 --- /dev/null +++ b/src/java/org/apache/nutch/service/SeedManager.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.service; + +import java.util.Map; + +import org.apache.nutch.service.model.request.SeedList; + +public interface SeedManager { + + public SeedList getSeedList(String seedName); + + public void setSeedList(String seedName, SeedList seedList); + + public boolean deleteSeedList(String seedName); + + public Map getSeeds(); +} diff --git a/src/java/org/apache/nutch/service/impl/SeedManagerImpl.java b/src/java/org/apache/nutch/service/impl/SeedManagerImpl.java new file mode 100644 index 0000000000..c7b7607924 --- /dev/null +++ b/src/java/org/apache/nutch/service/impl/SeedManagerImpl.java @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.service.impl; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.nutch.service.SeedManager; +import org.apache.nutch.service.model.request.SeedList; + +public class SeedManagerImpl implements SeedManager { + + private static Map<String, SeedList> seeds; + + public SeedManagerImpl() { + seeds = new HashMap<>(); + } + + public SeedList getSeedList(String seedName) { + if(seeds.containsKey(seedName)) { + return seeds.get(seedName); + } + else + return null; + } + + public void setSeedList(String seedName, SeedList seedList) { + seeds.put(seedName, seedList); + } + + public Map<String, SeedList> getSeeds(){ + return seeds; + } + + public boolean deleteSeedList(String seedName) { + if(seeds.containsKey(seedName)) { + seeds.remove(seedName); + return true; + } + else + return false; + } +} diff --git a/src/java/org/apache/nutch/service/model/request/SeedList.java b/src/java/org/apache/nutch/service/model/request/SeedList.java index bbb3e2a1f0..5ba60da651 100644 --- a/src/java/org/apache/nutch/service/model/request/SeedList.java +++ b/src/java/org/apache/nutch/service/model/request/SeedList.java @@ -29,6 +29,8 @@ public class SeedList implements Serializable { private Long id; private String name; + private String seedFilePath; + @JsonManagedReference private Collection seedUrls; @@ -57,6 +59,14 @@ public void setName(String name) { this.name = name; } + public String getSeedFilePath() { + return seedFilePath; + } + + public void setSeedFilePath(String seedFilePath) { + this.seedFilePath = seedFilePath; + } + @JsonIgnore public int getSeedUrlsCount() { if (CollectionUtils.isEmpty(seedUrls)) { diff --git a/src/java/org/apache/nutch/service/resources/SeedResource.java b/src/java/org/apache/nutch/service/resources/SeedResource.java index 5261139ba0..638af332fa 100644 --- a/src/java/org/apache/nutch/service/resources/SeedResource.java +++ b/src/java/org/apache/nutch/service/resources/SeedResource.java @@ -24,8 +24,10 @@ import java.io.FileWriter; import 
java.io.IOException; import java.util.Collection; +import java.util.Map; import javax.ws.rs.Consumes; +import javax.ws.rs.GET; import javax.ws.rs.POST; import javax.ws.rs.Path; import javax.ws.rs.Produces; @@ -35,6 +37,7 @@ import javax.ws.rs.core.Response.Status; import org.apache.commons.collections.CollectionUtils; +import org.apache.nutch.service.NutchServer; import org.apache.nutch.service.model.request.SeedList; import org.apache.nutch.service.model.request.SeedUrl; import org.slf4j.Logger; @@ -47,6 +50,23 @@ public class SeedResource extends AbstractResource { private static final Logger log = LoggerFactory .getLogger(AdminResource.class); + /** + * Gets the list of seedFiles already created + * @return + */ + @GET + @Path("/") + @Produces(MediaType.APPLICATION_JSON) + public Response getSeedLists() { + Map seeds = NutchServer.getInstance().getSeedManager().getSeeds(); + if(seeds!=null) { + return Response.ok(seeds).build(); + } + else { + return Response.ok().build(); + } + } + /** * Method creates seed list file and returns temporary directory path * @param seedList @@ -70,8 +90,11 @@ public Response createSeedFile(SeedList seedList) { writeUrl(writer, seedUrl); } } - - return Response.ok().entity(seedFile.getParent()).build(); + String seedFilePath = seedFile.getParent(); + seedList.setSeedFilePath(seedFilePath); + NutchServer.getInstance().getSeedManager(). + setSeedList(seedList.getName(), seedList); + return Response.ok().entity(seedFilePath).build(); } private void writeUrl(BufferedWriter writer, SeedUrl seedUrl) {