package com.amazonaws.bigdatablog.edba;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.Random;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.CSVRecord;
import com.amazonaws.regions.Region;
import com.amazonaws.regions.Regions;
import com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduceClient;
import com.amazonaws.services.elasticmapreduce.model.AddJobFlowStepsRequest;
import com.amazonaws.services.elasticmapreduce.model.AddJobFlowStepsResult;
import com.amazonaws.services.elasticmapreduce.model.Application;
import com.amazonaws.services.elasticmapreduce.model.ClusterState;
import com.amazonaws.services.elasticmapreduce.model.ClusterSummary;
import com.amazonaws.services.elasticmapreduce.model.DescribeClusterRequest;
import com.amazonaws.services.elasticmapreduce.model.DescribeStepRequest;
import com.amazonaws.services.elasticmapreduce.model.HadoopJarStepConfig;
import com.amazonaws.services.elasticmapreduce.model.ListClustersRequest;
import com.amazonaws.services.elasticmapreduce.model.ListClustersResult;
import com.amazonaws.services.elasticmapreduce.model.StepConfig;
import com.amazonaws.services.elasticmapreduce.model.StepState;
import com.amazonaws.services.elasticmapreduce.model.Tag;
import com.amazonaws.services.elasticmapreduce.util.StepFactory;
import com.amazonaws.services.lambda.runtime.Context;
import com.amazonaws.services.lambda.runtime.events.S3Event;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.event.S3EventNotification.S3EventNotificationRecord;
import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.util.StringUtils;
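/**
 * Container for the Lambda functions of the EMR-based data processing pipeline:
 * input validation/normalization, audit tracking of validated files, EMR job
 * criteria checks and step submission, and EMR step monitoring. Each public
 * method is deployed as its own Lambda handler; an illustrative handler string
 * (adjust to your deployment) would be
 * com.amazonaws.bigdatablog.edba.LambdaContainer::validateAndNormalizeInputData.
 */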
public class LambdaContainer {
	// Validation/Conversion Layer Lambda function
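	/**
	 * Reads each S3 object referenced in the event, parses it as tab-delimited
	 * data (lines starting with '-' are treated as comments), drops a non-numeric
	 * header row if present, and writes the normalized CSV back to the same
	 * bucket under the "validated/" prefix.
	 */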
	public void validateAndNormalizeInputData(S3Event event, Context ctx) throws Exception {
		AmazonS3 s3Client = new AmazonS3Client();
		List<S3EventNotificationRecord> notificationRecords = event.getRecords();
		for (S3EventNotificationRecord record : notificationRecords) {
			String eventFileName = record.getS3().getObject().getKey();
			S3Object s3Object = s3Client.getObject(new GetObjectRequest(record.getS3().getBucket().getName(), eventFileName));
			InputStream inputFileStream = s3Object.getObjectContent();
			CSVParser fileParser = new CSVParser(new InputStreamReader(inputFileStream), CSVFormat.TDF.withCommentMarker('-'));
			List<CSVRecord> records = fileParser.getRecords();
			// Drop the first record only if it contains non-numeric characters, i.e. a header
			// row. Joining the fields avoids matching on CSVRecord.toString(), whose brackets
			// and separators always contain non-digit characters.
			if (!records.isEmpty() && String.join("", records.get(0)).matches(".*[^0-9].*")) {
				records.remove(0);
			}
			StringWriter writer = new StringWriter();
			CSVPrinter printer = new CSVPrinter(writer, CSVFormat.DEFAULT.withRecordSeparator(System.getProperty("line.separator")));
			printer.printRecords(records);
			printer.flush();
			byte[] normalizedBytes = writer.toString().getBytes("utf-8");
			InputStream readableDataStream = new ByteArrayInputStream(normalizedBytes);
			// Setting the content length up front keeps the SDK from buffering the whole stream.
			ObjectMetadata metadata = new ObjectMetadata();
			metadata.setContentLength(normalizedBytes.length);
			s3Client.putObject(record.getS3().getBucket().getName(), "validated/" + eventFileName + ".csv", readableDataStream, metadata);
			printer.close();
			fileParser.close();
			readableDataStream.close();
		}
	}
	// Tracking Input Layer Lambda function
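	/**
	 * Records each validated file in the audit table via a batched insert; the
	 * SQL is read from the sql.auditValidatedFile property.
	 */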
	public void auditValidatedFile(S3Event event, Context ctx) throws Exception {
		Connection conn = new com.mysql.jdbc.Driver().connect(props.getProperty("url"), props);
		List<S3EventNotificationRecord> notificationRecords = event.getRecords();
		PreparedStatement ps = conn.prepareStatement(props.getProperty("sql.auditValidatedFile"));
		for (S3EventNotificationRecord record : notificationRecords) {
			String fileURL = record.getS3().getBucket().getName() + "/" + record.getS3().getObject().getKey();
			ps.setString(1, fileURL);
			ps.setString(2, "VALIDATED");
			ps.setString(3, "VALIDATED");
			ps.addBatch();
		}
		ps.executeBatch();
		ps.close();
		conn.close();
	}
	// EMR Job Criteria Check and Submission Lambda function
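	/**
	 * Scans the job configuration table and, for each job whose input files are
	 * newer than its last run, meet the minimum file count, and pass any
	 * additional SQL criteria, submits a Spark step to a randomly chosen active
	 * tagged cluster and records the cluster/step id against the job config.
	 */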
	public void checkConditionStatusAndFireEMRStep() throws Exception {
		Connection conn = new com.mysql.jdbc.Driver().connect(props.getProperty("url"), props);
		Statement conditionFetchStmt = conn.createStatement();
		ResultSet rs = conditionFetchStmt.executeQuery(props.getProperty("sql.conditionFetch"));
		PreparedStatement updateJobConfigPS = conn.prepareStatement(props.getProperty("sql.updateJobConfigStatus"));
		Statement jobInputFilesMinTimestampStmt = conn.createStatement();
		Statement updateSubmittedJobsStmt = conn.createStatement();
		List<String> activeClusters = getActiveTaggedClusters();
		String clusterId = null;
		while (rs.next()) {
			System.out.println("job_input_pattern :: " + rs.getString("job_input_pattern"));
			System.out.println("sql.jobInputFilesMinTSAndCount :: " + props.getProperty("sql.jobInputFilesMinTSAndCount"));
			ResultSet conditionQueryResult = jobInputFilesMinTimestampStmt.executeQuery(props.getProperty("sql.jobInputFilesMinTSAndCount") + " " + rs.getString("job_input_pattern"));
			conditionQueryResult.next();
			if (conditionQueryResult.getTimestamp("min_lvt").after(rs.getTimestamp("last_run_timestamp"))
					&& conditionQueryResult.getInt("file_count") >= rs.getInt("job_min_file_count")
					&& isAdditionalCriteriaPassed(rs.getString("job_addl_criteria"), conn)) {
				// Pick one of the active tagged clusters at random to spread the load.
				clusterId = activeClusters.get(new Random().nextInt(activeClusters.size()));
				String jobId = fireEMRJob(rs.getString("job_params"), clusterId);
				updateJobConfigPS.setString(1, clusterId + ":" + jobId);
				updateJobConfigPS.setString(2, rs.getString("job_config_id"));
				updateJobConfigPS.addBatch();
				updateSubmittedJobsStmt.addBatch(props.getProperty("sql.updateSubmittedJobsJSON").replaceAll("\\?", rs.getString("job_config_id")) + " " + rs.getString("job_input_pattern"));
			}
		}
		updateJobConfigPS.executeBatch();
		updateSubmittedJobsStmt.executeBatch();
		updateSubmittedJobsStmt.close();
		updateJobConfigPS.close();
		jobInputFilesMinTimestampStmt.close();
		conditionFetchStmt.close();
		conn.close();
	}
	// EMR Job Monitor Lambda function
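	/**
	 * Polls EMR for the status of all open steps and persists COMPLETED or
	 * FAILED states back to the job configuration table.
	 */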
	public void monitorEMRStep() throws Exception {
		Connection conn = new com.mysql.jdbc.Driver().connect(props.getProperty("url"), props);
		ResultSet openStepsRS = conn.createStatement().executeQuery(props.getProperty("sql.retrieveOpenSteps"));
		AmazonElasticMapReduceClient emr = new AmazonElasticMapReduceClient();
		DescribeStepRequest stepReq = new DescribeStepRequest();
		PreparedStatement ps = conn.prepareStatement(props.getProperty("sql.updateStepStatus"));
		while (openStepsRS.next()) {
			stepReq.setClusterId(openStepsRS.getString("cluster_id"));
			stepReq.setStepId(openStepsRS.getString("step_id"));
			String stepState = emr.describeStep(stepReq).getStep().getStatus().getState();
			if (stepState.equals(StepState.COMPLETED.toString())) {
				ps.setString(1, StepState.COMPLETED.toString());
			} else if (stepState.equals(StepState.FAILED.toString())) {
				ps.setString(1, StepState.FAILED.toString());
			} else {
				// Step is still pending or running; skip it so an unset parameter
				// is never added to the batch.
				continue;
			}
			ps.setString(2, openStepsRS.getString("job_config_id"));
			ps.addBatch();
		}
		ps.executeBatch();
		ps.close();
		conn.close();
	}
	// Adds an EMR step
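	/**
	 * Submits a Spark step (command-runner.jar with the comma-separated params)
	 * to the given cluster and returns the new step id.
	 */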
	protected String fireEMRJob(String paramsStr, String clusterId) {
		StepFactory stepFactory = new StepFactory();
		AmazonElasticMapReduceClient emr = new AmazonElasticMapReduceClient();
		emr.setRegion(Region.getRegion(Regions.fromName(System.getenv("AWS_REGION"))));
		// Note: this Application config is only needed when launching a cluster;
		// it is never attached to the AddJobFlowSteps request below.
		Application sparkConfig = new Application()
				.withName("Spark");
		String[] params = paramsStr.split(",");
		// Note: this debugging step is defined but never added to the request;
		// include it in withSteps(...) if step-level debugging is wanted.
		StepConfig enabledebugging = new StepConfig()
				.withName("Enable debugging")
				.withActionOnFailure("TERMINATE_JOB_FLOW")
				.withHadoopJarStep(stepFactory.newEnableDebuggingStep());
		HadoopJarStepConfig sparkStepConf = new HadoopJarStepConfig()
				.withJar("command-runner.jar")
				.withArgs(params);
		final StepConfig sparkStep = new StepConfig()
				.withName("Spark Step")
				.withActionOnFailure("CONTINUE")
				.withHadoopJarStep(sparkStepConf);
		AddJobFlowStepsRequest request = new AddJobFlowStepsRequest(clusterId)
				.withSteps(sparkStep);
		AddJobFlowStepsResult result = emr.addJobFlowSteps(request);
		return result.getStepIds().get(0);
	}
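	/**
	 * Returns the ids of all WAITING clusters carrying the tag key configured
	 * under the edba.cluster.tag.key property.
	 */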
	protected List<String> getActiveTaggedClusters() throws Exception {
		AmazonElasticMapReduceClient emrClient = new AmazonElasticMapReduceClient();
		List<String> waitingClusters = new ArrayList<String>();
		ListClustersResult clusterResult = emrClient.listClusters(new ListClustersRequest().withClusterStates(ClusterState.WAITING));
		DescribeClusterRequest specificTagDescribe = new DescribeClusterRequest();
		specificTagDescribe.putCustomQueryParameter("Cluster.Tags", null);
		for (ClusterSummary cluster : clusterResult.getClusters()) {
			System.out.println("list cluster id " + cluster.getId());
			List<Tag> tagList = emrClient.describeCluster(specificTagDescribe.withClusterId(cluster.getId())).getCluster().getTags();
			for (Tag tag : tagList) {
				if (tag.getKey().equals(props.getProperty("edba.cluster.tag.key"))) {
					waitingClusters.add(cluster.getId());
				}
			}
		}
		return waitingClusters;
	}
	/**
	 * Checks whether the additional criteria query returns a non-empty ResultSet.
	 * A null or empty query is treated as "no additional criteria" and passes.
	 */
	protected boolean isAdditionalCriteriaPassed(String sql, Connection conn) throws Exception {
		if (StringUtils.isNullOrEmpty(sql)) {
			return true;
		}
		ResultSet rs = conn.createStatement().executeQuery(sql);
		return rs.next(); // an empty ResultSet means the criteria did not pass
	}
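	// Expected keys in edba_lambda_config.properties, as referenced throughout this
	// class (the values below are illustrative placeholders, not the project's
	// actual configuration):
	//   url                            = jdbc:mysql://<host>:3306/<db>?user=...&password=...
	//   sql.auditValidatedFile         = INSERT ... (parameters: file URL, status, status)
	//   sql.conditionFetch             = SELECT ... job configurations to evaluate
	//   sql.jobInputFilesMinTSAndCount = SELECT min_lvt, file_count ... (input pattern appended)
	//   sql.updateJobConfigStatus      = UPDATE ... (parameters: cluster:step id, job_config_id)
	//   sql.updateSubmittedJobsJSON    = UPDATE ... ('?' replaced by job_config_id, pattern appended)
	//   sql.retrieveOpenSteps          = SELECT cluster_id, step_id, job_config_id ...
	//   sql.updateStepStatus           = UPDATE ... (parameters: step state, job_config_id)
	//   edba.cluster.tag.key           = <EMR tag key identifying eligible clusters>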
	static Properties props = null;
	static {
		try {
			props = new Properties();
			props.load(LambdaContainer.class.getResourceAsStream("/edba_lambda_config.properties"));
		} catch (Exception ce) {
			ce.printStackTrace();
		}
	}
}