A simple ETL project interlinking various components of the Hadoop ecosystem: data flows from MySQL through Sqoop into Hive, is repartitioned, read with Pig via HCatalog, written as JSON, inspected with Spark, and finally loaded into MongoDB.
- create a table in MySQL (note: phone must be BIGINT; a 10-digit phone number overflows INT, whose maximum is 2147483647)
create table studdetails(id int, firstname varchar(20), lastname varchar(30), age int,
phone bigint, location varchar(30));
- insert some values
insert into studdetails values(001,"Rajiv","Reddy",21,9848022337,"Hyderabad");
insert into studdetails values(002,"siddarth","Battacharya",22,9848022338,"Kolkata");
insert into studdetails values(003,"Rajesh","Khanna",22,9848022339,"Delhi");
insert into studdetails values(004,"Preethi","Agarwal",21,9848022330,"Pune");
insert into studdetails values(005,"Trupthi","Mohanthy",23,9848022336,"Bhuwaneshwar");
insert into studdetails values(006,"Archana","Mishra",23,9848022335,"Chennai");
insert into studdetails values(007,"Komal","Nayak",24,9848022334,"trivendram");
insert into studdetails values(008,"Bharathi","Nambiayar",24,9848022333,"Chennai");
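A quick sanity check in MySQL before handing the data to Sqoop (plain SQL, nothing project-specific) confirms all eight rows landed and that the phone column holds full 10-digit values:
select count(*), min(phone), max(phone) from studdetails;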
- import the table into Hive with Sqoop (one mapper, target Hive table appu.studdetails1)
sqoop import --connect jdbc:mysql://localhost/appu1 --driver com.mysql.jdbc.Driver \
  --username user --password 12345 \
  --table studdetails --hive-import -m 1 --hive-table appu.studdetails1
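To verify that Sqoop actually created and populated the Hive table, a quick check from the command line (a generic Hive one-liner, not part of the original flow):
hive -e "select * from appu.studdetails1 limit 3;"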
- in Hive, create a partitioned table (we are partitioning based on age)
create table studentpart(id int, firstname string, lastname string, phone bigint, loc string)
PARTITIONED BY (age int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
- enable dynamic partitioning, then load the table from studdetails1 (the source column is named location, as created above)
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;
INSERT OVERWRITE TABLE studentpart PARTITION(age)
SELECT id, firstname, lastname, phone, location, age FROM studdetails1;
- check your partitions
show partitions studentpart;
- view data based on partitions
select * from studentpart where age=21;
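Each dynamic partition materializes as its own HDFS directory (age=21, age=22, ...). Assuming the default Hive warehouse path (an assumption; your metastore configuration may differ), the layout can be listed with:
hdfs dfs -ls /user/hive/warehouse/appu.db/studentpart/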
- read the partitioned table in Pig via HCatalog, filter on age, and store the result as JSON
pig -useHCatalog
A = LOAD 'appu.studentpart' USING org.apache.hive.hcatalog.pig.HCatLoader();
B = filter A by age == 24;
store B into 'studentjson' USING JsonStorage();
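With the sample data above, only the two age-24 rows survive the filter, so each output line in studentjson/ should be a single JSON object roughly like the following (illustrative only; field names come from the studentpart schema, exact formatting is up to JsonStorage):
{"id":7,"firstname":"Komal","lastname":"Nayak","phone":9848022334,"loc":"trivendram","age":24}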
- inspect the JSON output with Spark (e.g., in spark-shell)
import org.apache.spark.sql.SparkSession
val session = SparkSession.builder().appName("test").master("local").getOrCreate()
val df = session.read.json("part-m-000000")
df.show()
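Since Spark infers the schema from the JSON, the DataFrame can be queried directly. A minimal sketch using standard Spark SQL (the view name students is just an example):
// register the DataFrame as a temporary view and query it with SQL
df.printSchema()
df.createOrReplaceTempView("students")
session.sql("select firstname, loc from students where age = 24").show()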
- finally, load the JSON into MongoDB: create a database and a collection in the mongo shell
use student
db.createCollection('studentdetails')
- exit the mongo shell, then import the file with mongoimport
mongoimport -d student -c studentdetails --file part-m-000000
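Back in the mongo shell, a generic sanity check (not part of the original steps) confirms the documents arrived:
use student
db.studentdetails.count()
db.studentdetails.find().pretty()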