In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row

conf = SparkConf().setMaster("local").setAppName("dataFrameExamples")
sc = SparkContext(conf = conf)
sqlContext = SQLContext(sc)



In [2]:
l = [('Ankit', 25), ('Jalfaizy', 22), ('saurabh', 20), ('Bala', 26)]
rdd = sc.parallelize(l)
people = rdd.map(lambda x: Row(name=x[0], age=int(x[1])))
schemaPeople = sqlContext.createDataFrame(people)

type(schemaPeople)

pyspark.sql.dataframe.DataFrame

In [5]:
orders = sqlContext.read.csv("/home/jovyan/examples/data-samples/spark-examples/customer-orders.csv", inferSchema = True, header = True)
orders.show()

+---+----+-----+
| 44|8602|37.19|
+---+----+-----+
| 35|5368|65.89|
|  2|3391|40.64|
| 47|6694|14.98|
| 29| 680|13.08|
| 91|8900|24.59|
| 70|3959|68.68|
| 85|1733|28.53|
| 53|9900|83.55|
| 14|1505| 4.32|
| 51|3378| 19.8|
| 42|6926|57.77|
|  2|4424|55.77|
| 79|9291|33.17|
| 50|3901|23.57|
| 20|6633| 6.49|
| 15|6148|65.53|
| 44|8331|99.19|
|  5|3505|64.18|
| 48|5539|32.42|
| 47|9900|25.66|
+---+----+-----+
only showing top 20 rows



In [6]:
type(orders)

pyspark.sql.dataframe.DataFrame

In [7]:
orders.printSchema()

root
 |-- 44: integer (nullable = true)
 |-- 8602: integer (nullable = true)
 |-- 37.19: double (nullable = true)



In [11]:
Employee = Row("firstName", "lastName", "email", "salary")

employee1 = Employee('Basher','armbrust', 'bash@lp.com', 100000)
employee2 = Employee('Hyuk', 'Kim', 'hkim@lp.com', 90000)
employee3 = Employee('Kevin','Stern', 'kStern@lp.com', 400000)

#print(Employee[0])
#print(employee1)

department1 = Row(id='12345', name='HR')
department2 = Row(id='73802', name='DEV')
department3 = Row(id='42727', name='Ops')

departmentWithEmployee1 = Row(department=department1, employees=[employee1, employee2, employee3])
departmentWithEmployee2 = Row(department=department2, employees=[employee2, employee3])

print(departmentWithEmployee2)

Row(department=Row(id='73802', name='DEV'), employees=[Row(firstName='Hyuk', lastName='Kim', email='hkim@lp.com', salary=90000), Row(firstName='Kevin', lastName='Stern', email='kStern@lp.com', salary=400000)])


In [12]:
departmentWithEmployees_Seq = [departmentWithEmployee1, departmentWithEmployee2]
dframe = sqlContext.createDataFrame(departmentWithEmployees_Seq)
display(dframe)
dframe.show()

DataFrame[department: struct<id:string,name:string>, employees: array<struct<firstName:string,lastName:string,email:string,salary:bigint>>]

+------------+--------------------+
|  department|           employees|
+------------+--------------------+
| [12345, HR]|[[Basher, armbrus...|
|[73802, DEV]|[[Hyuk, Kim, hkim...|
+------------+--------------------+



TypeError: Can not infer schema for type: <class 'str'>

In [38]:
officeSupply_df = sqlContext.read.csv("/home/jovyan/examples/data-samples/spark-examples/P1-OfficeSupplies.csv", inferSchema = True, header=True)
officeSupply_df.show()
officeSupply_df.printSchema()
print(officeSupply_df.columns)
officeSupply_df.count()

+-----------+-------+-------+-------+-----+----------+
|  OrderDate| Region|    Rep|   Item|Units|Unit Price|
+-----------+-------+-------+-------+-----+----------+
| 4-Jul-2014|   East|Richard|Pen Set|   62|      4.99|
|12-Jul-2014|   East|   Nick| Binder|   29|      1.99|
|21-Jul-2014|Central| Morgan|Pen Set|   55|     12.49|
|29-Jul-2014|   East|  Susan| Binder|   81|     19.99|
| 7-Aug-2014|Central|Matthew|Pen Set|   42|     23.95|
|15-Aug-2014|   East|Richard| Pencil|   35|      4.99|
|24-Aug-2014|   West|  James|   Desk|    3|     275.0|
| 1-Sep-2014|Central|  Smith|   Desk|    2|     125.0|
|10-Sep-2014|Central|   Bill| Pencil|    7|      1.29|
|18-Sep-2014|   East|Richard|Pen Set|   16|     15.99|
|27-Sep-2014|   West|  James|    Pen|   76|      1.99|
| 5-Oct-2014|Central| Morgan| Binder|   28|      8.99|
|14-Oct-2014|   West| Thomas| Binder|   57|     19.99|
|22-Oct-2014|   East|Richard|    Pen|   64|      8.99|
|31-Oct-2014|Central| Rachel| Pencil|   14|      1.29|
| 8-Nov-20

43

In [66]:
from pyspark.sql.functions import to_timestamp, year, month

# Convert string column, "OrderDate" into DateTime
d = officeSupply_df.select(officeSupply_df.Region, officeSupply_df.Rep, \
                          officeSupply_df.Item, officeSupply_df.Units, officeSupply_df.columns[5], \
                          to_timestamp(officeSupply_df.OrderDate, 'dd-MMM-yyyy').alias('OrderPlaced'))

#extract year and month from OrderPlaced
d = d.select(d.Region, d.Rep, d.Item, d.Units, d.columns[4], d.OrderPlaced, year(d.OrderPlaced).alias('year'), \
             month(d.OrderPlaced).alias('month') \
            )
             
d.show()
d.describe("Unit Price")

d.filter(d.month == 9).show()
d.filter(d.month == 11).count()
d.filter((d.Rep=='Richard') | (d.Rep=='Alex')).show()
d.groupby("Region").count().show()

d.groupby("Item", "Unit Price").pivot("month").sum("Unit Price").show()
#d = officeSupply_df.select(to_timestamp(officeSupply_df.OrderDate, 'dd-MMM-yyyy').alias('dt'))
#print(d)
#d.printSchema()
#Convert string to datetime and extract only month part
#d = officeSupply_df.select(officeSupply_df.Region, officeSupply_df.Item, officeSupply_df.Units,month(to_timestamp(officeSupply_df.OrderDate, 'dd-MMM-yyyy')).alias('dt') )
#d.show()

+-------+-------+-------+-----+----------+-------------------+----+-----+
| Region|    Rep|   Item|Units|Unit Price|        OrderPlaced|year|month|
+-------+-------+-------+-----+----------+-------------------+----+-----+
|   East|Richard|Pen Set|   62|      4.99|2014-07-04 00:00:00|2014|    7|
|   East|   Nick| Binder|   29|      1.99|2014-07-12 00:00:00|2014|    7|
|Central| Morgan|Pen Set|   55|     12.49|2014-07-21 00:00:00|2014|    7|
|   East|  Susan| Binder|   81|     19.99|2014-07-29 00:00:00|2014|    7|
|Central|Matthew|Pen Set|   42|     23.95|2014-08-07 00:00:00|2014|    8|
|   East|Richard| Pencil|   35|      4.99|2014-08-15 00:00:00|2014|    8|
|   West|  James|   Desk|    3|     275.0|2014-08-24 00:00:00|2014|    8|
|Central|  Smith|   Desk|    2|     125.0|2014-09-01 00:00:00|2014|    9|
|Central|   Bill| Pencil|    7|      1.29|2014-09-10 00:00:00|2014|    9|
|   East|Richard|Pen Set|   16|     15.99|2014-09-18 00:00:00|2014|    9|
|   West|  James|    Pen|   76|      1