In [1]:
# DISPLAY THE SPARK CONTEXT. sc IS NOT CREATED ANYWHERE IN THIS NOTEBOOK, SO IT MUST BE
# PROVIDED BY THE ENVIRONMENT (PRESUMABLY A KERNEL LAUNCHED THROUGH THE pyspark SHELL - CONFIRM)
sc


In [2]:
from datetime import datetime

from pyspark.sql import SQLContext
from pyspark.sql.types import Row

In [3]:
# parallelize TURNS A PYTHON LIST INTO AN RDD; HERE EVERY SCALAR (int OR str) IS ITS OWN ELEMENT
simple_data = sc.parallelize([1, "Alice", 50])
simple_data

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

In [4]:
# NUMBER OF ELEMENTS IN THE RDD (EACH SCALAR OF THE ORIGINAL LIST COUNTS AS ONE)
simple_data.count()

3

In [5]:
# FIRST ELEMENT OF THE RDD
simple_data.first()

1

In [6]:
# take(n) RETURNS THE FIRST n ELEMENTS AS A PYTHON LIST
simple_data.take(2)

[1, 'Alice']

In [8]:
# USE THE .collect METHOD TO SEE ALL THE ELEMENTS OF AN RDD (RETURNED AS A PYTHON LIST)
simple_data.collect()

[1, 'Alice', 50]

In [9]:
# A FLAT RDD OF SCALARS CANNOT BE CONVERTED TO A DF: toDF CANNOT INFER A SCHEMA
# FROM A BARE int. THE ERROR IS THE POINT OF THIS CELL, SO IT IS CAUGHT AND
# PRINTED INSTEAD OF ABORTING A "RESTART & RUN ALL".
try:
    df = simple_data.toDF()
except TypeError as schema_error:
    print("TypeError: {}".format(schema_error))

TypeError: Can not infer schema for type: <class 'int'>

In [11]:
# EACH INNER LIST BECOMES ONE ELEMENT (ONE RECORD) OF THE RDD BUILT BY parallelize
records = sc.parallelize([
    [1, "Alice", 50],
    [2, "Bob", 100],
])
records

ParallelCollectionRDD[9] at parallelize at PythonRDD.scala:195

In [12]:
# ALL ELEMENTS: EACH INNER LIST IS KEPT AS A SINGLE RDD ELEMENT
records.collect()

[[1, 'Alice', 50], [2, 'Bob', 100]]

In [13]:
# TWO ELEMENTS, ONE PER INNER LIST
records.count()

2

In [15]:
# THE FIRST ELEMENT IS NOW A WHOLE RECORD (A LIST), NOT A SCALAR
records.first()

[1, 'Alice', 50]

In [16]:
# FIRST TWO RECORDS, RETURNED AS A LIST OF LISTS
records.take(2)

[[1, 'Alice', 50], [2, 'Bob', 100]]

In [17]:
# take(1) STILL RETURNS A LIST (HOLDING ONE RECORD), WHEREAS first() RETURNS THE RECORD ITSELF
records.take(1)

[[1, 'Alice', 50]]

In [19]:
# UNLIKE THE FLAT RDD, AN RDD OF LISTS CAN BE CONVERTED TO A DF;
# NO NAMES ARE GIVEN, SO THE COLUMNS DEFAULT TO _1, _2, _3
df = records.toDF()
df

DataFrame[_1: bigint, _2: string, _3: bigint]

In [20]:
# .show() PRINTS THE FIRST 20 ROWS (THE DEFAULT LIMIT) IN A TABULAR FORMAT
df.show()

+---+-----+---+
| _1|   _2| _3|
+---+-----+---+
|  1|Alice| 50|
|  2|  Bob|100|
+---+-----+---+



In [23]:
# USE Row OBJECTS SO THAT THE DF GENERATED FROM THE RDD GETS NAMED COLUMNS
data = sc.parallelize([Row(id=1, name="Alice", score=50)])

In [25]:
# THE REPR OF AN RDD SHOWS ONLY ITS TYPE AND ORIGIN, NOT ITS CONTENTS
data

ParallelCollectionRDD[35] at parallelize at PythonRDD.scala:195

In [26]:
# collect() RETURNS THE Row OBJECTS WITH THEIR FIELD NAMES INTACT
data.collect()

[Row(id=1, name='Alice', score=50)]

In [27]:
# THE RDD HOLDS A SINGLE RECORD, SO count() RETURNS 1
data.count()

1

In [31]:
# THE Row FIELD NAMES (id, name, score) BECOME THE DF COLUMN NAMES
df = data.toDF()
df.show()

+---+-----+-----+
| id| name|score|
+---+-----+-----+
|  1|Alice|   50|
+---+-----+-----+



In [33]:
# THREE NAMED RECORDS; EVERY Row CARRIES THE SAME FIELDS (id, name, score)
data = sc.parallelize([
    Row(id=1, name="Alice", score=50),
    Row(id=2, name="Bob", score=100),
    Row(id=3, name="Charlee", score=75),
])

In [35]:
# STRUCTURED DATA IN AN RDD CAN BE CONVERTED INTO A DF (ONE DF ROW PER Row OBJECT)
df = data.toDF()
df.show()

+---+-------+-----+
| id|   name|score|
+---+-------+-----+
|  1|  Alice|   50|
|  2|    Bob|  100|
|  3|Charlee|   75|
+---+-------+-----+



In [36]:
# ONE Row MIXING THREE SCALAR TYPES: float, int AND str
complex_data = sc.parallelize([
    Row(col_float=1.44, col_integer=10, col_string="John"),
])

In [37]:
# CONVERT TO A DF AND DISPLAY IT; EACH Row FIELD BECOMES ONE COLUMN
complex_data_df = complex_data.toDF()
complex_data_df.show()

+---------+-----------+----------+
|col_float|col_integer|col_string|
+---------+-----------+----------+
|     1.44|         10|      John|
+---------+-----------+----------+



In [38]:
# SAME RECORD AS BEFORE, EXTENDED WITH A bool FIELD AND A list FIELD
complex_data = sc.parallelize([
    Row(
        col_float=1.44,
        col_integer=10,
        col_string="John",
        col_boolean=True,
        col_list=[1, 2, 3],
    ),
])

In [39]:
# NOTE: IN THE SAVED OUTPUT THE COLUMNS APPEAR IN ALPHABETICAL ORDER
# (col_boolean FIRST), NOT IN THE ORDER THE Row KEYWORDS WERE WRITTEN
complex_data_df = complex_data.toDF()
complex_data_df.show()

+-----------+---------+-----------+---------+----------+
|col_boolean|col_float|col_integer| col_list|col_string|
+-----------+---------+-----------+---------+----------+
|       true|     1.44|         10|[1, 2, 3]|      John|
+-----------+---------+-----------+---------+----------+



In [46]:
# THREE ROWS MIXING NESTED TYPES: A list, A dict, A NESTED Row AND A datetime
# NOTE(review): THE KEY "ki" LOOKS LIKE A TYPO FOR "k1" (CF. "k2", "k3") - CONFIRM
# BEFORE CHANGING IT, THE SAVED OUTPUTS SHOW "ki"
complex_data = sc.parallelize([
    Row(
        col_list=[1, 2, 3],
        col_dict={"ki": 0},
        col_row=Row(a=10, b=20, c=30),
        col_time=datetime(2014, 8, 1, 14, 1, 5),
    ),
    Row(
        col_list=[1, 2, 3, 4, 5],
        col_dict={"ki": 0, "k2": 1},
        col_row=Row(a=40, b=50, c=60),
        col_time=datetime(2014, 8, 1, 14, 1, 6),
    ),
    Row(
        col_list=[1, 2, 3, 4, 5, 6, 7],
        col_dict={"ki": 0, "k2": 1, "k3": 2},
        col_row=Row(a=70, b=80, c=90),
        col_time=datetime(2014, 8, 3, 14, 1, 7),
    ),
])

In [47]:
# NESTED VALUES ARE RENDERED INLINE; LONG CELLS ARE CUT OFF WITH "..." (SEE THE LAST ROW OF THE OUTPUT)
complex_data_df = complex_data.toDF()
complex_data_df.show()

+--------------------+--------------------+------------+-------------------+
|            col_dict|            col_list|     col_row|           col_time|
+--------------------+--------------------+------------+-------------------+
|           [ki -> 0]|           [1, 2, 3]|[10, 20, 30]|2014-08-01 14:01:05|
|  [ki -> 0, k2 -> 1]|     [1, 2, 3, 4, 5]|[40, 50, 60]|2014-08-01 14:01:06|
|[k3 -> 2, ki -> 0...|[1, 2, 3, 4, 5, 6...|[70, 80, 90]|2014-08-03 14:01:07|
+--------------------+--------------------+------------+-------------------+



In [48]:
# ALL WORK ABOVE WENT THROUGH THE SPARK CONTEXT (sc); A SQLContext BUILT ON TOP OF IT
# ALLOWS US TO RUN SQL QUERIES AND TO CREATE DATAFRAMES DIRECTLY
sqlContext = SQLContext(sc)
sqlContext

<pyspark.sql.context.SQLContext at 0x11e7158d0>

In [49]:
# range(5) BUILDS A SINGLE-COLUMN DF NAMED id HOLDING THE VALUES 0..4
df = sqlContext.range(5)
df

DataFrame[id: bigint]

In [50]:
# DISPLAY THE FIVE GENERATED ROWS
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [51]:
# ROW COUNT OF THE DF
df.count()

5

In [52]:
# (NAME, SCORE) TUPLES IN A PLAIN PYTHON LIST - NO RDD INVOLVED
data = [
    ("Alice", 50),
    ("Bob", 100),
    ("Charlee", 150),
]

In [53]:
# createDataFrame ACCEPTS THE PYTHON LIST DIRECTLY; WITHOUT NAMES THE COLUMNS DEFAULT TO _1, _2
sqlContext.createDataFrame(data).show()

+-------+---+
|     _1| _2|
+-------+---+
|  Alice| 50|
|    Bob|100|
|Charlee|150|
+-------+---+



In [56]:
# ASSIGN COLUMN NAMES TO THE DATA FRAME BY PASSING A LIST OF NAMES AS THE SECOND ARGUMENT
sqlContext.createDataFrame(data, ['Name', 'Score']).show()

+-------+-----+
|   Name|Score|
+-------+-----+
|  Alice|   50|
|    Bob|  100|
|Charlee|  150|
+-------+-----+



In [57]:
# PLAIN PYTHON LIST OF TUPLES (NOT AN RDD); createDataFrame CONSUMES IT DIRECTLY BELOW
# TUPLE LAYOUT: float, int, str, bool, list, dict, Row, datetime
complex_data = [
    (
        1.0,
        10,
        "Alice",
        True,
        [1, 2, 3],
        {"ki": 0},
        Row(a=1, b=2, c=3),
        datetime(2014, 8, 1, 14, 1, 5),
    ),
    (
        2.0,
        20,
        "Bob",
        True,
        [1, 2, 3, 4, 5],
        {"ki": 0, "k2": 1},
        Row(a=1, b=2, c=3),
        datetime(2014, 8, 1, 14, 1, 5),
    ),
    (
        3.0,
        30,
        "Charlee",
        False,
        [1, 2, 3, 4, 5, 6],
        {"ki": 0, "k2": 1, "k3": 2},
        Row(a=1, b=2, c=3),
        datetime(2014, 8, 1, 14, 1, 5),
    ),
]

In [58]:
# THEN CONVERT IT TO A DF; NO NAMES ARE GIVEN, SO THE COLUMNS DEFAULT TO _1 ... _8
sqlContext.createDataFrame(complex_data).show()

+---+---+-------+-----+------------------+--------------------+---------+-------------------+
| _1| _2|     _3|   _4|                _5|                  _6|       _7|                 _8|
+---+---+-------+-----+------------------+--------------------+---------+-------------------+
|1.0| 10|  Alice| true|         [1, 2, 3]|           [ki -> 0]|[1, 2, 3]|2014-08-01 14:01:05|
|2.0| 20|    Bob| true|   [1, 2, 3, 4, 5]|  [ki -> 0, k2 -> 1]|[1, 2, 3]|2014-08-01 14:01:05|
|3.0| 30|Charlee|false|[1, 2, 3, 4, 5, 6]|[k3 -> 2, ki -> 0...|[1, 2, 3]|2014-08-01 14:01:05|
+---+---+-------+-----+------------------+--------------------+---------+-------------------+



In [60]:
complex_data_df = sqlContext.createDataFrame(complex_data, [
    'col_integer', 
    'col_float', 
    'col_string',
    'col_boolean',
    'col_list',
    'col_dictionary',
    'col_row',
    'col_date_time']
   )
complex_data_df.show()

+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
|col_integer|col_float|col_string|col_boolean|          col_list|      col_dictionary|  col_row|      col_date_time|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+
|        1.0|       10|     Alice|       true|         [1, 2, 3]|           [ki -> 0]|[1, 2, 3]|2014-08-01 14:01:05|
|        2.0|       20|       Bob|       true|   [1, 2, 3, 4, 5]|  [ki -> 0, k2 -> 1]|[1, 2, 3]|2014-08-01 14:01:05|
|        3.0|       30|   Charlee|      false|[1, 2, 3, 4, 5, 6]|[k3 -> 2, ki -> 0...|[1, 2, 3]|2014-08-01 14:01:05|
+-----------+---------+----------+-----------+------------------+--------------------+---------+-------------------+

