Run this notebook from the command line using the `pyspark` launcher, so that the SparkContext `sc` used below is already defined.

In [2]:
# Display the SparkContext. `sc` is not created anywhere in this notebook, so it
# must already exist in the session (presumably provided by the pyspark launcher).
sc

In [3]:
from datetime import datetime

from pyspark.sql import Row, SQLContext

In [3]:
# Distribute a small mixed-type Python list as an RDD, then display the RDD handle.
simple_data = sc.parallelize(
    [1, "Alice", 50]
)
simple_data

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175

In [4]:
# Number of elements in the RDD.
simple_data.count()

3

In [5]:
# First element of the RDD.
simple_data.first()

1

In [6]:
# Fetch up to three elements back as a Python list.
simple_data.take(3)

[1, 'Alice', 50]

In [7]:
# Fetch every element of the RDD back as a Python list.
simple_data.collect()

[1, 'Alice', 50]

In [8]:
# Each inner list is one record, so this RDD holds two three-field records.
records = sc.parallelize([
    [1, "Alice", 50],
    [2, "Bob", 80],
])

In [9]:
# Fetch all records back as a list of lists.
records.collect()

[[1, 'Alice', 50], [2, 'Bob', 80]]

In [10]:
# Counts records (the inner lists), not individual values.
records.count()

2

In [11]:
# Convert the RDD of lists to a DataFrame; no column names are supplied here.
df = records.toDF()

In [12]:
# Evaluating the DataFrame on its own displays its column names and types, not its rows.
df

DataFrame[_1: bigint, _2: string, _3: bigint]

In [13]:
# Print the DataFrame rows as a text table.
df.show()

+---+-----+---+
| _1|   _2| _3|
+---+-----+---+
|  1|Alice| 50|
|  2|  Bob| 80|
+---+-----+---+



In [14]:
# Build an RDD of Row objects whose keyword arguments name the fields,
# then display the RDD handle.
data = sc.parallelize([
    Row(id=1, name="Alice", score=50),
    Row(id=2, name="Bob", score=80),
    Row(id=3, name="Charles", score=100),
])

data

ParallelCollectionRDD[18] at parallelize at PythonRDD.scala:175

In [15]:
# Convert the RDD of Rows to a DataFrame; the Row field names become the column names.
df = data.toDF()
df.show()

+---+-------+-----+
| id|   name|score|
+---+-------+-----+
|  1|  Alice|   50|
|  2|    Bob|   80|
|  3|Charles|  100|
+---+-------+-----+



In [16]:
# A single Row mixing scalar, list, nested Row, dict and datetime fields.
complex_data = sc.parallelize([
    Row(
        col_float=1.44,
        col_integer=10,
        col_string="John",
        col_list=[1, 2, 3],
        col_row=Row(a=10, b=20),
        col_dict={"k1": 0, "K2": 1},
        col_time=datetime(2014, 8, 2, 15, 1, 5),
    )
])

In [17]:
# Convert to a DataFrame and print it. In the recorded output the columns appear in
# alphabetical order rather than in the order the Row keywords were written.
complex_data_df = complex_data.toDF()
complex_data_df.show()

+------------------+---------+-----------+---------+--------+----------+-------------------+
|          col_dict|col_float|col_integer| col_list| col_row|col_string|           col_time|
+------------------+---------+-----------+---------+--------+----------+-------------------+
|[k1 -> 0, K2 -> 1]|     1.44|         10|[1, 2, 3]|[10, 20]|      John|2014-08-02 15:01:05|
+------------------+---------+-----------+---------+--------+----------+-------------------+



In [18]:
# Two identical Row records, each mixing scalar, list, nested Row, dict and
# datetime fields, so the resulting DataFrame has two rows.
complex_data = sc.parallelize([
    Row(
        col_float=1.44, col_integer=10, col_string="John",
        col_list=[1, 2, 3],
        col_row=Row(a=10, b=20),
        col_dict={"k1": 0, "K2": 1},
        col_time=datetime(2014, 8, 2, 15, 1, 5),
    ),
    Row(
        col_float=1.44, col_integer=10, col_string="John",
        col_list=[1, 2, 3],
        col_row=Row(a=10, b=20),
        col_dict={"k1": 0, "K2": 1},
        col_time=datetime(2014, 8, 2, 15, 1, 5),
    ),
])

In [19]:
# Rebuild the DataFrame from the current `complex_data` RDD and print its rows.
complex_data_df = complex_data.toDF()
complex_data_df.show()

+------------------+---------+-----------+---------+--------+----------+-------------------+
|          col_dict|col_float|col_integer| col_list| col_row|col_string|           col_time|
+------------------+---------+-----------+---------+--------+----------+-------------------+
|[k1 -> 0, K2 -> 1]|     1.44|         10|[1, 2, 3]|[10, 20]|      John|2014-08-02 15:01:05|
|[k1 -> 0, K2 -> 1]|     1.44|         10|[1, 2, 3]|[10, 20]|      John|2014-08-02 15:01:05|
+------------------+---------+-----------+---------+--------+----------+-------------------+



In [20]:
# Create a SQLContext on top of the SparkContext `sc`.
sqlContext = SQLContext(sc)

In [21]:
# Display the SQLContext object to confirm it was created.
sqlContext

<pyspark.sql.context.SQLContext at 0x1434456f160>

In [22]:
# range(5) builds a DataFrame with a single bigint column named `id`;
# evaluating `df` afterwards displays that schema.
df = sqlContext.range(5)
df

DataFrame[id: bigint]

In [23]:
# Print the generated ids as a text table.
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [24]:
# Number of rows in the DataFrame.
df.count()

5

In [25]:
# Plain Python list of two-field tuples; no Spark objects are involved yet.
data = [
    ('Alice', 50),
    ('Bob', 60),
    ('Fred', 71),
]

In [26]:
# Build a DataFrame straight from the local list; with no names given,
# the columns are auto-named _1 and _2.
sqlContext.createDataFrame(data).show()

+-----+---+
|   _1| _2|
+-----+---+
|Alice| 50|
|  Bob| 60|
| Fred| 71|
+-----+---+



In [27]:
# Same data, but with explicit column names supplied as the second argument.
sqlContext.createDataFrame(data, ['Name', 'Age']).show()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 50|
|  Bob| 60|
| Fred| 71|
+-----+---+



In [4]:
# Two plain tuples, each mixing float, int, str, bool, list, dict, Row and
# datetime values, kept as a local Python list.
complex_data = [
    (
        1.0, 10, 'Alice', True,
        [1, 2, 3],
        {"k1": 0},
        Row(a=1, b=2, c=3),
        datetime(2014, 8, 1, 14, 1, 5),
    ),
    (
        2.0, 10, 'Bob', False,
        [4, 5, 6],
        {"k2": 1},
        Row(a=2, b=3, c=4),
        datetime(2015, 8, 1, 14, 1, 5),
    ),
]

In [41]:
# Build a DataFrame directly from the list of tuples and print it; with no
# names given, the columns are auto-named _1 through _8.
# `Row` and `datetime` come from the notebook's import cell at the top, so they
# are not re-imported here.
sqlContext.createDataFrame(complex_data).show()

+---+---+-----+-----+---------+---------+---------+-------------------+
| _1| _2|   _3|   _4|       _5|       _6|       _7|                 _8|
+---+---+-----+-----+---------+---------+---------+-------------------+
|1.0| 10|Alice| true|[1, 2, 3]|[k1 -> 0]|[1, 2, 3]|2014-08-01 14:01:05|
|2.0| 10|  Bob|false|[4, 5, 6]|[k2 -> 1]|[2, 3, 4]|2015-08-01 14:01:05|
+---+---+-----+-----+---------+---------+---------+-------------------+



In [42]:
# RDD of Rows built from positional values only, so the fields carry no names yet.
data = sc.parallelize([
    Row(*values)
    for values in (
        (1, "Alice", 50),
        (2, "Bob", 60),
        (3, "John", 70),
    )
])

In [43]:
# A Row built from field-name strings is used as a factory here: calling it with
# a record's values yields a Row carrying those field names.
column_names = Row("id", "Name", "score")
students = data.map(lambda record: column_names(*record))

In [44]:
# Display the mapped RDD handle.
students


PythonRDD[129] at RDD at PythonRDD.scala:48

In [45]:
# Fetch the Rows back to check that each one now carries field names.
students.collect()

[Row(id=1, Name='Alice', score=50),
 Row(id=2, Name='Bob', score=60),
 Row(id=3, Name='John', score=70)]

In [46]:
# Build a DataFrame from the RDD of named Rows; the column names come from the
# Row field names. Evaluating `students_df` displays the resulting schema.
students_df = sqlContext.createDataFrame(students)
students_df

DataFrame[id: bigint, Name: string, score: bigint]

In [47]:
# Print the student rows as a text table.
students_df.show()

+---+-----+-----+
| id| Name|score|
+---+-----+-----+
|  1|Alice|   50|
|  2|  Bob|   60|
|  3| John|   70|
+---+-----+-----+



In [48]:
# NOTE(review): this rebinds `sqlContext` to a fresh SQLContext even though one was
# already created earlier in the notebook; likely redundant — confirm before removing.
sqlContext = SQLContext(sc)

In [49]:
# Display the newly bound SQLContext object.
sqlContext


<pyspark.sql.context.SQLContext at 0x14344620cc0>

In [50]:
# Rebind `df` to a single-column DataFrame of ids produced by range(6).
df = sqlContext.range(6)

In [51]:
# Print the generated ids as a text table.
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
+---+



In [52]:
# Number of rows in the DataFrame.
df.count()

6

In [54]:
# Local Python list (not an RDD) of Rows built from positional values only.
data = [
    Row(*values)
    for values in (
        (1, "Alice", 50),
        (2, "Bob", 60),
        (3, "John", 70),
    )
]

In [55]:
# Build a DataFrame from the list of unnamed Rows; the columns are auto-named _1, _2, _3.
sqlContext.createDataFrame(data).show()

+---+-----+---+
| _1|   _2| _3|
+---+-----+---+
|  1|Alice| 50|
|  2|  Bob| 60|
|  3| John| 70|
+---+-----+---+



In [60]:
# Same Rows, with explicit column names matched to the values by position.
sqlContext.createDataFrame(data, ['#','Name', 'Score']).show()

+---+-----+-----+
|  #| Name|Score|
+---+-----+-----+
|  1|Alice|   50|
|  2|  Bob|   60|
|  3| John|   70|
+---+-----+-----+



In [5]:
# NOTE(review): repeats an earlier cell's call on `complex_data`; the recorded
# output is identical, so this cell looks like a leftover — confirm before removing.
sqlContext.createDataFrame(complex_data).show()

+---+---+-----+-----+---------+---------+---------+-------------------+
| _1| _2|   _3|   _4|       _5|       _6|       _7|                 _8|
+---+---+-----+-----+---------+---------+---------+-------------------+
|1.0| 10|Alice| true|[1, 2, 3]|[k1 -> 0]|[1, 2, 3]|2014-08-01 14:01:05|
|2.0| 10|  Bob|false|[4, 5, 6]|[k2 -> 1]|[2, 3, 4]|2015-08-01 14:01:05|
+---+---+-----+-----+---------+---------+---------+-------------------+

