##### A SparkSession can be used to create DataFrames, register DataFrames as tables, execute SQL over tables, cache tables, and read Parquet files. To create a SparkSession, use the builder pattern.

### a- builder pattern

In [1]:
# The Spark session is automatically instantiated in the global variable `spark`.
# The parenthesized form of print is valid on both Python 2 and Python 3.
print(spark)

<pyspark.sql.session.SparkSession object at 0x7f8bd334fb10>


In [2]:
# Evaluate the session as the cell's last expression so the notebook renders it.
spark

In [3]:
# Look up a configuration key on the session; `default` is returned when the key is not set
# (which is the case here: the cell returns u'not_defined').
spark.conf.get("spark.driver.maxResultSize",default="not_defined")

u'not_defined'

In [4]:
# Same lookup for the shuffle-partitions key; the '-1' default comes back,
# so no value has been set explicitly on this session yet.
spark.conf.get("spark.sql.shuffle.partitions",default='-1')

u'-1'

In [5]:
# Explicit import so this cell does not depend on the kernel having pre-loaded SparkSession.
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    # The master cannot be changed once the application has started,
    # so this line has no effect here.
    .master("local")
    # The application name cannot be changed after start-up either,
    # so this line has no effect here.
    .appName("sparksql_train")
    # Arbitrary (key, value) configuration options.
    .config("spark.driver.maxResultSize", 0)
    .config("spark.sql.shuffle.partitions", 8)
    # Return the existing session if there is one, otherwise create a new one.
    .getOrCreate()
)

In [6]:
# Display the session object returned by the builder call.
spark

In [7]:
# After the builder call the key is set: the value comes back as the string u'0'.
spark.conf.get("spark.driver.maxResultSize",default="not_defined")

u'0'

In [8]:
# Likewise, the shuffle-partitions value set through the builder is now returned (u'8').
spark.conf.get("spark.sql.shuffle.partitions",default='-1')

u'8'

### b - create a dataframe 

###### b - 1 - from a list of tuples (schema: types inferred, but with default column names)

In [11]:
# Two (name, age) tuples. No schema is passed, so the column types are inferred
# and the columns get the default names _1 and _2.
l = [
    ('Alice', 1),
    ('Bob', 2),
]
df = spark.createDataFrame(l)
df.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)



In [12]:
# Return all rows as a list of Row objects.
df.collect()

[Row(_1=u'Alice', _2=1), Row(_1=u'Bob', _2=2)]

##### b - 2 - from a list of tuples (schema: column names defined via the "schema" keyword; types inferred unless a StructType is given)


In [14]:
from pyspark.sql.types import *

l = [('Alice', 1), ('Bob', 2)]
# The column names could also be passed inline:
# df = spark.createDataFrame(l, schema=['name', 'age'])

# Schema given as a list of column names (types are not specified).
schema1 = ['name', 'age']

# Schema given as a pyspark.sql.types.StructType: name, type and nullability per column.
schema2 = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])

# The parenthesized form of print is valid on both Python 2 and Python 3.
print(schema1)
print(schema2)


['name', 'age']
StructType(List(StructField(name,StringType,true),StructField(age,IntegerType,true)))


In [15]:
# Only column names are supplied, so the types are still inferred
# (the printed schema shows age as long).
df=spark.createDataFrame(l,schema1)
df.printSchema()
df.collect()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



[Row(name=u'Alice', age=1), Row(name=u'Bob', age=2)]

In [16]:
# With the StructType schema the declared types are used
# (the printed schema shows age as integer rather than long).
df=spark.createDataFrame(l,schema2)
df.printSchema()
df.collect()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



[Row(name=u'Alice', age=1), Row(name=u'Bob', age=2)]

##### b - 3 - from a list of dicts (schema: types inferred, column names taken from the dict keys)

In [17]:
# Deprecated: prefer a list of Row objects over a list of dicts.
# Each dict is one record; its keys become the column names.
d = [
    {'name': 'Alice', 'age': 1},
]
df = spark.createDataFrame(d)
df.printSchema()


root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)





In [18]:
# The parenthesized form of print is valid on both Python 2 and Python 3.
print(df.collect())

[Row(age=1, name=u'Alice')]


##### b - 4 - from a list of rows (schema: types inferred, column names defined inside the rows)


In [19]:
from pyspark.sql import Row

# One Row per record; the keyword names used to build the Row become the column names.
# Note that the printed schema lists `age` before `name`.
d = [Row(name="Alice", age=1)]
df = spark.createDataFrame(d)
df.printSchema()


root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [20]:
# Return all rows as a list of Row objects.
df.collect()

[Row(age=1, name=u'Alice')]

##### b - 5 -  from a pandas dataframe

In [21]:
import pandas as pd

# Build a small two-row pandas DataFrame with columns `name` and `age`;
# it is the input for the Spark conversion in the next cell.
df_pandas = pd.DataFrame(
    [['alice', 2], ['bob', 9]],
    columns=['name', 'age'],
)
df_pandas

Unnamed: 0,name,age
0,alice,2
1,bob,9


In [22]:
# Convert the pandas DataFrame into a Spark DataFrame; the column names are carried over.
df=spark.createDataFrame(df_pandas)
df.printSchema()


root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [23]:
# Convert the Spark DataFrame back into a pandas DataFrame for display.
df.toPandas()

Unnamed: 0,name,age
0,alice,2
1,bob,9


##### b - 6 -  from an rdd

In [24]:
l = [('Alice', 1)]
# Use the session's own SparkContext instead of relying on an ambient `sc` global,
# which is never defined in this notebook.
rdd = spark.sparkContext.parallelize(l)
# The RDD carries plain tuples, so the column names are supplied through `schema`.
df = spark.createDataFrame(rdd, schema=['name', 'age'])


In [25]:
# Return all rows as a list of Row objects.
df.collect()

[Row(name=u'Alice', age=1)]

### c - launch SQL declarative code

In [27]:
from pyspark.sql import Row

# Single-row DataFrame (columns `name` and `age`) used as input for the SQL examples below.
d = [Row(name="Alice", age=1)]
df = spark.createDataFrame(d)
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [28]:
# Expose the DataFrame to SQL under the temporary view name "table",
# then select both columns under the aliases `n` and `a`.
df.createOrReplaceTempView("table")
df2 = spark.sql("SELECT name AS n, age as a from table")
df2.printSchema()

root
 |-- n: string (nullable = true)
 |-- a: long (nullable = true)



In [29]:
# Register the aliased query result as the temporary view "table2".
df2.createOrReplaceTempView("table2")


In [30]:
# Load the temporary view "table2" back as a DataFrame, by name.
df3 = spark.table("table2")
df3.printSchema()

root
 |-- n: string (nullable = true)
 |-- a: long (nullable = true)



#### d - range function

In [31]:
# Arguments: start, end (exclusive), step, and the number of partitions.
# The result has a single `id` column.
df = spark.range(1, 9, 2, numPartitions=2)
df.printSchema()
df.collect()

root
 |-- id: long (nullable = false)



[Row(id=1), Row(id=3), Row(id=5), Row(id=7)]

In [32]:
df=spark.range(5) # a single argument is the (exclusive) end value; ids start at 0
df.printSchema()
df.collect()

root
 |-- id: long (nullable = false)



[Row(id=0), Row(id=1), Row(id=2), Row(id=3), Row(id=4)]

#### e - get DataFrameReader

In [33]:
# `spark.read` gives access to a DataFrameReader.
# The parenthesized form of print is valid on both Python 2 and Python 3.
print(type(spark.read))

<class 'pyspark.sql.readwriter.DataFrameReader'>


In [34]:
# 1 - Read a CSV file from the storage system in use, given its path in that system
# spark.read.csv('path/to/my_file/in/the/used/distributed/storage/system/my_file.csv')

In [36]:
# 2 - Read a JSON file from the storage system in use, given its path in that system
# df = spark.read.format('json').load('path/to/my_file/in/the/used/distributed/storage/system/my_file.json')

In [37]:
# Both expressions print the DataFrameReader class: `format(...)` returns a reader as well.
# The parenthesized form of print is valid on both Python 2 and Python 3.
print(type(spark.read))
print(type(spark.read.format('json')))

<class 'pyspark.sql.readwriter.DataFrameReader'>
<class 'pyspark.sql.readwriter.DataFrameReader'>


In [38]:
# 3 - Read a Parquet file from the storage system in use, given its path in that system
# df = spark.read.parquet('path/to/my_file/in/the/used/distributed/storage/system/my_file.parquet')