In [1]:
# Initialise findspark before the first pyspark import in this notebook.
# NOTE(review): presumably this is what makes `pyspark` importable here
# (findspark locating the local Spark installation) — confirm for your setup.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Reuse the already-running session if there is one; otherwise start a new
# session registered under this application name.
spark = (
    SparkSession.builder
    .appName('dataaframe2')
    .getOrCreate()
)

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType


# user_data = ['Rachael|25|Baltimore','Sam|32|Nevada','Zack|29|Sacremento','Justin|38|New Jersey']

# user_data_rdd = spark.sparkContext.parallelize(user_data)


# user_data_rdd = user_data_rdd.map(lambda ele: (ele.split('|')[0], int(ele.split('|')[1]), ele.split('|')[2]))

# user_data_df = spark.createDataFrame(user_data_rdd)
# user_data_df.show()

# One tuple per user: (name, age, city).
data = [
    ("Rachael", 25, "Baltimore"),
    ("Sam", 32, "Nevada"),
    ("Justin", 38, "New Jersey"),
]

# Explicit schema: every column is nullable; `age` is the only integer column.
user_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("name", StringType()),
        ("age", IntegerType()),
        ("city", StringType()),
    )
])

# Build the DataFrame from the local rows and display it.
user_data_df = spark.createDataFrame(data, schema=user_schema)
user_data_df.show()

+-------+---+----------+
|   name|age|      city|
+-------+---+----------+
|Rachael| 25| Baltimore|
|    Sam| 32|    Nevada|
| Justin| 38|New Jersey|
+-------+---+----------+



In [6]:
# Print the DataFrame's schema as a tree: column name, type and nullability.
user_data_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)



In [8]:
# Show the first row only. first() fetches a single Row instead of pulling the
# whole DataFrame to the driver, which is what collect()[0] does.
# Note: on an empty DataFrame first() returns None rather than raising
# IndexError.
user_data_df.first()

Row(name='Rachael', age=25, city='Baltimore')

In [9]:
# Keep only the users younger than 35; the predicate is given as a SQL
# expression string. where() is an alias of filter().
df1 = user_data_df.where('age<35')
df1.show()

+-------+---+---------+
|   name|age|     city|
+-------+---+---------+
|Rachael| 25|Baltimore|
|    Sam| 32|   Nevada|
+-------+---+---------+



In [10]:
# Expose the DataFrame to Spark SQL under the temp view name 'employee'.
user_data_df.createOrReplaceTempView('employee')

# Same age filter as df1, this time written as a SQL query against the view.
df2 = spark.sql("select * from employee where age<35")

# Sort both row lists first so the comparison does not depend on row order.
(
    sorted(df1.collect())
    == sorted(df2.collect())
)

True

In [20]:
# creating a temp view and adding a new row

# Register the original rows under a separate base view. Defining `employee`
# in terms of `employee` itself is rejected as a recursive view by Spark 3.1+,
# so the new view reads from `employee_base` instead.
user_data_df.createOrReplaceTempView('employee_base')

# (Re)define `employee` as the base rows plus the extra 'Alex' row. SQL UNION
# also removes duplicate rows. The DDL statement returns no useful rows, so its
# result is not kept.
spark.sql("""create or replace temporary view employee as select * from employee_base union select 'Alex' as name, 26 as age, 'Ohio' as city""")

# Read the extended view back as a DataFrame and display it.
df3 = spark.sql("""select * from employee""")
df3.show()

+-------+---+----------+
|   name|age|      city|
+-------+---+----------+
|    Sam| 32|    Nevada|
|   Alex| 26|      Ohio|
| Justin| 38|New Jersey|
|Rachael| 25| Baltimore|
+-------+---+----------+



In [21]:
# Oldest first: sort by age descending and bring the rows back as a list of
# Row objects. sort() is the method orderBy() is an alias for.
df3.sort(df3["age"].desc()).collect()

[Row(name='Justin', age=38, city='New Jersey'),
 Row(name='Sam', age=32, city='Nevada'),
 Row(name='Alex', age=26, city='Ohio'),
 Row(name='Rachael', age=25, city='Baltimore')]

In [4]:
# Convert the Spark DataFrame to a pandas DataFrame; as the last expression of
# the cell it is rendered as the cell output.
# NOTE: per the PySpark docs, toPandas() loads all rows into driver memory —
# fine for this three-row frame, but avoid it on large data.
user_data_df.toPandas()

Unnamed: 0,name,age,city
0,Rachael,25,Baltimore
1,Sam,32,Nevada
2,Justin,38,New Jersey


In [13]:
# pandas is used as `pd` inside mean_f, and df_1 is the input of the grouped
# call in this cell. Both definitions were commented out, so the cell only
# worked with leftover kernel state and failed after a kernel restart.
import pandas as pd
# pyarrow has to be installed for applyInPandas, but no name from it is used
# in this notebook, so it does not need to be imported here.
# import pyarrow

# Two ids with two values each: id 1 -> (1.0, 2.0), id 2 -> (3.0, 4.0).
df_1 = spark.createDataFrame([(1,1.0),(1,2.0),(2,3.0),(2,4.0)],('id','val'))

def mean_f(key, pdf):
    """Collapse one group into a single row holding the mean of its `val` column.

    Args:
        key: tuple with the grouping value(s) of the group (here the id).
        pdf: pandas DataFrame with the rows of that group; must have a
            `val` column.

    Returns:
        A one-row pandas DataFrame: the key value(s) followed by the mean.
    """
    group_mean = pdf["val"].mean()
    # Append the mean to the key tuple so the row reads (id, mean).
    summary_row = key + (group_mean,)
    return pd.DataFrame([summary_row])

# Call mean_f once per distinct id and display the resulting (id, val) rows.
(
    df_1
    .groupBy("id")
    .applyInPandas(mean_f, schema='id long, val double')
    .show()
)

+---+---+
| id|val|
+---+---+
|  1|1.5|
|  2|3.5|
+---+---+



In [7]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-1.0.1-cp36-cp36m-win_amd64.whl (10.5 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-1.0.1


In [31]:
# Left side of the as-of merge: one v1 value per (time, id) pair.
df_2 = spark.createDataFrame(
    [
        (20000101, 1, 1.0),
        (20000101, 2, 2.0),
        (20000102, 1, 3.0),
        (20000102, 2, 4.0),
    ],
    schema=("time", "id", "v1"),
)

# Right side: a single v2 label per id, all at the earliest time.
df_3 = spark.createDataFrame(
    [
        (20000101, 1, "x"),
        (20000101, 2, "y"),
    ],
    schema=("time", "id", "v2"),
)

def asof_merge(d1,d2):
    """As-of merge two pandas DataFrames on `time`, matching rows per `id`.

    Each row of `d1` is paired with the row of `d2` that has the same `id`
    and the latest `time` not after its own.

    Args:
        d1: left pandas DataFrame with `time` and `id` columns.
        d2: right pandas DataFrame with `time` and `id` columns.

    Returns:
        The merged pandas DataFrame, ordered by `time`.
    """
    # merge_asof raises ValueError unless both inputs are sorted on the `on`
    # key, and nothing guarantees the order of the rows handed to this
    # function. A stable sort leaves already-sorted input unchanged.
    left_sorted = d1.sort_values("time", kind="mergesort")
    right_sorted = d2.sort_values("time", kind="mergesort")
    return pd.merge_asof(left_sorted, right_sorted, on="time", by="id")

# Pair the per-id groups of both frames, as-of merge each pair with
# asof_merge, and display the combined result.
(
    df_2
    .groupBy("id")
    .cogroup(df_3.groupBy("id"))
    .applyInPandas(asof_merge, schema="time int, id int, v1 double, v2 string")
    .show()
)

+--------+---+---+---+
|    time| id| v1| v2|
+--------+---+---+---+
|20000101|  1|1.0|  x|
|20000102|  1|3.0|  x|
|20000101|  2|2.0|  y|
|20000102|  2|4.0|  y|
+--------+---+---+---+

