In [1]:
import findspark
# Run this before the pyspark imports that follow: findspark.init() locates the
# local Spark installation and adds pyspark to sys.path.
findspark.init()

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [3]:
# Obtain the SparkContext and SparkSession via getOrCreate() so this cell is
# idempotent: a bare SparkContext() raises "Cannot run multiple SparkContexts
# at once" when the cell is re-run or when a context already exists in the kernel.
sc = SparkContext.getOrCreate()
# The builder reuses the active context created above (or an existing session)
# instead of wrapping the context in a brand-new SparkSession each time.
spark = SparkSession.builder.getOrCreate()

## Creating a DataFrame by reading files

In [4]:
# Load the mtcars CSV into a Spark DataFrame.
mtcars = spark.read.csv(
    '../../data/mtcars.csv',
    header=True,        # first line supplies the column names
    inferSchema=True,   # let Spark infer column types instead of reading everything as strings
    sep=',',
    encoding="UTF-8",
    comment=None,       # no comment character configured
)

In [6]:
# Print only the first five rows.
mtcars.show(5)

+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|              _c0| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|        Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|    Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|       Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|   Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
|Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 5 rows



## Creating DataFrame using createDataFrame function

In [7]:
from pyspark.sql import Row

In [8]:
# Build an RDD of two Row records; each Row carries a list of ints under `x`
# and a list of strings under `y`.
rdd = sc.parallelize([
    Row(x=xs, y=ys)
    for xs, ys in (
        ([1, 2, 3], ['a', 'b', 'c']),
        ([4, 5, 6], ['e', 'f', 'g']),
    )
])

In [9]:
# collect() returns every element of the RDD as a Python list; that is fine
# here because the RDD holds only two rows.
rdd.collect()

[Row(x=[1, 2, 3], y=['a', 'b', 'c']), Row(x=[4, 5, 6], y=['e', 'f', 'g'])]

In [11]:
# Convert the RDD of Row objects into a DataFrame; column names come from the Row fields.
df = spark.createDataFrame(data=rdd)
df.show()

+---------+---------+
|        x|        y|
+---------+---------+
|[1, 2, 3]|[a, b, c]|
|[4, 5, 6]|[e, f, g]|
+---------+---------+



## From Pandas DataFrame

In [12]:
import pandas as pd

# Two-row pandas frame in which every cell holds a whole Python list.
pdf = pd.DataFrame(
    data=dict(
        x=[[1, 2, 3], [4, 5, 6]],
        y=[['a', 'b', 'c'], ['e', 'f', 'g']],
    )
)
pdf

Unnamed: 0,x,y
0,"[1, 2, 3]","[a, b, c]"
1,"[4, 5, 6]","[e, f, g]"


In [14]:
# createDataFrame also accepts a pandas DataFrame directly; column names carry over.
df2 = spark.createDataFrame(data=pdf)
df2.show()

+---------+---------+
|        x|        y|
+---------+---------+
|[1, 2, 3]|[a, b, c]|
|[4, 5, 6]|[e, f, g]|
+---------+---------+



### From a list

In [15]:
# Each inner list is one row; the schema argument supplies the column names.
my_list = [['a', 1], ['b', 2]]
df3 = spark.createDataFrame(data=my_list, schema=['letter', 'number'])
df3.show()

+------+------+
|letter|number|
+------+------+
|     a|     1|
|     b|     2|
+------+------+



In [17]:
# List (column, type) pairs; the Python ints in `number` were inferred as bigint.
df3.dtypes

[('letter', 'string'), ('number', 'bigint')]

In [24]:
# describe() returns a new summary DataFrame, so the cell output is just its
# schema repr; chain .show() on the result to print the actual statistics.
df3.describe()

DataFrame[summary: string, letter: string, number: string]

select column: df.colName

* corr(col1,col2)
* cov(col1,col2)
* crosstab(col1,col2)
* describe()

* cube()
* drop()
* groupBy()
* rollup()
* select()
* sort()
* sortWithinPartitions()
* orderBy()
* sampleBy()
* toDF()
* withColumn()
* withColumnRenamed()
* filter(***condition***)