# Creating a simple Spark application

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

In [9]:
# Build (or reuse) the SparkSession used by the cells below; the application
# is registered under the name "FirstProgram".
spark = SparkSession.builder.appName("FirstProgram").getOrCreate()

# Small in-memory DataFrame of (name, age) rows. Only column names are given,
# so the column types are left for Spark to infer from the values.
# "Amar" and "Akbar" each appear twice, which the later group-by relies on.
data = spark.createDataFrame(
    data=[
        ("Amar", 21),
        ("Akbar", 25),
        ("John", 28),
        ("Harika", 32),
        ("Amar", 35),
        ("Akbar", 40),
    ],
    schema=["name", "age"],
)

In [10]:
# Print the rows as a text table (defaults spelled out: at most 20 rows,
# long string values truncated).
data.show(n=20, truncate=True)

+------+---+
|  name|age|
+------+---+
|  Amar| 21|
| Akbar| 25|
|  John| 28|
|Harika| 32|
|  Amar| 35|
| Akbar| 40|
+------+---+



In [11]:
# Average age per distinct name; the result has one row per name and an
# aggregate column named "avg(age)".
avg_df = (
    data
    .groupBy("name")
    .agg(avg("age"))
)

In [12]:
# Display the per-name averages (defaults spelled out: at most 20 rows,
# long string values truncated).
avg_df.show(n=20, truncate=True)

+------+--------+
|  name|avg(age)|
+------+--------+
|  Amar|    28.0|
| Akbar|    32.5|
|  John|    28.0|
|Harika|    32.0|
+------+--------+



### DataFrame API

[Spark Python Data Types](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/data_types.html)

### Schemas and Creating DataFrames

* Spark can infer the schema from the data
* If the dataset is large, schema inference adds overhead: Spark has to read a portion of the file to ascertain the data types, which is expensive and time-consuming
* It is a good practice to define the schema upfront

#### Two ways to define a Schema

In [36]:
# Raw sample rows, one tuple per book.
# Tuple layout: (name, surname, book, price, rating)
author_data = [
    ("John", "Doe", "The Great Gatsby", 1000.00, 5),
    ("Jane", "Smith", "To Kill a Mockingbird", 1200.05, 3),
    ("Bob", "Johnson", "Pride and Prejudice", 800.05, 4),
    ("Alice", "Davis", "The Catcher in the Rye", 900.00, 2),
    ("Charlie", "Brown", "Moby-Dick", 700.30, 6),
    ("Emily", "Wilson", "Wuthering Heights", 1100.05, 1),
    ("Frank", "Garcia", "1984", 1300.06, 7),
    ("Grace", "Martinez", "The Odyssey", 600.00, 3),
    ("Henry", "Anderson", "War and Peace", 1400.75, 8),
    ("Isabella", "Taylor", "The Divine Comedy", 500.00, 2),
]

In [37]:
# NOTE(review): wildcard import kept as-is because later cells may rely on
# other names it brings in; consider importing only the types actually used
# and moving the import to the notebook's first import cell.
from pyspark.sql.types import *

# Programmatic schema for the author rows: every column is declared
# non-nullable (third StructField argument is False).
schema = StructType([
    StructField(column, dtype, False)
    for column, dtype in (
        ("name", StringType()),
        ("surname", StringType()),
        ("book", StringType()),
        ("price", FloatType()),
        ("rating", IntegerType()),
    )
])

In [38]:
# NOTE(review): a session named "FirstProgram" was already created earlier in
# this notebook. getOrCreate() presumably hands back that existing session, in
# which case the "secondProgram" name may not take effect -- confirm against
# the PySpark docs, or drop this cell and reuse `spark`.
spark = SparkSession.builder.appName("secondProgram").getOrCreate()

In [39]:
# Build the DataFrame from the raw rows using the explicit schema.
# NOTE(review): this rebinds `author_data` from the list of tuples to a
# DataFrame, so re-running this cell on its own passes the DataFrame (not the
# rows) back into createDataFrame. Distinct names for the raw rows and the
# DataFrame would make the cell idempotent.
author_data = spark.createDataFrame(data=author_data, schema=schema)

In [40]:
# Display the author rows (defaults spelled out: at most 20 rows; long string
# values such as book titles are truncated in the printed table).
author_data.show(n=20, truncate=True)

+--------+--------+--------------------+-------+------+
|    name| surname|                book|  price|rating|
+--------+--------+--------------------+-------+------+
|    John|     Doe|    The Great Gatsby| 1000.0|     5|
|    Jane|   Smith|To Kill a Mocking...|1200.05|     3|
|     Bob| Johnson| Pride and Prejudice| 800.05|     4|
|   Alice|   Davis|The Catcher in th...|  900.0|     2|
| Charlie|   Brown|           Moby-Dick|  700.3|     6|
|   Emily|  Wilson|   Wuthering Heights|1100.05|     1|
|   Frank|  Garcia|                1984|1300.06|     7|
|   Grace|Martinez|         The Odyssey|  600.0|     3|
|   Henry|Anderson|       War and Peace|1400.75|     8|
|Isabella|  Taylor|   The Divine Comedy|  500.0|     2|
+--------+--------+--------------------+-------+------+



In [41]:
# Print the DataFrame's schema as a tree: one line per column with its type
# and nullability, to confirm the explicit schema was applied.
author_data.printSchema()

root
 |-- name: string (nullable = false)
 |-- surname: string (nullable = false)
 |-- book: string (nullable = false)
 |-- price: float (nullable = false)
 |-- rating: integer (nullable = false)



In [42]:
# Show the schema as a StructType object (the cell's last expression is
# rendered as its output), i.e. the programmatic form of the tree above.
author_data.schema

StructType([StructField('name', StringType(), False), StructField('surname', StringType(), False), StructField('book', StringType(), False), StructField('price', FloatType(), False), StructField('rating', IntegerType(), False)])