## **Create simple dataframe**

In [2]:
data = [("United States","US",40,44,42,126),
("China","CHN",40,27,24,91),
("Japan","JPN",20,12,13,45),
("Australia","AUS",18,19,16,53),
("France","FRA",16,26,22,64),
("Netherlands","NED",15,7,12,34),
("Great Britain","GBG",14,22,29,65),
("South Korea","KOR",13,9,10,32),
("Italy","ITA",12,13,15,40),
("Germany","GER",12,13,8,33),
("New Zealand","NZ",10,7,3,20)]

columns = ["country","code","gold","Bronze","silver","total"]
df  = spark.createDataFrame(data=data, schema=columns)
df.printSchema()
df.show(truncate=False)

StatementMeta(, 61fa09d0-e761-4a9e-a0e7-ca2de9d73997, 4, Finished, Available, Finished)

root
 |-- country: string (nullable = true)
 |-- code: string (nullable = true)
 |-- gold: long (nullable = true)
 |-- Bronze: long (nullable = true)
 |-- silver: long (nullable = true)
 |-- total: long (nullable = true)

+-------------+----+----+------+------+-----+
|country      |code|gold|Bronze|silver|total|
+-------------+----+----+------+------+-----+
|United States|US  |40  |44    |42    |126  |
|China        |CHN |40  |27    |24    |91   |
|Japan        |JPN |20  |12    |13    |45   |
|Australia    |AUS |18  |19    |16    |53   |
|France       |FRA |16  |26    |22    |64   |
|Netherlands  |NED |15  |7     |12    |34   |
|Great Britain|GBG |14  |22    |29    |65   |
|South Korea  |KOR |13  |9     |10    |32   |
|Italy        |ITA |12  |13    |15    |40   |
|Germany      |GER |12  |13    |8     |33   |
|New Zealand  |NZ  |10  |7     |3     |20   |
+-------------+----+----+------+------+-----+



##  Understand the data from type

In [3]:
type(df)

StatementMeta(, 61fa09d0-e761-4a9e-a0e7-ca2de9d73997, 5, Finished, Available, Finished)

pyspark.sql.dataframe.DataFrame

## DataFrame using StructType

In [6]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
data = [("United States","US",40,44,42,126),
("China","CHN",40,27,24,91),
("Japan","JPN",20,12,13,45),
("Australia","AUS",18,19,16,53),
("France","FRA",16,26,22,64),
("Netherlands","NED",15,7,12,34),
("Great Britain","GBG",14,22,29,65),
("South Korea","KOR",13,9,10,32),
("Italy","ITA",12,13,15,40),
("Germany","GER",12,13,8,33),
("New Zealand","NZ",10,7,3,20)]

columns = StructType ([
    StructField("country",StringType(),True),
    StructField("Code",StringType(),True),
    StructField("Gold", IntegerType(), True),
    StructField("Sliver", IntegerType(), True),
    StructField("Bronze", IntegerType(), True),
    StructField("Total", IntegerType(), True)
])
df = spark.createDataFrame(data=data,schema=columns)
df.printSchema()
df.show(truncate=False)

StatementMeta(, 61fa09d0-e761-4a9e-a0e7-ca2de9d73997, 8, Finished, Available, Finished)

root
 |-- country: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- Gold: integer (nullable = true)
 |-- Sliver: integer (nullable = true)
 |-- Bronze: integer (nullable = true)
 |-- Total: integer (nullable = true)

+-------------+----+----+------+------+-----+
|country      |Code|Gold|Sliver|Bronze|Total|
+-------------+----+----+------+------+-----+
|United States|US  |40  |44    |42    |126  |
|China        |CHN |40  |27    |24    |91   |
|Japan        |JPN |20  |12    |13    |45   |
|Australia    |AUS |18  |19    |16    |53   |
|France       |FRA |16  |26    |22    |64   |
|Netherlands  |NED |15  |7     |12    |34   |
|Great Britain|GBG |14  |22    |29    |65   |
|South Korea  |KOR |13  |9     |10    |32   |
|Italy        |ITA |12  |13    |15    |40   |
|Germany      |GER |12  |13    |8     |33   |
|New Zealand  |NZ  |10  |7     |3     |20   |
+-------------+----+----+------+------+-----+

root
 |-- country: string (nullable = true)
 |-- Code: string (nullable = t

## Data Definition Language

In [7]:
data = [("United States","US",40,44,42,126),
("China","CHN",40,27,24,91),
("Japan","JPN",20,12,13,45),
("Australia","AUS",18,19,16,53),
("France","FRA",16,26,22,64),
("Netherlands","NED",15,7,12,34),
("Great Britain","GBG",14,22,29,65),
("South Korea","KOR",13,9,10,32),
("Italy","ITA",12,13,15,40),
("Germany","GER",12,13,8,33),
("New Zealand","NZ",10,7,3,20)]
columns_ddl = "Country STRING, Code STRING, Gold INT,Sliver INT,Bronze INT, Total INT"
df_with_ddl_schema = spark.createDataFrame(data=data,schema=columns_ddl)
df.printSchema()
df.show(truncate=False)

StatementMeta(, 61fa09d0-e761-4a9e-a0e7-ca2de9d73997, 9, Finished, Available, Finished)

root
 |-- country: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- Gold: integer (nullable = true)
 |-- Sliver: integer (nullable = true)
 |-- Bronze: integer (nullable = true)
 |-- Total: integer (nullable = true)

+-------------+----+----+------+------+-----+
|country      |Code|Gold|Sliver|Bronze|Total|
+-------------+----+----+------+------+-----+
|United States|US  |40  |44    |42    |126  |
|China        |CHN |40  |27    |24    |91   |
|Japan        |JPN |20  |12    |13    |45   |
|Australia    |AUS |18  |19    |16    |53   |
|France       |FRA |16  |26    |22    |64   |
|Netherlands  |NED |15  |7     |12    |34   |
|Great Britain|GBG |14  |22    |29    |65   |
|South Korea  |KOR |13  |9     |10    |32   |
|Italy        |ITA |12  |13    |15    |40   |
|Germany      |GER |12  |13    |8     |33   |
|New Zealand  |NZ  |10  |7     |3     |20   |
+-------------+----+----+------+------+-----+



In [9]:
df.show()

StatementMeta(, 61fa09d0-e761-4a9e-a0e7-ca2de9d73997, 11, Finished, Available, Finished)

+-------------+----+----+------+------+-----+
|      country|Code|Gold|Sliver|Bronze|Total|
+-------------+----+----+------+------+-----+
|United States|  US|  40|    44|    42|  126|
|        China| CHN|  40|    27|    24|   91|
|        Japan| JPN|  20|    12|    13|   45|
|    Australia| AUS|  18|    19|    16|   53|
|       France| FRA|  16|    26|    22|   64|
|  Netherlands| NED|  15|     7|    12|   34|
|Great Britain| GBG|  14|    22|    29|   65|
|  South Korea| KOR|  13|     9|    10|   32|
|        Italy| ITA|  12|    13|    15|   40|
|      Germany| GER|  12|    13|     8|   33|
|  New Zealand|  NZ|  10|     7|     3|   20|
+-------------+----+----+------+------+-----+



In [10]:
display(df)

StatementMeta(, 61fa09d0-e761-4a9e-a0e7-ca2de9d73997, 12, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, f5b3940b-01f9-438c-88e2-44b019780e61)

In [11]:
display(df.head(2))

StatementMeta(, 61fa09d0-e761-4a9e-a0e7-ca2de9d73997, 13, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 30ec81e2-67dc-476b-8098-4cfe93344cc1)

In [12]:
display(df.tail(2))

StatementMeta(, 61fa09d0-e761-4a9e-a0e7-ca2de9d73997, 14, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 3948606a-bb9f-4776-87d2-3be6d4a96d0e)