In [1]:
from pyspark.sql import (
    functions as f,
    Row,
    SparkSession,
    types as t
)

# Get the active Spark session, or create one, under the application name "df_struct".
spark = (
    SparkSession.builder
    .appName("df_struct")
    .getOrCreate()
)

In [11]:
# Input CSV, addressed through a file:// URI.
csv_file_path = "file:///home/jovyan/work/temp_with_date.csv"

# Explicit schema for the CSV, built field by field.
# StructType.add(field, data_type, nullable) appends one column and returns the schema,
# so the calls can be chained. Every column is declared nullable.
table_schema = (
    t.StructType()
    .add("city", t.StringType(), nullable=True)
    .add("temperature", t.FloatType(), nullable=True)
    .add("observed_date", t.StringType(), nullable=True)
)

In [8]:
# Read the CSV, applying the column names and types declared in table_schema.
df = spark.read.csv(csv_file_path, schema=table_schema)

# Preview the loaded rows, then print the resulting column names and types.
df.show()
df.printSchema()

+--------------------+-----------+-------------+
|                city|temperature|observed_date|
+--------------------+-----------+-------------+
|                Guam|      -25.0|   2022-03-25|
|                Guam|       39.0|   2022-02-18|
|              Serbia|      -35.0|   2022-08-31|
|       French Guiana|       -6.0|   2022-04-03|
|Falkland Islands ...|      -40.0|   2022-05-26|
|              Brazil|       15.0|   2022-02-11|
|             Tunisia|      -31.0|   2022-09-20|
|            Portugal|        7.0|   2022-01-02|
|                Iran|      -22.0|   2022-08-14|
|           Australia|       -5.0|   2022-01-12|
|              Gambia|       21.0|   2022-08-08|
|               Italy|       31.0|   2022-08-26|
|          Guadeloupe|      -39.0|   2022-07-13|
|        South Africa|      -24.0|   2022-05-02|
|              Malawi|       -6.0|   2022-06-13|
|                Iran|       34.0|   2022-09-28|
|      Norfolk Island|       -5.0|   2022-02-21|
|Lao People's Demo..

In [15]:
# Average temperature per city; the aggregated column is named "avg(temperature)".
avg_temperature = df.groupBy("city").avg("temperature")
avg_temperature.show()

+--------------------+-------------------+
|                city|   avg(temperature)|
+--------------------+-------------------+
|                Chad|  4.333333333333333|
|            Anguilla| -5.714285714285714|
|            Paraguay|               31.0|
|               Macao|              -33.5|
|Heard Island and ...|              -5.75|
|               Yemen|               13.6|
|             Senegal|                4.4|
|              Sweden| -5.666666666666667|
|             Tokelau|-2.6666666666666665|
|            Kiribati|              -6.75|
|French Southern T...|                0.0|
|   Republic of Korea| 13.333333333333334|
|              Guyana|  4.714285714285714|
|             Eritrea|                0.8|
|         Philippines|              -4.75|
|              Jersey|                7.0|
|      Norfolk Island|                3.0|
|               Tonga|              -12.0|
|           Singapore|-2.3333333333333335|
|            Malaysia|-2.3333333333333335|
+----------

In [18]:
# Celsius -> Fahrenheit: F = (C * 9 / 5) + 32
# Adds the converted value as a new column next to the original Celsius reading.
f_temperature = df.withColumn(
    "fahrenheit_temperature",
    ((df["temperature"] * 9) / 5) + 32,
)
f_temperature.show()

+--------------------+-----------+-------------+----------------------+
|                city|temperature|observed_date|fahrenheit_temperature|
+--------------------+-----------+-------------+----------------------+
|                Guam|      -25.0|   2022-03-25|                 -13.0|
|                Guam|       39.0|   2022-02-18|                 102.2|
|              Serbia|      -35.0|   2022-08-31|                 -31.0|
|       French Guiana|       -6.0|   2022-04-03|                  21.2|
|Falkland Islands ...|      -40.0|   2022-05-26|                 -40.0|
|              Brazil|       15.0|   2022-02-11|                  59.0|
|             Tunisia|      -31.0|   2022-09-20|   -23.799999999999997|
|            Portugal|        7.0|   2022-01-02|                  44.6|
|                Iran|      -22.0|   2022-08-14|    -7.600000000000001|
|           Australia|       -5.0|   2022-01-12|                  23.0|
|              Gambia|       21.0|   2022-08-08|                