### Core concepts

In [1]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.1-bin-hadoop2.7"

!ls

import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate() 
spark

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (91.189.91.39)] [Waiting for headers] [1 I0% [Connecting to archive.ubuntu.com (91.189.91.39)] [Waiting for headers] [Con                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.39)] [Waiting for headers] [Con0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.91.39)]                                                                               Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [696 B]
Hit:6 https://developer.download.nv

In [2]:
import numpy as np
from pyspark.sql import functions as f
from pyspark.sql import types

In [77]:
# CSV is only one of the supported sources: parquet, text and json files
# are loaded through the same `spark.read` interface.
# The header option makes the first line of the file supply the column names.
df = spark.read.option("header", True).csv("original.csv")
df.show(n=5)

+---+----------+----------+------+-------------+--------------------+---------+----------+-----------+
| id|first_name| last_name|gender|         City|            JobTitle|   Salary|  Latitude|  Longitude|
+---+----------+----------+------+-------------+--------------------+---------+----------+-----------+
|  1|   Melinde| Shilburne|Female|    Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|
|  2|  Kimberly|Von Welden|Female|       Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|
|  3|    Alvera|  Di Boldi|Female|         null|                null|$57576.52|39.9947462|116.3397725|
|  4|   Shannon| O'Griffin|  Male|Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|
|  5|  Sherwood|   Macieja|  Male|    Mytishchi|            VP Sales|$63863.09|      null| 37.6489954|
+---+----------+----------+------+-------------+--------------------+---------+----------+-----------+
only showing top 5 rows



Every column has string data type by default:

In [78]:
# (column name, data type) pairs of the frame that was read without a schema.
df.dtypes

[('id', 'string'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('gender', 'string'),
 ('City', 'string'),
 ('JobTitle', 'string'),
 ('Salary', 'string'),
 ('Latitude', 'string'),
 ('Longitude', 'string')]

To use other data types we need to supply a schema:

In [79]:
# Explicit column typing for original.csv, in file order.
# Salary stays a string here because the raw values carry a leading "$".
column_types = [
    ("id", types.IntegerType()),
    ("first_name", types.StringType()),
    ("last_name", types.StringType()),
    ("gender", types.StringType()),
    ("City", types.StringType()),
    ("JobTitle", types.StringType()),
    ("Salary", types.StringType()),
    ("Latitude", types.FloatType()),
    ("Longitude", types.FloatType()),
]
schema = types.StructType(
    [types.StructField(name, dtype) for name, dtype in column_types]
)

# Re-read the same file, this time applying the schema instead of all-string columns.
df2 = spark.read.csv("original.csv", schema=schema, header=True)
df2.show(n=5)

+---+----------+----------+------+-------------+--------------------+---------+---------+----------+
| id|first_name| last_name|gender|         City|            JobTitle|   Salary| Latitude| Longitude|
+---+----------+----------+------+-------------+--------------------+---------+---------+----------+
|  1|   Melinde| Shilburne|Female|    Nowa Ruda| Assistant Professor|$57438.18|50.577408| 16.496717|
|  2|  Kimberly|Von Welden|Female|       Bulgan|       Programmer II|$62846.60| 48.82316| 103.52182|
|  3|    Alvera|  Di Boldi|Female|         null|                null|$57576.52|39.994747|116.339775|
|  4|   Shannon| O'Griffin|  Male|Divnomorskoye|Budget/Accounting...|$61489.23|44.504723| 38.130016|
|  5|  Sherwood|   Macieja|  Male|    Mytishchi|            VP Sales|$63863.09|     null| 37.648994|
+---+----------+----------+------+-------------+--------------------+---------+---------+----------+
only showing top 5 rows



In [80]:
# Column types of the frame that was read with the explicit schema.
df2.dtypes

[('id', 'int'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('gender', 'string'),
 ('City', 'string'),
 ('JobTitle', 'string'),
 ('Salary', 'string'),
 ('Latitude', 'float'),
 ('Longitude', 'float')]

The first rows can also be fetched as a list of `Row` objects:

In [81]:
# Bring the first five rows to the driver as a list of Row objects.
df2.take(5)

[Row(id=1, first_name='Melinde', last_name='Shilburne', gender='Female', City='Nowa Ruda', JobTitle='Assistant Professor', Salary='$57438.18', Latitude=50.57740783691406, Longitude=16.49671745300293),
 Row(id=2, first_name='Kimberly', last_name='Von Welden', gender='Female', City='Bulgan', JobTitle='Programmer II', Salary='$62846.60', Latitude=48.823158264160156, Longitude=103.52182006835938),
 Row(id=3, first_name='Alvera', last_name='Di Boldi', gender='Female', City=None, JobTitle=None, Salary='$57576.52', Latitude=39.994747161865234, Longitude=116.33977508544922),
 Row(id=4, first_name='Shannon', last_name="O'Griffin", gender='Male', City='Divnomorskoye', JobTitle='Budget/Accounting Analyst II', Salary='$61489.23', Latitude=44.504722595214844, Longitude=38.1300163269043),
 Row(id=5, first_name='Sherwood', last_name='Macieja', gender='Male', City='Mytishchi', JobTitle='VP Sales', Salary='$63863.09', Latitude=None, Longitude=37.64899444580078)]

General info

In [82]:
# Summary statistics (count, mean, stddev, min, ...) for the columns of df2.
df2.describe().show()

+-------+-----------------+----------+---------+------+-------------------+-------------------+---------+------------------+-----------------+
|summary|               id|first_name|last_name|gender|               City|           JobTitle|   Salary|          Latitude|        Longitude|
+-------+-----------------+----------+---------+------+-------------------+-------------------+---------+------------------+-----------------+
|  count|             1000|      1000|     1000|  1000|                999|                998|     1000|               999|             1000|
|   mean|            500.5|      null|     null|  null|               null|               null|     null| 25.43151724702484|43.33756460386515|
| stddev|288.8194360957494|      null|     null|  null|               null|               null|     null|24.579082550156635| 69.4206453674681|
|    min|                1|   Abagail|    Abbay|Female|             Abéché|Account Coordinator|$10101.92|         -54.28115|       -123.04196|

In [83]:
# Fetch only the first row, returned as a single Row object.
df2.first()

Row(id=1, first_name='Melinde', last_name='Shilburne', gender='Female', City='Nowa Ruda', JobTitle='Assistant Professor', Salary='$57438.18', Latitude=50.57740783691406, Longitude=16.49671745300293)

In [84]:
# Column names as a plain Python list.
df.columns

['id',
 'first_name',
 'last_name',
 'gender',
 'City',
 'JobTitle',
 'Salary',
 'Latitude',
 'Longitude']

Handling nulls and duplicates

In [85]:
# Total number of rows in the frame.
df.count()

1000

In [86]:
# Row count after removing fully duplicated rows; equal to the total
# count when the data holds no duplicates.
df.dropDuplicates().count()

1000

In [87]:
# Drop the entire row if any of its columns is null.
df_dropped = df2.dropna()
df_dropped.show(n=5)

+---+----------+----------+------+---------------+--------------------+---------+---------+----------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary| Latitude| Longitude|
+---+----------+----------+------+---------------+--------------------+---------+---------+----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.577408| 16.496717|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60| 48.82316| 103.52182|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.504723| 38.130016|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.426613|-6.1644998|
|  8|   Goddart|     Flear|  Male|      Trélissac|Desktop Support T...|$46116.36|45.190517| 0.7423124|
+---+----------+----------+------+---------------+--------------------+---------+---------+----------+
only showing top 5 rows



In [88]:
# Keep only the rows whose JobTitle is present.
# NOTE: despite its name, df_null_jobs holds the rows WITHOUT a null job title.
has_job_title = f.col("JobTitle").isNotNull()
df_null_jobs = df2.where(has_job_title)
df_null_jobs.show(n=5)

+---+----------+----------+------+---------------+--------------------+---------+---------+----------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary| Latitude| Longitude|
+---+----------+----------+------+---------------+--------------------+---------+---------+----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.577408| 16.496717|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60| 48.82316| 103.52182|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.504723| 38.130016|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|     null| 37.648994|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.426613|-6.1644998|
+---+----------+----------+------+---------------+--------------------+---------+---------+----------+
only showing top 5 rows



In [89]:
# Add Clean_City: the original City where present, the literal "Unknown" where it is null.
clean_city = f.coalesce(f.col("City"), f.lit("Unknown"))
df_handled = df2.withColumn("Clean_City", clean_city)
df_handled.show(n=5)

+---+----------+----------+------+-------------+--------------------+---------+---------+----------+-------------+
| id|first_name| last_name|gender|         City|            JobTitle|   Salary| Latitude| Longitude|   Clean_City|
+---+----------+----------+------+-------------+--------------------+---------+---------+----------+-------------+
|  1|   Melinde| Shilburne|Female|    Nowa Ruda| Assistant Professor|$57438.18|50.577408| 16.496717|    Nowa Ruda|
|  2|  Kimberly|Von Welden|Female|       Bulgan|       Programmer II|$62846.60| 48.82316| 103.52182|       Bulgan|
|  3|    Alvera|  Di Boldi|Female|         null|                null|$57576.52|39.994747|116.339775|      Unknown|
|  4|   Shannon| O'Griffin|  Male|Divnomorskoye|Budget/Accounting...|$61489.23|44.504723| 38.130016|Divnomorskoye|
|  5|  Sherwood|   Macieja|  Male|    Mytishchi|            VP Sales|$63863.09|     null| 37.648994|    Mytishchi|
+---+----------+----------+------+-------------+--------------------+---------+-

Selecting & filtering data

In [90]:
# Project the frame down to the two name columns.
name_columns = ["first_name", "last_name"]
df_select = df2.select(name_columns)
df_select.show(n=5)

+----------+----------+
|first_name| last_name|
+----------+----------+
|   Melinde| Shilburne|
|  Kimberly|Von Welden|
|    Alvera|  Di Boldi|
|   Shannon| O'Griffin|
|  Sherwood|   Macieja|
+----------+----------+
only showing top 5 rows



In [91]:
# Rename first_name to the shorter "fn"; all other columns are left as they are.
df_renamed = df2.withColumnRenamed(existing="first_name", new="fn")
df_renamed.show(n=5)

+---+--------+----------+------+-------------+--------------------+---------+---------+----------+
| id|      fn| last_name|gender|         City|            JobTitle|   Salary| Latitude| Longitude|
+---+--------+----------+------+-------------+--------------------+---------+---------+----------+
|  1| Melinde| Shilburne|Female|    Nowa Ruda| Assistant Professor|$57438.18|50.577408| 16.496717|
|  2|Kimberly|Von Welden|Female|       Bulgan|       Programmer II|$62846.60| 48.82316| 103.52182|
|  3|  Alvera|  Di Boldi|Female|         null|                null|$57576.52|39.994747|116.339775|
|  4| Shannon| O'Griffin|  Male|Divnomorskoye|Budget/Accounting...|$61489.23|44.504723| 38.130016|
|  5|Sherwood|   Macieja|  Male|    Mytishchi|            VP Sales|$63863.09|     null| 37.648994|
+---+--------+----------+------+-------------+--------------------+---------+---------+----------+
only showing top 5 rows



In [92]:
# Exact match on the first name.
is_sherwood = f.col("first_name") == "Sherwood"
df_filter = df2.where(is_sherwood)
df_filter.show()

+---+----------+---------+------+---------+--------------------+---------+---------+---------+
| id|first_name|last_name|gender|     City|            JobTitle|   Salary| Latitude|Longitude|
+---+----------+---------+------+---------+--------------------+---------+---------+---------+
|  5|  Sherwood|  Macieja|  Male|Mytishchi|            VP Sales|$63863.09|     null|37.648994|
|729|  Sherwood|   Misson|  Male| Guanyang|Payment Adjustmen...|$56567.31|25.489384|111.16085|
+---+----------+---------+------+---------+--------------------+---------+---------+---------+



In [93]:
# SQL LIKE pattern: "%" matches any prefix, so this keeps first names ending in "am".
ends_with_am = f.col("first_name").like("%am")
df_filter = df2.where(ends_with_am)
df_filter.show()

+---+----------+---------+------+----------------+--------------------+---------+----------+---------+
| id|first_name|last_name|gender|            City|            JobTitle|   Salary|  Latitude|Longitude|
+---+----------+---------+------+----------------+--------------------+---------+----------+---------+
| 61|      Adam|  Seagood|  Male|     Krajan Siki|  Research Associate|$28660.11|   -8.1831| 111.5359|
|108|    Miriam|    Jertz|Female|           Hekou|    Quality Engineer|$27902.32| 22.529404|103.93935|
|145|       Tam|    Elgey|  Male|Margahayukencana|    Dental Hygienist|$74381.32| -6.971499|107.56938|
|402|     Avram|   Extill|  Male|        Litvínov|Analog Circuit De...|$35105.35| 50.596558|13.592975|
|412|      Gram|   Jaeggi|  Male|            Boto|Safety Technician II|$42845.12|-7.6155877|110.71671|
+---+----------+---------+------+----------------+--------------------+---------+----------+---------+



In [94]:
# Same kind of suffix test, expressed with endswith instead of a LIKE pattern.
ends_with_in = f.col("first_name").endswith("in")
df_filter = df2.where(ends_with_in)
df_filter.show(n=5)

+---+----------+---------+------+--------------+--------------------+---------+----------+---------+
| id|first_name|last_name|gender|          City|            JobTitle|   Salary|  Latitude|Longitude|
+---+----------+---------+------+--------------+--------------------+---------+----------+---------+
| 18|     Thain|   Habbon|  Male|Foros do Trapo|     Design Engineer|$42135.67| 38.696247|-8.709834|
| 81|     Alvin|    Doman|  Male|          Niny|Research Assistant I|$53258.86| 44.486843|43.940807|
|223|    Garvin| Conisbee|  Male|         Dijon| Associate Professor|$81452.92|  45.29776|2.5135984|
|327|   Marilin|    Bride|Female| Nambak Tengah|   Account Executive|$92048.68|-4.8919744|  105.278|
|390|     Arlin|   Frayne|  Male|      Zhizhong|      Accountant III|$10913.50|   40.0873|116.31098|
+---+----------+---------+------+--------------+--------------------+---------+----------+---------+
only showing top 5 rows



In [95]:
# Prefix test: first names beginning with "Al".
starts_with_al = f.col("first_name").startswith("Al")
df_filter = df2.where(starts_with_al)
df_filter.show(n=5)

+---+----------+---------+------+-------------+--------------------+---------+---------+----------+
| id|first_name|last_name|gender|         City|            JobTitle|   Salary| Latitude| Longitude|
+---+----------+---------+------+-------------+--------------------+---------+---------+----------+
|  3|    Alvera| Di Boldi|Female|         null|                null|$57576.52|39.994747|116.339775|
| 21|      Alon| Chasteau|  Male|        Xin’e|     Web Developer I|$62755.85| 49.16291| 127.98658|
| 81|     Alvin|    Doman|  Male|         Niny|Research Assistant I|$53258.86|44.486843| 43.940807|
|101|     Alene|      Odd|Female|María la Baja|Accounting Assist...|$37379.03|  9.91416| -75.41116|
|115|    Allard|   Cordel|  Male|    Mieścisko|Compensation Analyst|$18907.81|45.863018|  5.947966|
+---+----------+---------+------+-------------+--------------------+---------+---------+----------+
only showing top 5 rows



In [96]:
# Show each first name next to its first five characters (positions are 1-based).
first_five_chars = f.substring("first_name", 1, 5).alias("subsr")
df_substr = df2.select("first_name", first_five_chars)
df_substr.show(n=5)

+----------+-----+
|first_name|subsr|
+----------+-----+
|   Melinde|Melin|
|  Kimberly|Kimbe|
|    Alvera|Alver|
|   Shannon|Shann|
|  Sherwood|Sherw|
+----------+-----+
only showing top 5 rows



Applying multiple filters

In [97]:
# Both conditions must hold (logical AND): first name ends in "in" and City ends in "on".
name_ends_with_in = f.col("first_name").endswith("in")
city_ends_with_on = f.col("City").like("%on")
df_filtered = df2.where(name_ends_with_in & city_ends_with_on)
df_filtered.show()

+---+----------+-------------+------+-----------+-------------------+---------+----------+---------+
| id|first_name|    last_name|gender|       City|           JobTitle|   Salary|  Latitude|Longitude|
+---+----------+-------------+------+-----------+-------------------+---------+----------+---------+
|223|    Garvin|     Conisbee|  Male|      Dijon|Associate Professor|$81452.92|  45.29776|2.5135984|
|901|     Aldin|Matuszkiewicz|  Male|East London|           Operator|$41468.83|-32.954933|27.931913|
+---+----------+-------------+------+-----------+-------------------+---------+----------+---------+



In [98]:
# Either condition is enough (logical OR): a listed first name or a listed city.
name_in_list = f.col("first_name").isin("Alvin", "Mike")
city_in_list = f.col("City").isin("London")
df_filtered = df2.where(name_in_list | city_in_list)
df_filtered.show()

+---+----------+-----------+------+---------+--------------------+---------+---------+---------+
| id|first_name|  last_name|gender|     City|            JobTitle|   Salary| Latitude|Longitude|
+---+----------+-----------+------+---------+--------------------+---------+---------+---------+
| 81|     Alvin|      Doman|  Male|     Niny|Research Assistant I|$53258.86|44.486843|43.940807|
|897|      Mike|Shillington|  Male|Torslanda|    Quality Engineer|$66504.42| 57.75504|11.814933|
+---+----------+-----------+------+---------+--------------------+---------+---------+---------+



Running SQL on dataframes

In [99]:
# Expose df2 to Spark SQL under the name "original".
# createOrReplaceTempView is used because registerTempTable has been
# deprecated since Spark 2.0 in its favour.
df2.createOrReplaceTempView("original")

In [100]:
# Unique city names, queried through the "original" temp table.
query1 = spark.sql("select distinct City from original")
# show() without an argument prints only the first 20 rows.
query1.show()

+-----------------+
|             City|
+-----------------+
|        Sułkowice|
|          Klippan|
|      Trollhättan|
|        Shinaihai|
|         Hongzhou|
|         Cipinang|
| Viejo Daan Banua|
|         Tsiatsan|
|       San Andres|
|           Krasna|
|      Springfield|
|            Město|
|Chaloem Phra Kiat|
|          Tadotsu|
|   Hénin-Beaumont|
|          Kajaani|
|           Duozhu|
|           Abéché|
|     Habingkloang|
|         Malishka|
+-----------------+
only showing top 20 rows



In [101]:
# Build a full_name column in SQL by concatenating first and last name,
# restricted to rows whose gender is 'Female'.
query2 = spark.sql("select concat(first_name, ' ', last_name) as full_name from original where gender = 'Female' ")
query2.show(5)

+-------------------+
|          full_name|
+-------------------+
|  Melinde Shilburne|
|Kimberly Von Welden|
|    Alvera Di Boldi|
|         Maris Folk|
|       Masha Divers|
+-------------------+
only showing top 5 rows



Adding calculated columns

In [102]:
# Turn the "$57438.18"-style Salary text into a number: take the substring
# starting at position 2 (skipping the leading "$", up to 50 characters) and cast it to float.
salary_digits = f.col("Salary").substr(2, 50)
df2 = df2.withColumn("clean_salary", salary_digits.cast("float"))
df2.show(n=5)

+---+----------+----------+------+-------------+--------------------+---------+---------+----------+------------+
| id|first_name| last_name|gender|         City|            JobTitle|   Salary| Latitude| Longitude|clean_salary|
+---+----------+----------+------+-------------+--------------------+---------+---------+----------+------------+
|  1|   Melinde| Shilburne|Female|    Nowa Ruda| Assistant Professor|$57438.18|50.577408| 16.496717|    57438.18|
|  2|  Kimberly|Von Welden|Female|       Bulgan|       Programmer II|$62846.60| 48.82316| 103.52182|     62846.6|
|  3|    Alvera|  Di Boldi|Female|         null|                null|$57576.52|39.994747|116.339775|    57576.52|
|  4|   Shannon| O'Griffin|  Male|Divnomorskoye|Budget/Accounting...|$61489.23|44.504723| 38.130016|    61489.23|
|  5|  Sherwood|   Macieja|  Male|    Mytishchi|            VP Sales|$63863.09|     null| 37.648994|    63863.09|
+---+----------+----------+------+-------------+--------------------+---------+---------

In [103]:
# Derive a monthly figure by dividing the numeric salary by 12.
monthly = f.col("clean_salary") / 12
df2 = df2.withColumn("monthly_salary", monthly)
df2.show(n=5)

+---+----------+----------+------+-------------+--------------------+---------+---------+----------+------------+-----------------+
| id|first_name| last_name|gender|         City|            JobTitle|   Salary| Latitude| Longitude|clean_salary|   monthly_salary|
+---+----------+----------+------+-------------+--------------------+---------+---------+----------+------------+-----------------+
|  1|   Melinde| Shilburne|Female|    Nowa Ruda| Assistant Professor|$57438.18|50.577408| 16.496717|    57438.18|4786.514973958333|
|  2|  Kimberly|Von Welden|Female|       Bulgan|       Programmer II|$62846.60| 48.82316| 103.52182|     62846.6|   5237.216796875|
|  3|    Alvera|  Di Boldi|Female|         null|                null|$57576.52|39.994747|116.339775|    57576.52|4798.043294270833|
|  4|   Shannon| O'Griffin|  Male|Divnomorskoye|Budget/Accounting...|$61489.23|44.504723| 38.130016|    61489.23|  5124.1025390625|
|  5|  Sherwood|   Macieja|  Male|    Mytishchi|            VP Sales|$63863.

In [104]:
# Integer indicator column: 1 when gender equals 'Female', 0 in every other case.
female_flag = f.when(f.col("gender") == "Female", f.lit(1)).otherwise(f.lit(0))
df2 = df2.withColumn("is_female", female_flag.cast("int"))
df2.show(n=5)

+---+----------+----------+------+-------------+--------------------+---------+---------+----------+------------+-----------------+---------+
| id|first_name| last_name|gender|         City|            JobTitle|   Salary| Latitude| Longitude|clean_salary|   monthly_salary|is_female|
+---+----------+----------+------+-------------+--------------------+---------+---------+----------+------------+-----------------+---------+
|  1|   Melinde| Shilburne|Female|    Nowa Ruda| Assistant Professor|$57438.18|50.577408| 16.496717|    57438.18|4786.514973958333|        1|
|  2|  Kimberly|Von Welden|Female|       Bulgan|       Programmer II|$62846.60| 48.82316| 103.52182|     62846.6|   5237.216796875|        1|
|  3|    Alvera|  Di Boldi|Female|         null|                null|$57576.52|39.994747|116.339775|    57576.52|4798.043294270833|        1|
|  4|   Shannon| O'Griffin|  Male|Divnomorskoye|Budget/Accounting...|$61489.23|44.504723| 38.130016|    61489.23|  5124.1025390625|        0|
|  5| 

Data aggregation

In [119]:
# Monthly-salary statistics per gender: mean, extremes and standard deviation.
gender_salary_stats = [
    f.avg("monthly_salary").alias("avg_monthly_salary"),
    f.min("monthly_salary").alias("min_monthly_salary"),
    f.max("monthly_salary").alias("max_monthly_salary"),
    f.stddev("monthly_salary").alias("std_monthly_salary"),
]
df_grouped = df2.groupBy("gender").agg(*gender_salary_stats)
df_grouped.show()

+------+------------------+------------------+------------------+------------------+
|gender|avg_monthly_salary|min_monthly_salary|max_monthly_salary|std_monthly_salary|
+------+------------------+------------------+------------------+------------------+
|Female| 4634.911915683484|  884.703369140625|      8329.0234375| 2217.177580829649|
|  Male| 4613.424487977516|   841.82666015625| 8328.576822916666|  2094.35177666598|
+------+------------------+------------------+------------------+------------------+



In [123]:
# Monthly-salary statistics per (City, gender) pair, plus the number of
# monthly_salary values that went into each group.
city_gender_salary_stats = [
    f.avg("monthly_salary").alias("avg_monthly_salary"),
    f.min("monthly_salary").alias("min_monthly_salary"),
    f.max("monthly_salary").alias("max_monthly_salary"),
    f.count("monthly_salary").alias("n_obs"),
]
df_grouped = df2.groupBy("City", "gender").agg(*city_gender_salary_stats)
df_grouped.show()

+------------+------+------------------+------------------+------------------+-----+
|        City|gender|avg_monthly_salary|min_monthly_salary|max_monthly_salary|n_obs|
+------------+------+------------------+------------------+------------------+-----+
|   Kaustinen|Female| 7093.397786458333| 7093.397786458333| 7093.397786458333|    1|
|  Sungai Iyu|Female| 6678.024088541667| 6678.024088541667| 6678.024088541667|    1|
|      Toledo|  Male|   1230.1591796875|   1230.1591796875|   1230.1591796875|    1|
|    Slobodka|Female|1842.9641927083333|1842.9641927083333|1842.9641927083333|    1|
|  Kharagauli|  Male| 6603.994791666667| 6603.994791666667| 6603.994791666667|    1|
|    Rockford|  Male| 5877.270182291667| 5877.270182291667| 5877.270182291667|    1|
|    Floresta|Female|      7047.2421875|      7047.2421875|      7047.2421875|    1|
|      Gaoshi|  Male|   841.82666015625|   841.82666015625|   841.82666015625|    1|
|  Tsaghkunk’|Female| 6928.616536458333| 6928.616536458333| 6928.

Writing dataframes to files

In [124]:
# Persist the aggregated frame as CSV, Parquet and JSON.
# mode="overwrite" keeps the cell re-runnable: with the default save mode
# the write raises an error as soon as the target path already exists.
df_grouped.write.csv("df_grouped.csv", mode="overwrite")
df_grouped.write.parquet("df_grouped.parquet", mode="overwrite")
df_grouped.write.json("df_grouped.json", mode="overwrite")