In [0]:
# Load one part-file of the airlines CSV dataset. The header row supplies the
# column names and the column types are inferred from the data; per the Spark
# CSV options, samplingRatio sets the fraction of rows used for that inference.
airlines_df = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("samplingRatio", 0.0001)
    .option("header", "true")
    .load("/databricks-datasets/airlines/part-00000")
)


# Sampling

In [0]:
# Row count of the full DataFrame (count() is an action, so this runs a Spark job).
airlines_df.count()

Out[3]: 645918

In [0]:
# Draw a random sample of roughly 10% of the rows, without replacement.
# Spark does not guarantee that exactly `fraction` of the rows is returned,
# so the count is only approximately a tenth of the input. The seed is fixed
# so the draw can be repeated on the same input.
random_sample_df = airlines_df.sample(
    withReplacement=False,
    fraction=0.1,
    seed=42,
)
random_sample_df.count()

Out[4]: 64786


# Stratified Sampling

In [0]:
# Keep only the rows for carriers AA, DL and PS (filter given as a SQL
# expression string), then report how many rows remain.
base_df = airlines_df.filter(
    "UniqueCarrier in ('AA', 'DL', 'PS')"
)
base_df.count()

Out[5]: 147140

In [0]:
# Show the number of rows per carrier in the filtered data, ordered by
# carrier code.
# NOTE(review): `count` is not defined in any cell visible here; presumably
# it is pyspark.sql.functions.count imported elsewhere. Confirm that the
# import exists so this cell survives Restart & Run All.
(
    base_df
    .groupBy("UniqueCarrier")
    .agg(count("UniqueCarrier").alias("Record Counts"))
    .orderBy("UniqueCarrier")
    .show()
)

+-------------+-------------+
|UniqueCarrier|Record Counts|
+-------------+-------------+
|           AA|        56091|
|           DL|        63104|
|           PS|        27945|
+-------------+-------------+



In [0]:
# Stratified sample: each carrier is sampled at its own rate (18% of AA,
# 15.8% of DL). Per the Spark documentation, a stratum that is missing from
# `fractions` is treated as fraction 0, so every PS row is dropped here.
# NOTE(review): confirm that excluding PS is intended; if not, add a "PS"
# entry to `fractions`.
stratified_sample_df = base_df.sampleBy(
    "UniqueCarrier",
    fractions={"AA": 0.18, "DL": 0.158},
    seed=42,
)

# Show the number of sampled rows per carrier, ordered by carrier code.
# NOTE(review): `count` is not defined in any cell visible here; presumably
# it is pyspark.sql.functions.count imported elsewhere. Confirm.
(
    stratified_sample_df
    .groupBy("UniqueCarrier")
    .agg(count("UniqueCarrier").alias("Record Counts"))
    .orderBy("UniqueCarrier")
    .show()
)

+-------------+-------------+
|UniqueCarrier|Record Counts|
+-------------+-------------+
|           AA|        10079|
|           DL|         9886|
+-------------+-------------+




# Splitting

In [0]:
# Randomly split the rows into three DataFrames with weights 25% / 50% / 25%.
# The split sizes follow the weights only approximately; the seed is fixed so
# the split can be repeated on the same input.
df1, df2, df3 = airlines_df.randomSplit(
    weights=[0.25, 0.5, 0.25],
    seed=42,
)

In [0]:
# Print the row count of each split, one per line, in df1, df2, df3 order.
for split_df in (df1, df2, df3):
    print(split_df.count())

161848
322752
161318



# Combining

In [0]:
# Stack the first and third splits into one DataFrame. DataFrame.union()
# behaves like SQL UNION ALL: duplicates are kept, and columns are matched
# by position rather than by name.
df4 = df1.union(df3)

df4.count()

Out[29]: 323166