# Quick Example 
Source: <https://flicker.perfectlyrandom.org/quick-example/>

In [47]:
from pyspark.sql import SparkSession
from flicker import FlickerDataFrame

In [48]:
# Start Spark: reuse the active session if there is one, otherwise create
# a session with the application name 'PySparkShell'.
spark = SparkSession.builder.appName('PySparkShell').getOrCreate()
# Build a three-row example DataFrame; the schema (column names and types)
# is passed as a single string.
pyspark_df = spark.createDataFrame(
    [(1, 'Turing', 41), (2, 'Laplace', 77), (3, 'Kolmogorov', 84)],
    'id INT, name STRING, age INT')

### Create the DataFrame

In [49]:
# Wrap the existing PySpark DataFrame in a FlickerDataFrame.
df = FlickerDataFrame(pyspark_df)

### Print the DataFrame

In [50]:
# The bare repr shows only the schema (column names and types), not the rows.
df

FlickerDataFrame[id: int, name: string, age: int]

In [51]:
# PySpark equivalent: the repr likewise shows only the schema.
pyspark_df

DataFrame[id: int, name: string, age: int]

In [52]:
# Calling the FlickerDataFrame shows its rows as a table.
df()

Unnamed: 0,id,name,age
0,1,Turing,41
1,2,Laplace,77
2,3,Kolmogorov,84


In [53]:
# PySpark equivalent: print the rows as an ASCII table.
pyspark_df.show()

+---+----------+---+
| id|      name|age|
+---+----------+---+
|  1|    Turing| 41|
|  2|   Laplace| 77|
|  3|Kolmogorov| 84|
+---+----------+---+



In [54]:
# Keep the object returned by df() and read one column's values from it
# (a NumPy array, per the output below).
pandas_df_sample = df()
pandas_df_sample['name'].values

array(['Turing', 'Laplace', 'Kolmogorov'], dtype=object)

In [55]:
# PySpark equivalent: pull at most 5 rows into pandas, then read the
# column's values. This rebinds pandas_df_sample from the previous cell.
pandas_df_sample = pyspark_df.limit(5).toPandas()
pandas_df_sample['name'].values

array(['Turing', 'Laplace', 'Kolmogorov'], dtype=object)

### Inspect shape and columns

In [56]:
# (number of rows, number of columns)
df.shape

(3, 3)

In [57]:
# PySpark equivalent: count the rows and the columns separately.
(pyspark_df.count(), len(pyspark_df.columns))

(3, 3)

In [58]:
# Column names as a list.
df.names

['id', 'name', 'age']

In [59]:
# List of (column name, type name) pairs.
df.dtypes

[('id', 'int'), ('name', 'string'), ('age', 'int')]

In [60]:
# PySpark equivalent of df.names.
pyspark_df.columns

['id', 'name', 'age']

In [61]:
# The same (column name, type name) pairs, read from the PySpark DataFrame.
pyspark_df.dtypes

[('id', 'int'), ('name', 'string'), ('age', 'int')]

### Extracting a column

In [62]:
df['name']  # a Column, not a FlickerDataFrame object

Column<b'name'>

In [63]:
pyspark_df['name']  # a Column, not a pyspark.sql.DataFrame object

Column<b'name'>

In [64]:
# Select 'name' with a list of column names, then count its distinct rows.
df[['name']].distinct().nrows

3

In [65]:
# PySpark equivalent: count() takes the place of nrows.
pyspark_df[['name']].distinct().count()

3

### Extracting multiple columns

In [66]:
# Indexing with a list of column names returns a FlickerDataFrame holding
# just those columns.
df[['name', 'age']]

FlickerDataFrame[name: string, age: int]

In [67]:
# PySpark equivalent: returns a DataFrame holding just those columns.
pyspark_df[['name', 'age']]

DataFrame[name: string, age: int]

### Creating a new column

In [68]:
# Item assignment adds a boolean column derived from 'age'; the rows are
# then displayed to show the new column.
df['is_age_more_than_fifty'] = df['age'] > 50
df()

Unnamed: 0,id,name,age,is_age_more_than_fifty
0,1,Turing,41,False
1,2,Laplace,77,True
2,3,Kolmogorov,84,True


In [69]:
# PySpark equivalent: the result of withColumn is assigned back to
# pyspark_df, then the rows are printed.
pyspark_df = pyspark_df.withColumn('is_age_more_than_fifty', pyspark_df['age'] > 50)
pyspark_df.show()

+---+----------+---+----------------------+
| id|      name|age|is_age_more_than_fifty|
+---+----------+---+----------------------+
|  1|    Turing| 41|                 false|
|  2|   Laplace| 77|                  true|
|  3|Kolmogorov| 84|                  true|
+---+----------+---+----------------------+



### Filtering

In [70]:
# Use a boolean column to filter; the result is another FlickerDataFrame
df[df['is_age_more_than_fifty']]

FlickerDataFrame[id: int, name: string, age: int, is_age_more_than_fifty: boolean]

In [71]:
# Filter and print in one line
df[df['age'] < 50]()

Unnamed: 0,id,name,age,is_age_more_than_fifty
0,1,Turing,41,False


In [72]:
# Use a boolean column to filter (PySpark equivalent)
pyspark_df[pyspark_df['is_age_more_than_fifty']]

DataFrame[id: int, name: string, age: int, is_age_more_than_fifty: boolean]

In [73]:
# Filter and print in one line (PySpark equivalent)
pyspark_df[pyspark_df['age'] < 50].show()

+---+------+---+----------------------+
| id|  name|age|is_age_more_than_fifty|
+---+------+---+----------------------+
|  1|Turing| 41|                 false|
+---+------+---+----------------------+



### Common operations

In [74]:
# Count occurrences of each name; the result is a FlickerDataFrame with a
# 'count' column.
df.value_counts('name')

FlickerDataFrame[name: string, count: bigint]

In [75]:
# The same counts, displayed as a table.
df.value_counts('name')()

Unnamed: 0,name,count
0,Laplace,1
1,Turing,1
2,Kolmogorov,1


In [76]:
# PySpark equivalent: group by name and count the rows in each group.
pyspark_df.groupby('name').count()

DataFrame[name: string, count: bigint]

In [77]:
# PySpark equivalent, printed as an ASCII table.
pyspark_df.groupby('name').count().show()

+----------+-----+
|      name|count|
+----------+-----+
|    Turing|    1|
|Kolmogorov|    1|
|   Laplace|    1|
+----------+-----+



In [78]:
# With normalize=True the 'count' column holds each value's fraction of all
# rows rather than a raw count (see the output); sort=True with
# ascending=True lists the smallest fraction first.
df.value_counts('is_age_more_than_fifty', normalize=True,
                sort=True, ascending=True)()

Unnamed: 0,is_age_more_than_fifty,count
0,False,0.333333
1,True,0.666667


In [79]:
# PySpark equivalent: count rows per group, order by that count ascending,
# then divide by the total row count to turn counts into fractions.
nrows = pyspark_df.count()
count_df = (pyspark_df.groupBy('is_age_more_than_fifty')
            .count()
            .orderBy('count', ascending=True))
count_df.withColumn('count', count_df['count'] / nrows).show()

+----------------------+------------------+
|is_age_more_than_fifty|             count|
+----------------------+------------------+
|                 false|0.3333333333333333|
|                  true|0.6666666666666666|
+----------------------+------------------+



### Chain everything together

In [80]:
# Filter to age < 50, keep the row(s) with the largest age, select 'name',
# materialize the result, and read the first name.
df[df['age'] < 50].rows_with_max('age')[['name']]()['name'][0]

'Turing'

In [81]:
# PySpark equivalent, spelled out step by step.
filtered_df = pyspark_df[pyspark_df['age'] < 50]
# The aggregation is collected to the driver; [0][0] picks the single
# aggregated value (the maximum age) out of the first returned row.
age_max = filtered_df.agg({'age': 'max'}).collect()[0][0]
# Keep the row(s) whose age equals that maximum, then read the first name
# via pandas.
filtered_df[filtered_df['age'].isin([age_max])][['name']].toPandas()['name'][0]

'Turing'

### Get the PySpark dataframe

In [82]:
# The wrapped PySpark DataFrame is read from the `_df` attribute.
# Note: this rebinds `pyspark_df`, replacing the DataFrame used in the
# cells above.
pyspark_df = df._df
# The same attribute is read from the result of a chained Flicker expression.
processed_pyspark_df = df[df['age'] < 50].rows_with_max('age')._df