In [1]:
%load_ext nb_black
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

<IPython.core.display.Javascript object>

In [2]:
# Get the running SparkSession, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

<IPython.core.display.Javascript object>

In [3]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

<IPython.core.display.Javascript object>

In [4]:
# Load the training transactions: the first row holds the column names and
# the column types are inferred from the data.
df = spark.read.options(inferSchema=True, header=True).csv(
    "../kaggle/fraud-detection/data/train_transaction.csv"
)

<IPython.core.display.Javascript object>

In [5]:
# Peek at the first five column names of the loaded DataFrame.
df.columns[:5]

['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD']

<IPython.core.display.Javascript object>

The simplest way to refer to a column is the `F.col` function.

In [6]:
# F.col builds a Column object from a column name; no DataFrame is involved yet.
F.col("TransactionID")

Column<b'TransactionID'>

<IPython.core.display.Javascript object>

Another way to refer to a column is by using expressions. Keep in mind that columns are expressions.

In [7]:
# F.expr parses a SQL expression string; a bare column name yields the same Column.
F.expr("TransactionID")

Column<b'TransactionID'>

<IPython.core.display.Javascript object>

DataFrame transformations convert one DataFrame into another, while actions perform some computation on a DataFrame and normally return the result to the driver.

Keep in mind that transformations in DataFrames are lazy. Every transformation needs an action in order for Spark to actually start the execution.

#### Selecting columns
The .select transformation allows us to extract one or more columns from a DataFrame. 

In [8]:
# .select only describes the result; .explain() prints the physical plan for it.
df.select("TransactionDT", "TransactionAmt").explain()

== Physical Plan ==
*(1) FileScan csv [TransactionDT#12,TransactionAmt#13] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/radufotolescu/dev/useful/kaggle/fraud-detection/data/train_transact..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<TransactionDT:int,TransactionAmt:double>


<IPython.core.display.Javascript object>

In [9]:
# .show(3) is an action: it runs the plan and prints the first three rows.
df.select("TransactionDT", "TransactionAmt").show(3)

+-------------+--------------+
|TransactionDT|TransactionAmt|
+-------------+--------------+
|        86400|          68.5|
|        86401|          29.0|
|        86469|          59.0|
+-------------+--------------+
only showing top 3 rows



<IPython.core.display.Javascript object>

#### Renaming columns

In [10]:
# Rename while selecting: .alias gives each selected column its new name.
(
    df.select(
        F.col("TransactionDT").alias("DateTime"),
        F.col("TransactionAmt").alias("Amount"),
    )
    .show(3)
)

+--------+------+
|DateTime|Amount|
+--------+------+
|   86400|  68.5|
|   86401|  29.0|
|   86469|  59.0|
+--------+------+
only showing top 3 rows



<IPython.core.display.Javascript object>

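When a DataFrame has a long list of columns and you only want to change a few of their names, use `.withColumnRenamed`: it renames one column and leaves all the others untouched.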

This is very useful when joining dataframes with duplicate column names.

In [11]:
# Rename the two selected columns one call at a time, then print three rows.
(
    df.select("TransactionDT", "TransactionAmt")
    .withColumnRenamed("TransactionDT", "DateTime")
    .withColumnRenamed("TransactionAmt", "Amount")
    .show(3)
)

+--------+------+
|DateTime|Amount|
+--------+------+
|   86400|  68.5|
|   86401|  29.0|
|   86469|  59.0|
+--------+------+
only showing top 3 rows



<IPython.core.display.Javascript object>

#### Transforming columns
The `.withColumn` transformation adds a new column (or replaces an existing column of the same name) whose values are computed from an expression over existing columns.

In [12]:
# Add a column computed from an existing one: the upper-cased e-mail domain.
# NOTE(review): the new column is called "Pad_emaildomain" although the
# expression upper-cases the value — confirm the intended name.
(
    df.select("P_emaildomain")
    .withColumn("Pad_emaildomain", F.upper(F.col("P_emaildomain")))
    .show(5)
)

+-------------+---------------+
|P_emaildomain|Pad_emaildomain|
+-------------+---------------+
|         null|           null|
|    gmail.com|      GMAIL.COM|
|  outlook.com|    OUTLOOK.COM|
|    yahoo.com|      YAHOO.COM|
|    gmail.com|      GMAIL.COM|
+-------------+---------------+
only showing top 5 rows



<IPython.core.display.Javascript object>

In [13]:
# Copy the column under a new name, then drop the original: the net effect
# is a rename done with .withColumn + .drop.
(
    df.select("P_emaildomain")
    .withColumn("Pad_emaildomain", F.col("P_emaildomain"))
    .drop("P_emaildomain")
    .show(5)
)

+---------------+
|Pad_emaildomain|
+---------------+
|           null|
|      gmail.com|
|    outlook.com|
|      yahoo.com|
|      gmail.com|
+---------------+
only showing top 5 rows



<IPython.core.display.Javascript object>

#### Removing columns

In [14]:
# .drop removes the named column and keeps the rest.
df.select("TransactionDT", "TransactionAmt").drop("TransactionDT").show(3)

+--------------+
|TransactionAmt|
+--------------+
|          68.5|
|          29.0|
|          59.0|
+--------------+
only showing top 3 rows



<IPython.core.display.Javascript object>

### Select & SelectExpr

The .select transformation allows us to extract one or more columns from a DataFrame. 

`.select` combined with `F.expr`, and its shorthand `.selectExpr`, allow us to do the DataFrame equivalent of SQL queries.

In [15]:
# Setup for the "Select & SelectExpr" section.
#
# The nb_black extension, the imports, the Spark session and `df` were all
# created by the first cells of this notebook and nothing in between
# reassigns them, so they are reused here instead of being rebuilt:
#   * a second `%load_ext nb_black` only prints an "already loaded" notice;
#   * a second `spark.read.csv(..., inferSchema=True)` re-reads the file just
#     to infer the same schema again.
# Re-importing is a no-op and is kept so the names used below stay visible.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

# getOrCreate() hands back the session that is already running.
spark = SparkSession.builder.getOrCreate()

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

A way to refer to a column is by using expressions.

In [16]:
# Peek at the first five column names of the DataFrame.
df.columns[:5]

['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD']

<IPython.core.display.Javascript object>

In [17]:
# F.expr parses a SQL expression string and returns a Column.
F.expr("TransactionID")

Column<b'TransactionID'>

<IPython.core.display.Javascript object>

In [18]:
# Select a single column by name and print the first three rows.
df.select("TransactionID").show(3)

+-------------+
|TransactionID|
+-------------+
|      2987000|
|      2987001|
|      2987002|
+-------------+
only showing top 3 rows



<IPython.core.display.Javascript object>

In [19]:
# The SQL "as" inside F.expr renames the column in the result.
df.select(F.expr("TransactionID as ID")).show(3)

+-------+
|     ID|
+-------+
|2987000|
|2987001|
|2987002|
+-------+
only showing top 3 rows



<IPython.core.display.Javascript object>

In [20]:
# An aggregate written as a SQL expression: the number of distinct isFraud
# values. The result is a single row, so show(3) prints just that row.
df.select(F.expr("count(distinct(isFraud))")).show(3)

+-----------------------+
|count(DISTINCT isFraud)|
+-----------------------+
|                      2|
+-----------------------+



<IPython.core.display.Javascript object>

In [21]:
# .selectExpr takes the SQL string directly; same result as .select(F.expr(...)).
df.selectExpr("count(distinct(isFraud))").show(3)

+-----------------------+
|count(DISTINCT isFraud)|
+-----------------------+
|                      2|
+-----------------------+



<IPython.core.display.Javascript object>

In [22]:
# Average of TransactionAmt over all rows, again as a SQL expression.
df.selectExpr("avg(TransactionAmt)").show(3)

+-------------------+
|avg(TransactionAmt)|
+-------------------+
| 135.02717637250117|
+-------------------+



<IPython.core.display.Javascript object>