In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Configure a local-master session named "Colab" with the Spark UI on port
# 4050, then obtain it via getOrCreate().
session_builder = SparkSession.builder.master("local").appName("Colab")
session_builder = session_builder.config("spark.ui.port", "4050")
spark = session_builder.getOrCreate()

In [6]:
from pyspark.sql import Row, functions as f
import datetime 
import pandas as pd


In [7]:
# Sample user records used by every example below. Each dict is one row and
# its keys become the column names (in this order) once the list is loaded
# into a DataFrame. `phone_numbers` is a nested Row, `courses` a list of ints.
users = [
    {
        "id": 1,
        "first_name": "Corrie",
        "last_name": "Van den Oord",
        "email": "cvandenoord0@etsy.com",
        "phone_numbers": Row(mobile="+1 234 567 8901", home="+1 234 567 8911"),
        "courses": [1, 2],
        "is_customer": True,
        "amount_paid": 1000.55,
        "customer_from": datetime.date(2021, 1, 15),
        "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
    },
    {
        "id": 2,
        "first_name": "Nikolaus",
        "last_name": "Brewitt",
        "email": "nbrewitt1@dailymail.co.uk",
        # NOTE(review): the home number lacks the leading "+" every other
        # entry has -- confirm whether this is intentional sample data.
        "phone_numbers":  Row(mobile="+1 234 567 8923", home="1 234 567 8934"),
        "courses": [3],
        "is_customer": True,
        "amount_paid": 900.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
    },
    {
        "id": 3,
        "first_name": "Orelie",
        "last_name": "Penney",
        "email": "openney2@vistaprint.com",
        "phone_numbers": Row(mobile="+1 714 512 9752", home="+1 714 512 6601"),
        "courses": [2, 4],
        "is_customer": True,
        "amount_paid": 850.55,
        "customer_from": datetime.date(2021, 1, 21),
        "last_updated_ts": datetime.datetime(2021, 3, 15, 15, 16, 55)
    },
    # Non-customer: no phone numbers, no courses, no payment, no start date.
    {
        "id": 4,
        "first_name": "Ashby",
        "last_name": "Maddocks",
        "email": "amaddocks3@home.pl",
        "phone_numbers": Row(mobile=None, home=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 10, 17, 45, 30)
    },
    # Non-customer with only a mobile number on file.
    {
        "id": 5,
        "first_name": "Kurt",
        "last_name": "Rome",
        "email": "krome4@shutterfly.com",
        "phone_numbers": Row(mobile="+1 817 934 7142", home=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 2, 0, 55, 18)
    }
]

In [8]:
# Go through pandas first, then hand the pandas frame to Spark.
users_pdf = pd.DataFrame(users)
df = spark.createDataFrame(users_pdf)

Using `withColumn` to name a derived column (here, the number of courses per user).

In [18]:
# Append a column holding the size of each row's `courses` array.
course_count = f.size(f.col("courses"))
df.withColumn("CountOfCourse", course_count).show()

+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+-------------+
| id|first_name|   last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|CountOfCourse|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+-------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|            2|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|            1|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|            2|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      fal

In [14]:
# Project only the `courses` column, referring to it by name.
df.select("courses").show()

+-------+
|courses|
+-------+
| [1, 2]|
|    [3]|
| [2, 4]|
|     []|
|     []|
+-------+



Using `withColumnRenamed`

In [19]:
# Print the built-in documentation for withColumnRenamed before using it.
help(df.withColumnRenamed)

Help on method withColumnRenamed in module pyspark.sql.dataframe:

withColumnRenamed(existing, new) method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` by renaming an existing column.
    This is a no-op if schema doesn't contain the given column name.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    existing : str
        string, name of the existing column to rename.
    new : str
        string, new name of the column.
    
    Examples
    --------
    >>> df.withColumnRenamed('age', 'age2').collect()
    [Row(age2=2, name='Alice'), Row(age2=5, name='Bob')]



In [21]:
# Apply the renames one pair at a time; every withColumnRenamed call returns
# a new DataFrame, so `df` itself is left untouched.
column_renames = [
    ("id", "user_id"),
    ("first_name", "user_first_name"),
    ("last_name", "user_last_name"),
]
renamed_df = df
for existing_name, new_name in column_renames:
    renamed_df = renamed_df.withColumnRenamed(existing_name, new_name)
renamed_df.show()


+-------+---------------+--------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|user_id|user_first_name|user_last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+-------+---------------+--------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|      1|         Corrie|  Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|      2|       Nikolaus|       Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|      3|         Orelie|        Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|      4|          Ashby|      Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      false|    

Renaming Spark DataFrame columns or expressions using `alias`.

In [25]:
# Rename while selecting: alias() names both plain columns and the
# concatenated "first, last" expression.
full_name_expr = f.concat(f.col("first_name"), f.lit(", "), f.col("last_name"))
df.select(
    f.col("id").alias("user_id"),
    f.col("first_name").alias("user_first_name"),
    f.col("last_name").alias("user_last_name"),
    full_name_expr.alias("fullName"),
).show()

+-------+---------------+--------------+--------------------+
|user_id|user_first_name|user_last_name|            fullName|
+-------+---------------+--------------+--------------------+
|      1|         Corrie|  Van den Oord|Corrie, Van den Oord|
|      2|       Nikolaus|       Brewitt|   Nikolaus, Brewitt|
|      3|         Orelie|        Penney|      Orelie, Penney|
|      4|          Ashby|      Maddocks|     Ashby, Maddocks|
|      5|           Kurt|          Rome|          Kurt, Rome|
+-------+---------------+--------------+--------------------+



In [34]:
# Combination of alias and withColumn: rename the base columns in select(),
# then derive the full name from the already-renamed columns.
# The derived column is named 'fullName', matching the alias-only example.
(
    df.select(
        f.col("id").alias("user_id"),
        f.col("first_name").alias("user_first_name"),
        f.col("last_name").alias("user_last_name"),
    )
    .withColumn(
        "fullName",
        f.concat(f.col("user_first_name"), f.lit(", "), f.col("user_last_name")),
    )
).show()

+-------+---------------+--------------+--------------------+
|user_id|user_first_name|user_last_name|            fullNmae|
+-------+---------------+--------------+--------------------+
|      1|         Corrie|  Van den Oord|Corrie, Van den Oord|
|      2|       Nikolaus|       Brewitt|   Nikolaus, Brewitt|
|      3|         Orelie|        Penney|      Orelie, Penney|
|      4|          Ashby|      Maddocks|     Ashby, Maddocks|
|      5|           Kurt|          Rome|          Kurt, Rome|
+-------+---------------+--------------+--------------------+



Renaming and reordering columns in a DataFrame.

In [35]:
# Source columns to keep, in the order they should appear in the result.
required_cols = [
    "id",
    "first_name",
    "last_name",
    "email",
    "phone_numbers",
    "courses",
]

# Replacement names, matched to `required_cols` by position.
target_column_names = [
    "user_id",
    "user_first_name",
    "user_last_name",
    "user_email",
    "user_phone_numbers",
    "enrolled_courses",
]

In [36]:
# Keep only the required columns, unpacking the list into select().
df.select(*required_cols).show()

+---+----------+------------+--------------------+--------------------+-------+
| id|first_name|   last_name|               email|       phone_numbers|courses|
+---+----------+------------+--------------------+--------------------+-------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|
|  5|      Kurt|        Rome|krome4@shutterfly...|{+1 817 934 7142,...|     []|
+---+----------+------------+--------------------+--------------------+-------+



In [38]:
# Select the required columns, then rename them all at once by position.
selected_df = df.select(*required_cols)
selected_df.toDF(*target_column_names).show()

+-------+---------------+--------------+--------------------+--------------------+----------------+
|user_id|user_first_name|user_last_name|          user_email|  user_phone_numbers|enrolled_courses|
+-------+---------------+--------------+--------------------+--------------------+----------------+
|      1|         Corrie|  Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...|          [1, 2]|
|      2|       Nikolaus|       Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|             [3]|
|      3|         Orelie|        Penney|openney2@vistapri...|{+1 714 512 9752,...|          [2, 4]|
|      4|          Ashby|      Maddocks|  amaddocks3@home.pl|        {null, null}|              []|
|      5|           Kurt|          Rome|krome4@shutterfly...|{+1 817 934 7142,...|              []|
+-------+---------------+--------------+--------------------+--------------------+----------------+

