<a href="https://colab.research.google.com/github/birusolankar/Pyspark-Bigdata/blob/main/PySpark_16_11_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pyspark

In [9]:
from pyspark.sql import functions as f
from pyspark.sql import SparkSession

In [10]:
# Obtain the SparkSession used by every cell below, registered under the
# application name 'sample' (getOrCreate reuses a session if one already exists).
spark = (
    SparkSession.builder
    .appName('sample')
    .getOrCreate()
)

In [13]:
# Uncomment the next line to read the API documentation for createDataFrame:
# help(spark.createDataFrame)

In [14]:
# Create a DataFrame from scratch out of plain Python tuples: (id, name, role).
# No schema is passed here, so the columns come out as _1, _2, _3.
data = [
    (1, 'Biru', 'data analyst'),
    (2, 'Laxmi', 'Revenue officer'),
    (3, 'Anuj', 'Data Engineer'),
    (4, 'Shubham', 'Cloud Engineer'),
    (5, 'Manali', 'Security analsy'),
]

df = spark.createDataFrame(data=data)

In [25]:
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [18]:
# Print the schema of `df` as a tree: one line per column with its type and
# nullability.
# NOTE(review): the recorded output below lists id / name / role, so it was
# captured after `df` had been rebuilt with column names (execution count 18,
# i.e. out of order). At this position in the notebook `df` is still the frame
# created without a schema, so a fresh top-to-bottom run would presumably show
# _1 / _2 / _3 instead - consider moving this cell below the named-schema cell.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- role: string (nullable = true)



In [15]:
# Render the DataFrame as a text table. It was created without a schema, so the
# header row shows the placeholder column names _1, _2 and _3.
df.show()

+---+-------+---------------+
| _1|     _2|             _3|
+---+-------+---------------+
|  1|   Biru|   data analyst|
|  2|  Laxmi|Revenue officer|
|  3|   Anuj|  Data Engineer|
|  4|Shubham| Cloud Engineer|
|  5| Manali|Security analsy|
+---+-------+---------------+



In [16]:
# Rebuild `df` from the same rows, this time supplying column names.
# Only names are given here - no column types are declared.
column_names = ['id', 'name', 'role']
df = spark.createDataFrame(data, schema=column_names)

In [17]:
# Same rows as before; the header now carries the id / name / role names that
# were passed through `schema`.
df.show()

+---+-------+---------------+
| id|   name|           role|
+---+-------+---------------+
|  1|   Biru|   data analyst|
|  2|  Laxmi|Revenue officer|
|  3|   Anuj|  Data Engineer|
|  4|Shubham| Cloud Engineer|
|  5| Manali|Security analsy|
+---+-------+---------------+



In [28]:

# Same five records as earlier, now paired with an explicit StructType schema
# so that both the column names and the column types are declared up front.
data = [
    (1, 'Biru', 'data analyst'),
    (2, 'Laxmi', 'Revenue officer'),
    (3, 'Anuj', 'Data Engineer'),
    (4, 'Shubham', 'Cloud Engineer'),
    (5, 'Manali', 'Security analsy'),
]

# (column name, Spark data type) pairs, in column order.
column_types = [
    ('id', IntegerType()),
    ('name', StringType()),
    ('role', StringType()),
]
schema = StructType([StructField(column, dtype) for column, dtype in column_types])

df = spark.createDataFrame(data, schema)

In [29]:
# Display the frame built with the explicit StructType schema; the rendered
# table has the same id / name / role columns and the same five rows.
df.show()

+---+-------+---------------+
| id|   name|           role|
+---+-------+---------------+
|  1|   Biru|   data analyst|
|  2|  Laxmi|Revenue officer|
|  3|   Anuj|  Data Engineer|
|  4|Shubham| Cloud Engineer|
|  5| Manali|Security analsy|
+---+-------+---------------+



In [46]:
from pyspark.sql.functions import split
from pyspark.sql.functions import regexp_replace

In [47]:
# Show `df` with an extra `new_role` column: `role` with every space removed.
# The result is only displayed - `df` itself is not reassigned.
# `col` is in scope through the wildcard import of pyspark.sql.functions.
role_without_spaces = regexp_replace(col('role'), ' ', '')
df.withColumn('new_role', role_without_spaces).show()

+---+-------+---------------+--------------+
| id|   name|           role|      new_role|
+---+-------+---------------+--------------+
|  1|   Biru|   data analyst|   dataanalyst|
|  2|  Laxmi|Revenue officer|Revenueofficer|
|  3|   Anuj|  Data Engineer|  DataEngineer|
|  4|Shubham| Cloud Engineer| CloudEngineer|
|  5| Manali|Security analsy|Securityanalsy|
+---+-------+---------------+--------------+



In [48]:
# Show `df` with an extra `new_role` column: `role` split on spaces into an
# array of words. truncate=False keeps the full cell contents visible.
role_words = split(col('role'), ' ')
df.withColumn('new_role', role_words).show(truncate=False)

+---+-------+---------------+------------------+
|id |name   |role           |new_role          |
+---+-------+---------------+------------------+
|1  |Biru   |data analyst   |[data, analyst]   |
|2  |Laxmi  |Revenue officer|[Revenue, officer]|
|3  |Anuj   |Data Engineer  |[Data, Engineer]  |
|4  |Shubham|Cloud Engineer |[Cloud, Engineer] |
|5  |Manali |Security analsy|[Security, analsy]|
+---+-------+---------------+------------------+



In [65]:
# Fetch the `name` column as a list of Row objects, then print each name in
# lower case using plain Python string handling.
collected_names = df.select(col('name')).collect()
for row in collected_names:
    print(row['name'].lower())

biru
laxmi
anuj
shubham
manali


In [57]:
# Project the frame down to the `name` column and materialise it as a list of
# single-field Row objects (the list is the cell's displayed value).
names_only = df.select(col('name'))
names_only.collect()

[Row(name='Biru'),
 Row(name='Laxmi'),
 Row(name='Anuj'),
 Row(name='Shubham'),
 Row(name='Manali')]

In [62]:
# Materialise the whole DataFrame as a Python list of Row objects, each with
# id, name and role fields.
df.collect()

[Row(id=1, name='Biru', role='data analyst'),
 Row(id=2, name='Laxmi', role='Revenue officer'),
 Row(id=3, name='Anuj', role='Data Engineer'),
 Row(id=4, name='Shubham', role='Cloud Engineer'),
 Row(id=5, name='Manali', role='Security analsy')]

In [63]:
# Plain-Python counterpart of the lower-casing loop above: str.upper() called
# on an ordinary string, no Spark involved.
name = 'biru'
name.upper()

'BIRU'