In [0]:
# Print the `spark` object to confirm a session is available. `spark` is not
# created in this cell; presumably it is the SparkSession injected by the
# notebook runtime (e.g. Databricks) -- TODO confirm for other environments.
print(spark)

In [0]:
# Build a two-column DataFrame from plain (name, age) tuples; the column
# names are supplied separately as a list of strings.
data = [
    ("Anil Kumar", 29),
    ("Dinesh Vimal", 36),
    ("Vinay Varma", 42),
    ("Anantha chary", 48),
]
df = spark.createDataFrame(data, schema=["Name", "Age"])

# Project only the Name column and print it.
df.select(df["Name"]).show()

In [0]:
from pyspark.sql import Row

# Same people as the tuple-based example, but each record is wrapped in a
# Row with named fields before being handed to createDataFrame.
people = [
    ("Anil Kumar", 29),
    ("Dinesh Vimal", 36),
    ("Vinay Varma", 42),
    ("Anantha chary", 48),
]
data = [Row(Name=person_name, Age=person_age) for person_name, person_age in people]
df = spark.createDataFrame(data, ["Name", "Age"])

# Show only the Name column.
df.select("Name").show()

In [0]:
from pyspark.sql.functions import col

# Create the DataFrame using explicit keyword-free arguments: records first,
# then the list of column names.
data = [
    ("Anil Kumar", 29),
    ("Dinesh Vimal", 36),
    ("Vinay Varma", 42),
    ("Anantha chary", 48),
]
schema = ["Name", "Age"]
df = spark.createDataFrame(data, schema)

# Alternative ways to look at the data (left disabled):
#   df.show()                        -> whole DataFrame
#   df.select(col("Name")).show()    -> one column via a Column object
#   df.select(*df.columns).show()    -> every column, names unpacked from df.columns

# "*" selects every column.
df.select("*").show()

In [0]:
# Sample person records; each tuple lines up positionally with the column
# names listed in `schema` below.
data = [
    ("Anil Kumar", 29, "M", "India", "Engineer", 50000, "Single", "Bangalore", "o+", "Cricket", 5.8, 70, "No", "None"),
    ("Dinesh Vimal", 36, "M", "India", "Manager", 150000, "Married", "Chennai", "B+", "Football", 5.9, 75, "No", "None"),
    ("Vinay Varma", 42, "M", "India", "Director", 150000, "Married", "Hyderabad", "O+", "Tennis", 6.0, 80, "Yes", "None"),
    ("Anantha chary", 48, "M", "India", "VP", 180000, "Married", "Delhi", "AB+", "Golf", 5.7, 78, "Yes", "Hypertension")
]
schema = [
    "Name", "Age", "Gender", "Country", "Occupation", "Salary", "MaritalStatus", "City", "BloodGroup", "Hobby",
    "Height", "Weight", "Smoker", "MedicalCondition"
]
df = spark.createDataFrame(data=data, schema=schema)

# display() renders its argument and does not hand the DataFrame back, so a
# second .display(...) cannot be chained onto its result (doing so fails with
# an AttributeError on None). Render the two views with separate calls.
display(df.select(df.columns[:3]))  # first three columns, every row
display(df.limit(3))                # every column, first three rows

# Other views to try (left disabled):
#display(df.select(df.columns[0:3]))
#display(df.select("Name").orderBy("Name"))
#display(df.select("*").orderBy("Name"))

In [0]:
from pyspark import SparkFiles

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# Register the remote file with the SparkContext, then resolve where the
# copy named "iris.data" was placed so it can be read through a file:// path.
spark.sparkContext.addFile(url)
iris_local_path = "file://" + SparkFiles.get("iris.data")

# The file is read without a header row; column types are inferred.
df = spark.read.csv(iris_local_path, header=False, inferSchema=True)
df.show()

In [0]:
# List the entries under /FileStore/tables/ via dbutils and render the
# listing with display(). `dbutils` and `display` are not defined in this
# notebook; presumably they are provided by the Databricks runtime.
display(dbutils.fs.ls("/FileStore/tables/"))

In [0]:
# Collect the path of every entry under /FileStore/tables/, then keep only
# those whose path ends in ".csv".
all_table_paths = [entry.path for entry in dbutils.fs.ls("/FileStore/tables/")]
files = [table_path for table_path in all_table_paths if table_path.endswith('.csv')]

# Read all matching CSV files into a single DataFrame, treating the first
# line of each file as a header and inferring column types.
df_multi = spark.read.csv(files, header=True, inferSchema=True)
display(df_multi)

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, regexp_replace

# Obtain a session under the app name "cresol.in"; getOrCreate() returns an
# already-running session when there is one.
spark = SparkSession.builder.appName("cresol.in").getOrCreate()

address = [
    (1, "14851 Jeffrey Rd", "DE"),
    (2, "43421 Margarita St", "NY"),
    (3, "13111 Siemon Ave", "CA"),
]

# Build the DataFrame and apply both column changes in one chain:
#   1. replace every match of the pattern 'Rd' in Address with 'Road'
#   2. add a City column holding the constant 'CE' on every row
df = (
    spark.createDataFrame(address, ['Id', 'Address', 'State'])
    .withColumn('Address', regexp_replace('Address', 'Rd', 'Road'))
    .withColumn('City', lit('CE'))
)
display(df)

In [0]:
from pyspark.sql import Row

# (name, age, gender, salary) records; None marks a missing value so the
# NULL / NOT NULL filter examples below have something to match.
people_records = [
    ("Alice", 28, "F", 50000),
    ("Bob", 35, "M", 45000),
    ("Charlie", 22, "M", 38000),
    ("Diana", 30, "F", None),
    ("Eve", None, "F", 42000),
    ("Frank", 24, "M", 39000),
    ("Grace", 27, "F", 41000),
]

# Wrap each record in a Row with named fields; the column names of the
# DataFrame come from those field names.
data = [
    Row(name=rec_name, age=rec_age, gender=rec_gender, salary=rec_salary)
    for rec_name, rec_age, rec_gender, rec_salary in people_records
]

df = spark.createDataFrame(data)

# 1. Equal / Not equal
#df = df.filter(col("gender") == "M")
#df = df.filter(col("gender") != "M")
#display(df)

# 2. Greater than / Less than
#df.filter(col("age") > 30).show()
#df.filter(col("salary") < 50000).show()
 
 
# 3. Greater than or equal / Less than or equal
#df.filter(col("age") >= 25).show()
#df.filter(col("salary") <= 40000).show()
 
# 4. IN / NOT IN
#df.filter(col("name").isin("Alice", "Bob")).show()
#df.filter(~col("name").isin("Charlie", "Diana")).show()
 
# 5. BETWEEN
#df.filter(col("age").between(25, 30)).show()
 
# 6. NULL / NOT NULL
#df.filter(col("age").isNull()).show()
#df.filter(col("salary").isNotNull()).show()
 
 
# 7. Startswith / Endswith / Contains
#df.filter(col("name").startswith("A")).show()
#df.filter(col("name").endswith("a")).show()
#df.filter(col("name").contains("ar")).show()
 
# 8. Multiple conditions (AND / OR)
#df.filter((col("gender") == "F") & (col("age") > 25)).show()
#df.filter((col("age") < 25) | (col("salary") < 40000)).show()



In [0]:
from pyspark import *

# Imported in this cell so it does not depend on an earlier cell having
# already brought `col` into the notebook namespace.
from pyspark.sql.functions import col

# Read the CSV file. No header/inferSchema options are passed, so every
# column is loaded as a string and numeric use requires an explicit cast.
df = spark.read.csv("/FileStore/tables/iris_data.csv")
#df.show()

# Rename the columns (toDF expects exactly one name per existing column).
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
df = df.toDF(*columns)
#df.show()

# Use select to pick only specific columns
#df = df.select("sepal_length","sepal_width")
#df = df.select("*")
#df.show()

# Add a new column `sepal_area` using `withColumn`
df = df.withColumn(
    "sepal_area",
    col("sepal_length").cast("double") * col("sepal_width").cast("double"),
)

# Filter rows where class is 'Iris-setosa' and sepal_length > 5
#df = df.filter((col("sepal_length").cast("double") > 5.0) & (col("class") == 'Iris-setosa'))
#display(df)

# Combine all: keep class 'Iris-virginica' with petal_width > 2 and create
# petal_area. The width is cast to double and compared with a number;
# comparing the raw string column against the string '2.0' would order the
# values lexicographically instead of numerically. petal_area is computed
# from the petal measurements rather than filled with a placeholder literal.
df = (
    df.filter(
        (col("petal_width").cast("double") > 2.0)
        & (col("class") == 'Iris-virginica')
    )
    .withColumn(
        "petal_area",
        col("petal_length").cast("double") * col("petal_width").cast("double"),
    )
)
display(df)


