In [None]:
# findspark is a helper for locating a Spark installation (see the findspark docs).
import findspark

In [None]:
# Runs before the pyspark import below; per the findspark docs this call makes
# pyspark importable from the current interpreter.
findspark.init()

In [None]:
# Notebook display only: shows whatever findspark.find() returns.
findspark.find()

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Get the existing SparkSession or create one; `spark` is the entry point used
# by every cell that follows.
spark = SparkSession.builder.getOrCreate()

### Create spark dataframe from a list and from an rdd

In [None]:
df = spark.createDataFrame([[1,2,3,4,5],[6,7,8,9,10]])

In [None]:
df.show()

In [None]:
df.collect()

In [None]:
rdd = spark.sparkContext.parallelize([(1, "python"), (2, "java"), (3, "c++")])

In [None]:
rdd.collect()

In [None]:
df1 = rdd.toDF()

In [None]:
df1.show()

### Read json file as spark dataframe

In [None]:
people_df = spark.read.json("/Users/avinashs/PycharmProjects/introtoPython/spark/datasets/people.json")

In [None]:
people_df.show()

### Read csv file as spark dataframe

In [None]:
baby_names_df = spark.read.csv("/Users/avinashs/PycharmProjects/introtoPython/spark/datasets/Baby_Names__Beginning_2007.csv")

In [None]:
baby_names_df.show()

### Read csv file with header

In [None]:
# Same CSV read, with the header flag passed as a keyword argument of csv()
# rather than through a separate option() call.
spark.read.csv(
    "/Users/avinashs/PycharmProjects/introtoPython/spark/datasets/Baby_Names__Beginning_2007.csv",
    header=True,
)

### Standard Dataframe Reader for reading data in spark

In [None]:
spark.read.format("json").load("/Users/avinashs/PycharmProjects/introtoPython/spark/datasets/people.json")

In [None]:
spark.read.format("csv").option("header", "True").load("/Users/avinashs/PycharmProjects/introtoPython/spark/datasets/Baby_Names__Beginning_2007.csv")

### Explicitly creating dataframe with user-defined schema

In [None]:
from pyspark.sql.types import *

In [None]:
columns = [StructField("year", DateType(), True),
                    StructField("first_name", StringType(), True),
                    StructField("county", StringType(), True),
                    StructField("gender", StringType(), True),
                    StructField("count", IntegerType(), True)]

In [None]:
schema= StructType(columns)

In [None]:
csv_schema_df = spark.read.format("csv").option("mode", "DROPMALFORMED").schema(schema).load("/Users/avinashs/PycharmProjects/introtoPython/spark/datasets/Baby_Names__Beginning_2007.csv")

In [None]:
csv_schema_df.show()

### Print the schema of the dataframe

In [None]:
# Print the schema of csv_schema_df (the DataFrame read with the explicit schema).
csv_schema_df.printSchema()

### Describe the dataframe

In [None]:
csv_schema_df.describe()

In [None]:
csv_schema_df.count()

### Get the first row in a dataframe

In [None]:
# Fetch only the first row of the DataFrame.
csv_schema_df.first()

### Get n records

In [None]:
# Fetch 5 rows back to the driver.
csv_schema_df.take(5)

### Get 1st n records

In [None]:
# Fetch the first 10 rows back to the driver.
csv_schema_df.head(10)

### Get last n records

In [None]:
# Fetch the last 10 rows back to the driver.
# NOTE(review): DataFrame.tail() is only available in newer Spark releases
# (3.0+ per the PySpark docs) - confirm the target Spark version.
csv_schema_df.tail(10)

### Create row object manually and create spark dataframe from row object

In [None]:
from pyspark.sql.types import Row

In [None]:
# Build a Row by hand with named fields.
row2 = Row(name="Alice", age=11)

In [None]:
# Notebook display: echo the Row object.
row2

In [None]:
row1 = Row(name="Janice", age=15)

In [None]:
# View the Row as a dict and list its (field, value) pairs.
row1.asDict().items()

In [None]:
# Collect the hand-built rows; note row1 is placed before row2.
rows = [row1, row2]

In [None]:
# Create a DataFrame from the list of Row objects and print it.
spark.createDataFrame(rows).show()

In [None]:
csv_file = "/Users/avinashs/PycharmProjects/introtoPython/spark/datasets/Baby_Names__Beginning_2007.csv"

In [None]:
rdd = spark.sparkContext.textFile(csv_file)

In [None]:
rdd_header = rdd.first()

In [None]:
rdd_data = rdd.filter(lambda x: x != rdd_header)

In [None]:
rdd_data.count()

In [None]:
rdd_data.take(10)

In [None]:
def _to_row(line):
    """Convert one comma-separated data line into a Row.

    The line is split once and the first five fields are used, in order:
    year (int), first_name, country, gender, count (int).

    The third field keeps the name ``country`` because later cells in this
    notebook select ``df.country``; the user-defined schema earlier in the
    notebook calls the same column ``county``.

    Raises:
        IndexError: if the line has fewer than five comma-separated fields.
        ValueError: if the year or count field is not an integer.
    """
    fields = line.split(",")
    return Row(
        year=int(fields[0]),
        first_name=fields[1],
        country=fields[2],
        gender=fields[3],
        count=int(fields[4]),
    )


# Map every data line (header already removed) to a typed Row.
row_data = rdd_data.map(_to_row)

In [None]:
# Peek at the first 10 parsed Row objects.
row_data.take(10)

In [None]:
# Build a DataFrame from the RDD of Rows. This rebinds `df`, replacing the
# small list-based DataFrame created near the top of the notebook.
df = spark.createDataFrame(row_data)

In [None]:
first_row = df.first()

In [None]:
# Notebook display: echo the first row.
first_row

In [None]:
df.printSchema()

### Get all the columns in a spark dataframe

In [None]:
# Column names of df.
df.columns

### Access a single column using object notation

In [None]:
# Attribute access to the `year` column; the cell only displays the result.
df.year

### Access a single column using dictionary notation

In [None]:
# Item access to the same `year` column.
df["year"]

In [None]:
from pyspark.sql.functions import column, col, expr

### Access columns using col, column, expr functions

In [None]:
# Select a single column by its name.
df.select("year").show()

In [None]:
# Select using col(), column() and expr(); the `country` column is renamed to
# `county` in the output via alias().
df.select(col("year"), column("country").alias("county"), col("count"), expr("first_name")).show()

In [None]:
# Select several columns by name.
df.select("year", "country", "gender").show()

In [None]:
# The three ways of referring to a column can be mixed in one select():
# name string, attribute access and item access.
df.select("year", df.country, df["count"]).show()

In [None]:
# selectExpr() takes its arguments as expression strings.
df.selectExpr("year", "country", "gender").show()

In [None]:
df.selectExpr("first_name").show()

### filter transformation to get the needed records

In [None]:
# Keep rows where year equals 2013, with the condition built by attribute access.
df.filter(df.year == 2013).show()

In [None]:
# Same condition built by item access.
df.filter(df["year"] == 2013).show()

In [None]:
# Same condition written as an expression string.
df.filter("year == 2013").show()

In [None]:
# Filter first, then keep only the year and count columns.
df.filter("year == 2013").select("year", "count").show()

### expr transformation

In [None]:
# Baseline: the raw first_name column.
df.select("first_name").show()

In [None]:
from pyspark.sql.functions import length

In [None]:
# Apply the length() function to first_name and name the result column
# first_name_length.
df.select(length("first_name").alias("first_name_length")).show()

In [None]:
# Same length computation written as an expression string.
df.selectExpr("length(first_name)").show()

In [None]:
# Arithmetic inside an expression string: the count column multiplied by 2.
df.selectExpr("count * 2").show()

### select all columns in a dataframe

In [None]:
df.select(df.columns).show(truncate=False)

### select needed columns in a dataframe

In [None]:
needed_columns = ["year", "first_name", "gender"]
df.select(*needed_columns).show()

In [None]:
spark.stop()