In [None]:
import findspark

In [None]:
findspark.init()

In [None]:
findspark.find()

In [None]:
from pyspark.sql import SparkSession

### Spark Session object

In [None]:
# Build (or reuse) a session named "Anaconda python" that runs Spark in
# local mode with 5 worker threads.
spark = (
    SparkSession.builder
    .appName("Anaconda python")
    .master("local[5]")
    .getOrCreate()
)

In [None]:
spark

### Create Spark Context from Spark session object

In [None]:
sc = spark.sparkContext

### Create rdd from memory

In [None]:
numbers_rdd = sc.parallelize(range(1,101), 4)

In [None]:
numbers_rdd

## Map partitions with Index

In [None]:
def print_elements_in_partition(index, elements):
    """Label every element of one partition with that partition's index.

    Shaped for ``RDD.mapPartitionsWithIndex``: it receives the partition
    index and an iterable over the partition's elements.

    Args:
        index: Index of the partition being processed.
        elements: Iterable over the elements held by that partition.

    Yields:
        One descriptive string per element, in iteration order.
    """
    yield from (
        f"partition: {index} has element: {item}" for item in elements
    )

In [None]:
numbers_rdd.mapPartitionsWithIndex(print_elements_in_partition).collect()

### Map transformation

In [None]:
numbers_rdd.map(lambda x: x*x).collect()

### Filter transformation

In [None]:
numbers_rdd.filter(lambda x : x%2 ==0).collect()

### Get Number of partitions of a rdd

In [None]:
numbers_rdd.getNumPartitions()

### Count Action

In [None]:
numbers_rdd.count()

### flatMap transformation

In [None]:
sentences = ["This is a sentence1.", "This is sentence2.", "This is sentence3"]

In [None]:
sentence_rdd = sc.parallelize(sentences)

In [None]:
sentence_rdd.collect()

In [None]:
sentence_rdd.flatMap(lambda x:x.split()).collect()

#### Create an RDD from a text file and get its number of partitions

In [None]:
rdd = sc.textFile("/Users/avinashs/Downloads/pyspark/datasets/Baby_Names__Beginning_2007.csv")

In [None]:
rdd

In [None]:
rdd.getNumPartitions()

In [None]:
rdd.collect()

### Count the number of elements in a rdd

In [None]:
rdd.count()

### mapPartitionsWithIndex transformation

In [None]:
def count_baby_names_partitions_wise(index, iterator):
    """Report how many rows a single partition holds.

    Shaped for ``RDD.mapPartitionsWithIndex``.

    Args:
        index: Index of the partition being processed.
        iterator: Iterable over the partition's rows; it is fully consumed.

    Yields:
        Exactly one summary string for the partition (also when it is empty).
    """
    # Exhaust the rows without storing them; only the tally matters.
    total = sum(1 for _ in iterator)
    yield f"Parition {index} has {total} records"

In [None]:
rdd.mapPartitionsWithIndex(count_baby_names_partitions_wise).collect()

In [None]:
def skip_header(index, iterator):
    """Pass rows through unchanged, dropping the first row of partition 0.

    Shaped for ``RDD.mapPartitionsWithIndex``. The first row of partition 0
    is treated as the header line; every other partition is yielded as-is.

    Args:
        index: Index of the partition being processed.
        iterator: Iterable over the partition's rows.

    Yields:
        The partition's rows, minus the leading row when ``index`` is 0.
    """
    rows = iter(iterator)
    if index == 0:
        # Discard one row; the default keeps an empty partition from raising.
        next(rows, None)
    yield from rows
            
    
            

In [None]:
data_rdd = rdd.mapPartitionsWithIndex(skip_header)

In [None]:
# Split each row once, then key it by the first column with the last column
# parsed as an int.
year_rdd = data_rdd.map(lambda line: line.split(",")).map(
    lambda cols: (cols[0], int(cols[-1]))
)

### reduceByKey and groupByKey transformations

In [None]:
year_rdd.reduceByKey(lambda x, y : x+y).collect()

In [None]:
year_rdd.groupByKey().mapValues(sum).collect()

In [None]:
year_rdd.getNumPartitions()

### Increase the number of partitions

In [None]:
year_rdd = year_rdd.repartition(4)

In [None]:
year_rdd.getNumPartitions()

### Decrease the number of partitions

In [None]:
year_rdd.coalesce(3).getNumPartitions()

### Write rdd to a text file

In [None]:
year_rdd.saveAsTextFile("years.txt")

In [None]:
rdd_repartition = rdd.repartition(4)

In [None]:
rdd_repartition.collect()

In [None]:
file_rdd = sc.textFile("/Users/avinashs/PycharmProjects/introtoPython/spark/datasets/ebook.txt", 4)

In [None]:
file_rdd.collect()

In [None]:
# Word count: split every line on single spaces, pair each token with 1,
# then sum the ones per token.
(
    file_rdd
    .flatMap(lambda line: line.split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

In [None]:
filename = "/Users/avinashs/PycharmProjects/introtoPython/spark/datasets/weather.csv"
file1_rdd = sc.textFile(filename)

### take action

In [None]:
file1_rdd.take(5)

In [None]:
file1_rdd.first()

In [None]:
(
    file1_rdd
    # Split each CSV line once; keep the first field and the last field as an int.
    .map(lambda line: line.split(","))
    .map(lambda cols: (cols[0], int(cols[-1])))
    # Re-key by the integer before the first "-" of the first field
    # (presumably the year of a date column - confirm against weather.csv).
    .map(lambda pair: (int(pair[0].split("-")[0]), pair[1]))
    # Keep the largest value per key, then order the result by key.
    .reduceByKey(lambda a, b: max(a, b))
    .sortByKey()
    .collect()
)

### Accumulators

In [None]:
accum = sc.accumulator(0)

In [None]:
file_rdd = sc.textFile("/Users/avinashs/PycharmProjects/introtoPython/spark/datasets/ebook.txt")

In [None]:
file_rdd.getNumPartitions()

In [None]:
def count_blank_lines_in_partition(iterator):
    """Add this partition's blank-line count to the ``accum`` accumulator.

    A row counts as blank when nothing is left after stripping whitespace.
    Shaped for ``RDD.foreachPartition``; the result is reported only through
    the notebook-level ``accum`` object, and nothing is returned.

    Args:
        iterator: Iterable over the partition's rows (strings).
    """
    blank_total = sum(1 for line in iterator if not line.strip())
    # Added unconditionally, so a partition without blank lines adds 0.
    accum.add(blank_total)

In [None]:
file_rdd.foreachPartition(count_blank_lines_in_partition)

In [None]:
accum.value

### read json file in python

In [None]:
import json
from pprint import pprint

filename = "/Users/avinashs/PycharmProjects/introtoPython/spark/datasets/people.json"
# Each line of the file is parsed as its own JSON document.
# The context manager closes the file handle even if a line fails to parse;
# the previous bare open() in the loop header never closed it.
with open(filename, 'r') as json_file:
    texts = [json.loads(line) for line in json_file]

In [None]:
json_rdd = sc.textFile(filename)

In [None]:
def convert_to_csv(record):
    """Parse one JSON-object string and return its values as a tuple.

    Despite the name, the result is a tuple of the object's values (in the
    order the keys appear in the parsed object), not a comma-joined string.

    Args:
        record: A string holding a single JSON object.

    Returns:
        Tuple of the parsed object's values.
    """
    parsed = json.loads(record)
    return tuple(parsed.values())

In [None]:
json_rdd.map(convert_to_csv).saveAsTextFile("json_csv")

### Store the rdd in memory or in disk

In [None]:
rdd = sc.textFile("/Users/avinashs/PycharmProjects/introtoPython/spark/datasets/Baby_Names__Beginning_2007.csv")

In [None]:
rdd.take(5)

In [None]:
# Split each line once and pair its first two comma-separated fields.
first_rdd = rdd.map(lambda line: line.split(",")).map(
    lambda cols: (cols[0], cols[1])
)

In [None]:
# Split each line once and pair its second field with its second-to-last field.
second_rdd = rdd.map(lambda line: line.split(",")).map(
    lambda cols: (cols[1], cols[-2])
)

In [None]:
rdd.persist()

In [None]:
first_rdd.collect()

In [None]:
second_rdd.collect()

In [None]:
rdd = rdd.unpersist()

In [None]:
rdd = rdd.unpersist(blocking=False)

In [None]:
rdd.collect()

In [None]:
help(rdd.cache)

In [None]:
help(rdd.persist)

In [None]:
rdd.is_cached

In [None]:
rdd.unpersist()

### To see all the spark configuration variables

In [None]:
# List every (key, value) configuration pair of the running context.
# Uses the public getConf() accessor instead of the private _conf attribute.
sc.getConf().getAll()

### foreachPartition action

In [None]:
rdd.foreachPartition(print)

### stop the sparksession

In [None]:
spark.stop()