In [1]:
import os
import sys
import pyspark 

# Export the path of the interpreter running this kernel under both PySpark
# interpreter variables, so the two settings always name the same Python.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [47]:
#Implement a PySpark script that applies transformations like filter and withColumn on a DataFrame
from pyspark.sql import SparkSession 
from pyspark.sql.functions import col

# Create (or reuse) the session used by the DataFrame exercises below.
sp = SparkSession.builder.appName('dataframe_transforms').getOrCreate()

# The CSV reader defaults to header=False (the first line is treated as data,
# not as column names). Without inferSchema or an explicit schema, every
# column is read as a string.
df = sp.read.options(header=True, inferSchema=True).csv('trial.csv')
df.show()

# filter: keep only the rows whose `years` value is 0
df.where(df['years'] == 0).show()

# withColumn: the column is first renamed (and `df` rebound, so later cells see
# the new name); the +10 result is only displayed and is not stored in `df`.
df = df.withColumnRenamed('years', 'years served sentence')
df.withColumn(
    'years served sentence',
    df['years served sentence'] + 10,
).show()

# groupBy: number of rows per distinct value of the renamed column
df.groupBy('years served sentence').count().show()


+----+-----+-----+-----+
|days|weeks|years|names|
+----+-----+-----+-----+
|   1|    5|    0|   aa|
|   2|    6|    0|   bb|
|   3|    4|    0|   ab|
|   4|    3|    0|   ba|
|   5|    8|    1|   cc|
|   6|    9|    1|   dd|
+----+-----+-----+-----+

+----+-----+-----+-----+
|days|weeks|years|names|
+----+-----+-----+-----+
|   1|    5|    0|   aa|
|   2|    6|    0|   bb|
|   3|    4|    0|   ab|
|   4|    3|    0|   ba|
+----+-----+-----+-----+

+----+-----+---------------------+-----+
|days|weeks|years served sentence|names|
+----+-----+---------------------+-----+
|   1|    5|                   10|   aa|
|   2|    6|                   10|   bb|
|   3|    4|                   10|   ab|
|   4|    3|                   10|   ba|
|   5|    8|                   11|   cc|
|   6|    9|                   11|   dd|
+----+-----+---------------------+-----+

+---------------------+-----+
|years served sentence|count|
+---------------------+-----+
|                    1|    2|
|                    0|    4|
+---------------------+-----+

In [31]:
#2 Write a PySpark script that performs actions like count and show on a DataFrame.
# count() is an action: it runs the job and returns the number of rows.
print("The length of the dataframe is", df.count())

# Keep only the rows whose `names` value begins with the letter 'a'.
df_mod = df.where(df['names'].startswith('a'))
print(f'The number of entries with the name starting with "a" is: {df_mod.count()} entries')

# show() is an action too: it prints the filtered rows as a text table.
df_mod.show()


The length of the dataframe is 6
The number of entries with the name starting with "a" is: 2 entries
+----+-----+---------------------+-----+
|days|weeks|years served sentence|names|
+----+-----+---------------------+-----+
|   1|    5|                    0|   aa|
|   3|    4|                    0|   ab|
+----+-----+---------------------+-----+



In [42]:
#3) Demonstrate how to perform basic aggregations (e.g., sum, average) on a PySpark DataFrame.
from pyspark.sql.functions import sum as sum1
from pyspark.sql.functions import avg 
from pyspark.sql.functions import max as max1

# Each aggregate below is computed over the whole frame (no grouping keys) and
# printed as a one-row table directly under its caption.

# Largest value in the `weeks` column.
print("The maximum number of weeks a prisoner has served is:")
df.agg(max1("weeks")).show()

# Mean of the renamed years column.
print("The average number of years served by prisoners is:")
df.agg(avg("years served sentence")).show()

# Total of the renamed years column.
print("The sum of the total years served in prison is: ")
df.agg(sum1("years served sentence")).show()

The maximum number of weeks a prisoner has served is:
+----------+
|max(weeks)|
+----------+
|         9|
+----------+

The average number of years served by prisoners is:
+--------------------------+
|avg(years served sentence)|
+--------------------------+
|        0.3333333333333333|
+--------------------------+

The sum of the total years served in prison is: 
+--------------------------+
|sum(years served sentence)|
+--------------------------+
|                         2|
+--------------------------+



In [48]:
#4 Show how to write a PySpark DataFrame to a CSV file.
# Write `df` out as CSV with a header row, replacing any previous output at
# this path. NOTE(review): Spark normally creates a directory of part files
# named 'updated_data.csv' rather than a single file - confirm when consuming it.
df.write.mode('overwrite').option('header', 'True').csv('updated_data.csv')

# Shut down the session used by the DataFrame exercises.
sp.stop()

In [63]:
#5 Implement wordcount program in PySpark.
# SparkSession and sum1 are imported here as well, so this cell runs on its own
# instead of relying on names left behind by earlier cells. `sum` is aliased
# to sum1 so the Python builtin sum() is not shadowed.
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql.functions import col
from pyspark.sql.functions import sum as sum1

#create dataframe with word count per word
ss = SparkSession.builder.appName('wordcount_program').getOrCreate()

# read.text yields one row per line of the file, in a single `value` column.
df = ss.read.text('lab2wc.txt')
df.show()

df_count = (
    df
    # Split on runs of whitespace (the pattern is a regex), so tabs and
    # repeated spaces do not produce extra tokens; explode gives one row per token.
    .withColumn('word', explode(split(col('value'), r'\s+')))
    # Leading/trailing whitespace and blank lines still yield '' tokens;
    # drop them so they are not counted as words.
    .filter(col('word') != '')
    .groupBy('word')
    .count()
    # Most frequent words first.
    .sort('count', ascending = False)
)
df_count.show()

#display total count
# Summing the per-word counts gives the total number of words in the file.
print('The total word count of the text file is')
df_count.select(sum1('count')).show()


+--------------------+
|               value|
+--------------------+
|Transformations i...|
|DataFrame from an...|
|meaning they are ...|
|Some common trans...|
+--------------------+

+---------------+-----+
|           word|count|
+---------------+-----+
|            are|    3|
|             in|    2|
|             is|    2|
|        PySpark|    2|
|             an|    2|
|        include|    1|
|     evaluation|    1|
|           lazy|    1|
|            not|    1|
|         blocks|    1|
|            new|    1|
|          their|    1|
|         action|    1|
|           they|    1|
|       building|    1|
|      Resilient|    1|
|Transformations|    1|
|           one.|    1|
|            but|    1|
|      paradigm,|    1|
+---------------+-----+
only showing top 20 rows

The total word count of the text file is
+----------+
|sum(count)|
+----------+
|        51|
+----------+

