In [3]:
# Start (or reuse) a local Spark session; "local[1]" runs Spark in-process with one worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

# Load the programme-wise graduate statistics, treating the first CSV line as column names.
df = spark.read.csv(
    "data/Programme_Wise_Graduated_Student_Religion_Stats-21-22.csv",
    header=True,
)
# df.printSchema()  # uncomment to inspect the column types
df.show()

+--------------------+---------+----+------+------+-----+
|           Programme| Religion|Male|Female|Others|Total|
+--------------------+---------+----+------+------+-----+
|MSc in Computer S...|    Hindu|   5|     0|     0|    5|
|                null|   Others|   1|     0|     0|    1|
|      MSc in Physics|    Hindu|  22|     8|     0|   30|
|                null|Not Known|   1|     0|     0|    1|
|    MA / MSc in Yoga|    Hindu|   5|     0|     0|    5|
|Integrated 5 Year...|    Hindu|  17|     0|     0|   17|
|MSc in Big Data A...|    Hindu|  12|     0|     0|   12|
|                null|Not Known|   2|     0|     0|    2|
|  MSc in Mathematics|    Hindu|  19|     0|     0|   19|
|                null|Not Known|   3|     0|     0|    3|
|Integrated MPhil-...|    Hindu|   8|     0|     0|    8|
|         PhD in Yoga|    Hindu|   2|     0|     0|    2|
|      PhD in Physics|    Hindu|   1|     0|     0|    1|
|  PG Diploma in Yoga|    Hindu|   2|     0|     0|    2|
|             

In [4]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType,BooleanType,DateType,IntegerType
# Keep the original "Total" column and add an integer-typed copy, "total_new",
# so the counts can be aggregated numerically.
df2 = df.withColumn("total_new", df["Total"].cast("int"))

In [5]:
# Sum the integer totals per programme and print the full, untruncated result.
# Rows whose programme is null are grouped together under a single null key.
(
    df2
    .groupBy("programme")
    .agg({"total_new": "sum"})
    .show(truncate=False)
)

+---------------------------------------------------------+--------------+
|programme                                                |sum(total_new)|
+---------------------------------------------------------+--------------+
|Total                                                    |494           |
|MSc in Agriculture and Rural Development                 |23            |
|PG Diploma in Post Harvest & Food Technology             |19            |
|Diploma in Organic Farming                               |3             |
|B.Ed in Special Education (HI)                           |4             |
|MSc in Physics                                           |30            |
|BSc in Physical Education, Health Education and Sports   |50            |
|PhD in Medical Biotechnology                             |1             |
|null                                                     |85            |
|Intd B.Ed-M.Ed Special Education(ID)                     |7             |
|PG Diploma in Yoga      

In [6]:
from pyspark.sql.functions import col, last, monotonically_increasing_id, udf, when
from pyspark.sql.types import StringType
from pyspark.sql.window import Window


def repeatName(column):
    """Forward-fill blank entries of ``column`` with the nearest preceding value.

    A row whose value is null or an empty string receives the most recent
    non-blank value that appears before it; leading blanks (with nothing
    before them) stay null.

    This is expressed as a window function rather than a Python UDF that
    remembers the previous row in a global variable: Spark does not promise
    to call a UDF exactly once per row, in order, within a single Python
    worker, so state carried between UDF calls is not reliable.

    Args:
        column: the Column to fill.

    Returns:
        A Column expression holding the forward-filled values.
    """
    # monotonically_increasing_id() grows with the partition index and then with
    # the row position inside the partition, so ordering by it reproduces the
    # order in which the rows were read.
    # NOTE(review): this relies on the source being read in file order — confirm
    # if the input is ever repartitioned or shuffled before this step.
    # A window without partitionBy pulls every row into one partition; Spark
    # warns about it, which is acceptable for a table this small.
    rows_so_far = (
        Window
        .orderBy(monotonically_increasing_id())
        .rowsBetween(Window.unboundedPreceding, Window.currentRow)
    )
    # when() without otherwise() yields null, so empty strings are treated
    # exactly like nulls and skipped by last(..., ignorenulls=True).
    non_blank = when(column != "", column)
    return last(non_blank, ignorenulls=True).over(rows_so_far)


# "Name" repeats each programme title on the continuation rows beneath it.
df3 = df2.withColumn("Name", repeatName(col("Programme")))
df3.show(truncate=False)

+-----------------------------------------------------------+---------+----+------+------+-----+---------+-----------------------------------------------------------+
|Programme                                                  |Religion |Male|Female|Others|Total|total_new|Name                                                       |
+-----------------------------------------------------------+---------+----+------+------+-----+---------+-----------------------------------------------------------+
|MSc in Computer Science                                    |Hindu    |5   |0     |0     |5    |5        |MSc in Computer Science                                    |
|null                                                       |Others   |1   |0     |0     |1    |1        |MSc in Computer Science                                    |
|MSc in Physics                                             |Hindu    |22  |8     |0     |30   |30       |MSc in Physics                                             

In [7]:
# Narrow the table to the filled-in programme name, the religion and the raw total.
df4 = df3.select(["Name", "Religion", "Total"])
df4.show(truncate=False)

+-----------------------------------------------------------+---------+-----+
|Name                                                       |Religion |Total|
+-----------------------------------------------------------+---------+-----+
|MSc in Computer Science                                    |Hindu    |5    |
|MSc in Computer Science                                    |Others   |1    |
|MSc in Physics                                             |Hindu    |30   |
|MSc in Physics                                             |Not Known|1    |
|MA / MSc in Yoga                                           |Hindu    |5    |
|Integrated 5 Year MA in Sanskrit(M)                        |Hindu    |17   |
|MSc in Big Data Analytics                                  |Hindu    |12   |
|MSc in Big Data Analytics                                  |Not Known|2    |
|MSc in Mathematics                                         |Hindu    |19   |
|MSc in Mathematics                                         |Not

In [8]:
# Load the programme catalogue (programme, code, campus, department, ...);
# the first CSV line supplies the column names.
df_dept = spark.read.csv("data/Programme_List.csv", header=True)
df_dept.show()

+--------------------+-------------+-------------+-------------+------------+----------+--------+--------------+--------------+
|           Programme|         Code|Duration (Yr)|Maximum Marks|      Campus|Department|  Degree|Faculty Center|No of Students|
+--------------------+-------------+-------------+-------------+------------+----------+--------+--------------+--------------+
|Integrated 5 Year...|      SAN-MA5|        5 - 6|          100|Belur Campus|      SANS|5YrIntMA|          null|            73|
|      MSc in Physics|         PHY1|        2 - 2|          100|Belur Campus|      PHYS|     MSC|          null|            63|
|MSc in Big Data A...|          BDA|        2 - 3|          100|Belur Campus|    COMPSC|     MSC|          null|            37|
|MSc in Computer S...|          CS1|        2 - 3|          100|Belur Campus|    COMPSC|     MSC|          null|            24|
|Integrated MPhil-...|SANS-Intd.PHD|       5 - 10|          100|Belur Campus|      SANS|     PhD|       

In [9]:
# Attach catalogue details to each statistics row by matching the programme name.
# An inner join keeps only programmes present in both tables.
df_dept_stat = df4.join(
    df_dept,
    on=df4["Name"] == df_dept["Programme"],
    how="inner",
)

In [10]:
# Keep only the columns needed for the per-department religion breakdown.
df_data = df_dept_stat.select(["Department", "Programme", "Religion", "Total"])
df_data.show()

+----------+--------------------+---------+-----+
|Department|           Programme| Religion|Total|
+----------+--------------------+---------+-----+
|      SANS|Integrated 5 Year...|    Hindu|   17|
|      PHYS|      MSc in Physics|Not Known|    1|
|      PHYS|      MSc in Physics|    Hindu|   30|
|    COMPSC|MSc in Big Data A...|Not Known|    2|
|    COMPSC|MSc in Big Data A...|    Hindu|   12|
|    COMPSC|MSc in Computer S...|   Others|    1|
|    COMPSC|MSc in Computer S...|    Hindu|    5|
|      SANS|Integrated MPhil-...|    Hindu|    8|
|    SSYOGA|  PG Diploma in Yoga|Not Known|    1|
|    SSYOGA|  PG Diploma in Yoga|    Hindu|    2|
|    SSYOGA|         PhD in Yoga|    Hindu|    2|
|    SSYOGA|    MA / MSc in Yoga|    Hindu|    5|
|     MATHS|  MSc in Mathematics|Not Known|    3|
|     MATHS|  MSc in Mathematics|    Hindu|   19|
|      PHYS|      PhD in Physics|    Hindu|    1|
|    SSYOGA|Certificate Cours...|Not Known|    2|
|      SANS|Certificate Cours...|     null| null|


In [11]:
# Add an integer-typed copy of "Total" before persisting the joined table.
df_data2 = df_data.withColumn("total_new", col("Total").cast(IntegerType()))

# Round-trip the joined table through a single CSV file: the frame read back
# is backed by that file rather than by the upstream transformations.
# mode("overwrite") replaces the scratch directory left by a previous run; the
# default save mode raises when the path already exists, which made this cell
# fail on every re-execution.
df_data2.coalesce(1).write.mode("overwrite").option("header", True).csv('out/21-22/rel/tmp')
df_data3 = spark.read.option("header", True).csv("out/21-22/rel/tmp/*.csv")

# Values read back from CSV need casting again. Because the name "total"
# matches the existing "Total" column case-insensitively, withColumn replaces
# that column (now named "total") instead of appending a new one.
df_trimmed = df_data3.withColumn("total", col("total_new").cast(IntegerType()))
df_trimmed.show()

+----------+--------------------+---------+-----+---------+
|Department|           Programme| Religion|total|total_new|
+----------+--------------------+---------+-----+---------+
|      SANS|Integrated 5 Year...|    Hindu|   17|       17|
|      PHYS|      MSc in Physics|Not Known|    1|        1|
|      PHYS|      MSc in Physics|    Hindu|   30|       30|
|    COMPSC|MSc in Big Data A...|Not Known|    2|        2|
|    COMPSC|MSc in Big Data A...|    Hindu|   12|       12|
|    COMPSC|MSc in Computer S...|   Others|    1|        1|
|    COMPSC|MSc in Computer S...|    Hindu|    5|        5|
|      SANS|Integrated MPhil-...|    Hindu|    8|        8|
|    SSYOGA|  PG Diploma in Yoga|Not Known|    1|        1|
|    SSYOGA|  PG Diploma in Yoga|    Hindu|    2|        2|
|    SSYOGA|         PhD in Yoga|    Hindu|    2|        2|
|    SSYOGA|    MA / MSc in Yoga|    Hindu|    5|        5|
|     MATHS|  MSc in Mathematics|Not Known|    3|        3|
|     MATHS|  MSc in Mathematics|    Hin

In [13]:
# Split the table into one frame per religion label found in the data.
df_Hindu_data = df_trimmed.where(df_trimmed["Religion"] == "Hindu")
df_Others_data = df_trimmed.where(df_trimmed["Religion"] == "Others")
df_NK_data = df_trimmed.where(df_trimmed["Religion"] == "Not Known")

# Preview the "Hindu" subset with full-width columns.
df_Hindu_data.show(truncate=False)


+----------+-----------------------------------------------------------+--------+-----+---------+
|Department|Programme                                                  |Religion|total|total_new|
+----------+-----------------------------------------------------------+--------+-----+---------+
|SANS      |Integrated 5 Year MA in Sanskrit(M)                        |Hindu   |17   |17       |
|PHYS      |MSc in Physics                                             |Hindu   |30   |30       |
|COMPSC    |MSc in Big Data Analytics                                  |Hindu   |12   |12       |
|COMPSC    |MSc in Computer Science                                    |Hindu   |5    |5        |
|SANS      |Integrated MPhil-PhD in Sanskrit                           |Hindu   |8    |8        |
|SSYOGA    |PG Diploma in Yoga                                         |Hindu   |2    |2        |
|SSYOGA    |PhD in Yoga                                                |Hindu   |2    |2        |
|SSYOGA    |MA / MSc

In [15]:
# Aggregate each religion subset into a per-department head count.
# The resulting frames have two columns: "Department" and "sum(total)".
df_Hindu = df_Hindu_data.groupBy("Department").agg({"total": "sum"})
df_Others = df_Others_data.groupBy("Department").agg({"total": "sum"})
df_Nk = df_NK_data.groupBy("Department").agg({"total": "sum"})

# Preview the "Hindu" department totals.
df_Hindu.show(truncate=False)

+----------+----------+
|Department|sum(total)|
+----------+----------+
|COMPSC    |17        |
|MATHS     |19        |
|PHYS      |31        |
|SSYOGA    |9         |
|SANS      |31        |
+----------+----------+



In [16]:
# Export the per-department totals as a single CSV part file (no header row).
# Spark creates a directory at this path and writes the part file inside it.
# mode("overwrite") lets the cell be re-run; the default save mode raises
# when the output path already exists.
df_Hindu.coalesce(1).write.mode("overwrite").csv('out/21-22/rel/Hindu.csv')

In [17]:
# Export the remaining per-department totals, one single-part CSV directory
# per religion label (no header row).
# mode("overwrite") lets the cell be re-run; the default save mode raises
# when the output path already exists.
df_Others.coalesce(1).write.mode("overwrite").csv('out/21-22/rel/Others.csv')
df_Nk.coalesce(1).write.mode("overwrite").csv('out/21-22/rel/Nk.csv')