In [124]:
# Start (or reuse) a single-threaded local Spark session for this notebook.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

# Load the per-programme graduate statistics; the first CSV line is the header.
# No schema is supplied, so every column is read as a string (see printSchema output).
df = (
    spark.read
    .option("header", True)
    .csv("data/Programme_Wise_Graduated_Student_Category_Stats-19-20.csv")
)
df.printSchema()

root
 |-- Programme: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Male: string (nullable = true)
 |-- Female: string (nullable = true)
 |-- Others: string (nullable = true)
 |-- Total: string (nullable = true)



In [125]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType,BooleanType,DateType,IntegerType
# "Total" is read from the CSV as a string; add an integer copy that can be summed.
df2 = df.withColumn(
    "total_new",
    col("Total").cast(IntegerType()),
)

In [126]:
# Sum the integer totals per programme.
# NOTE(review): the displayed result contains a null key and a "Total" row, so the
# raw Programme column is not yet a clean grouping key at this point.
(
    df2
    .groupBy("programme")
    .sum("total_new")
    .show(truncate=False)
)

+---------------------------------------------------------+--------------+
|programme                                                |sum(total_new)|
+---------------------------------------------------------+--------------+
|Total                                                    |541           |
|MSc in Agriculture and Rural Development                 |15            |
|PG Diploma in Post Harvest & Food Technology             |17            |
|Diploma in Organic Farming                               |3             |
|B.Ed in Special Education (HI)                           |1             |
|MSc in Physics                                           |26            |
|BSc in Physical Education, Health Education and Sports   |1             |
|PhD in Medical Biotechnology                             |1             |
|null                                                     |238           |
|Intd B.Ed-M.Ed Special Education(ID)                     |3             |
|PG Diploma in Yoga      

In [127]:
from pyspark.sql import Window
from pyspark.sql.functions import col, last, monotonically_increasing_id, udf
from pyspark.sql.types import StringType

# Forward-fill the Programme column into a new "Name" column.
#
# In the source CSV only the first row of each programme carries the programme
# name; the following category rows leave it blank (null). Each blank is filled
# with the most recent non-null Programme above it.
#
# This replaces the earlier UDF that remembered the last seen name in a Python
# global. That approach depended on rows being evaluated in file order inside a
# single Python worker, and it returned a Spark Accumulator object (not a string)
# if the very first Programme was blank. A window function states the ordering
# explicitly and needs no hidden state.

# Tag each row with an increasing id so the window has an explicit ordering.
# NOTE(review): assumes the ids follow the CSV's line order, which holds for a
# plain file read like the one that produced `df` - confirm if the load changes.
df2_with_row_id = df2.withColumn("_row_id", monotonically_increasing_id())

# Every row from the start of the data up to and including the current row.
# There is no partitionBy, so Spark gathers all rows into one partition for this
# step; acceptable for a small dataset like this one.
fill_window = (
    Window
    .orderBy("_row_id")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

df3 = (
    df2_with_row_id
    # last(..., ignorenulls=True) picks the latest non-null Programme in the frame.
    .withColumn("Name", last(col("Programme"), ignorenulls=True).over(fill_window))
    # The helper id is only needed for ordering; drop it to keep the original columns.
    .drop("_row_id")
)
df3.show(truncate=False)

+-----------------------------------+-----------+----+------+------+-----+---------+-----------------------------------+
|Programme                          |Category   |Male|Female|Others|Total|total_new|Name                               |
+-----------------------------------+-----------+----+------+------+-----+---------+-----------------------------------+
|MSc in Computer Science            |GENERAL    |5   |0     |0     |5    |5        |MSc in Computer Science            |
|null                               |GENERAL-EWS|1   |0     |0     |1    |1        |MSc in Computer Science            |
|MSc in Physics                     |GENERAL    |19  |7     |0     |26   |26       |MSc in Physics                     |
|null                               |OBC        |1   |1     |0     |2    |2        |MSc in Physics                     |
|null                               |SC         |3   |0     |0     |3    |3        |MSc in Physics                     |
|MA / MSc in Yoga               

In [128]:
# Keep only the filled-in programme name, the category and the (string) total.
df4 = df3.select(
    "Name",
    "Category",
    "Total",
)
df4.show(truncate=False)

+-----------------------------------+-----------+-----+
|Name                               |Category   |Total|
+-----------------------------------+-----------+-----+
|MSc in Computer Science            |GENERAL    |5    |
|MSc in Computer Science            |GENERAL-EWS|1    |
|MSc in Physics                     |GENERAL    |26   |
|MSc in Physics                     |OBC        |2    |
|MSc in Physics                     |SC         |3    |
|MA / MSc in Yoga                   |GENERAL    |2    |
|MA / MSc in Yoga                   |SC         |2    |
|MA / MSc in Yoga                   |ST         |1    |
|Integrated 5 Year MA in Sanskrit(M)|GENERAL    |9    |
|Integrated 5 Year MA in Sanskrit(M)|OBC        |2    |
|Integrated 5 Year MA in Sanskrit(M)|SC         |6    |
|MSc in Big Data Analytics          |GENERAL    |11   |
|MSc in Big Data Analytics          |GENERAL-EWS|2    |
|MSc in Big Data Analytics          |SC         |1    |
|MSc in Mathematics                 |GENERAL    

In [129]:
# Load the programme master list (one row per programme, including its Department).
df_dept = (
    spark.read
    .option("header", True)
    .csv("data/Programme_List.csv")
)
df_dept.show()

+--------------------+-------------+-------------+-------------+------------+----------+--------+--------------+--------------+
|           Programme|         Code|Duration (Yr)|Maximum Marks|      Campus|Department|  Degree|Faculty Center|No of Students|
+--------------------+-------------+-------------+-------------+------------+----------+--------+--------------+--------------+
|Integrated 5 Year...|      SAN-MA5|        5 - 6|          100|Belur Campus|      SANS|5YrIntMA|          null|            73|
|      MSc in Physics|         PHY1|        2 - 2|          100|Belur Campus|      PHYS|     MSC|          null|            63|
|MSc in Big Data A...|          BDA|        2 - 3|          100|Belur Campus|    COMPSC|     MSC|          null|            37|
|MSc in Computer S...|          CS1|        2 - 3|          100|Belur Campus|    COMPSC|     MSC|          null|            24|
|Integrated MPhil-...|SANS-Intd.PHD|       5 - 10|          100|Belur Campus|      SANS|     PhD|       

In [130]:
# Attach programme details to each statistics row by matching on the programme name.
# Inner join: rows whose Name has no exact match in Programme_List are dropped.
df_dept_stat = df4.join(
    df_dept,
    on=df4["Name"] == df_dept["Programme"],
    how="inner",
)

In [131]:
# Narrow the joined frame to the columns needed for per-department totals.
df_data = df_dept_stat.select(
    "Department",
    "Programme",
    "Category",
    "Total",
)
df_data.show()

+----------+--------------------+-----------+-----+
|Department|           Programme|   Category|Total|
+----------+--------------------+-----------+-----+
|      SANS|Integrated 5 Year...|         SC|    6|
|      SANS|Integrated 5 Year...|        OBC|    2|
|      SANS|Integrated 5 Year...|    GENERAL|    9|
|      PHYS|      MSc in Physics|         SC|    3|
|      PHYS|      MSc in Physics|        OBC|    2|
|      PHYS|      MSc in Physics|    GENERAL|   26|
|    COMPSC|MSc in Big Data A...|         SC|    1|
|    COMPSC|MSc in Big Data A...|GENERAL-EWS|    2|
|    COMPSC|MSc in Big Data A...|    GENERAL|   11|
|    COMPSC|MSc in Computer S...|GENERAL-EWS|    1|
|    COMPSC|MSc in Computer S...|    GENERAL|    5|
|      SANS|Integrated MPhil-...|         SC|    1|
|      SANS|Integrated MPhil-...|        OBC|    5|
|      SANS|Integrated MPhil-...|    GENERAL|    2|
|    SSYOGA|  PG Diploma in Yoga|        OBC|    1|
|    SSYOGA|  PG Diploma in Yoga|    GENERAL|    2|
|    SSYOGA|

In [132]:
# Add an integer copy of Total, then materialise the joined data to disk and read
# it back so later cells work from a plain, already-computed table.
df_data2 = df_data.withColumn("total_new", col("Total").cast(IntegerType()))

# mode("overwrite") makes this cell re-runnable: without an explicit save mode the
# write raises an error as soon as out/19-20/tmp already exists from an earlier run.
(
    df_data2
    .coalesce(1)  # single part file, so the glob below matches one CSV
    .write
    .mode("overwrite")
    .option("header", True)
    .csv('out/19-20/tmp')
)
df_data3 = spark.read.option("header", True).csv("out/19-20/tmp/*.csv")

# Everything read back from CSV is a string again, so cast once more. The displayed
# result shows a single integer `total` column where the string `Total` used to be.
df_trimmed = df_data3.withColumn("total", col("total_new").cast(IntegerType()))
df_trimmed.show()

+----------+--------------------+-----------+-----+---------+
|Department|           Programme|   Category|total|total_new|
+----------+--------------------+-----------+-----+---------+
|      SANS|Integrated 5 Year...|         SC|    6|        6|
|      SANS|Integrated 5 Year...|        OBC|    2|        2|
|      SANS|Integrated 5 Year...|    GENERAL|    9|        9|
|      PHYS|      MSc in Physics|         SC|    3|        3|
|      PHYS|      MSc in Physics|        OBC|    2|        2|
|      PHYS|      MSc in Physics|    GENERAL|   26|       26|
|    COMPSC|MSc in Big Data A...|         SC|    1|        1|
|    COMPSC|MSc in Big Data A...|GENERAL-EWS|    2|        2|
|    COMPSC|MSc in Big Data A...|    GENERAL|   11|       11|
|    COMPSC|MSc in Computer S...|GENERAL-EWS|    1|        1|
|    COMPSC|MSc in Computer S...|    GENERAL|    5|        5|
|      SANS|Integrated MPhil-...|         SC|    1|        1|
|      SANS|Integrated MPhil-...|        OBC|    5|        5|
|      S

In [133]:
# Split the table into one DataFrame per reservation category.
# The order of the labels matches the order of the names on the left-hand side.
df_Gen_data, df_Gen_Ews_data, df_Sc_data, df_St_data, df_Obc_data = [
    df_trimmed.filter(df_trimmed["Category"] == label)
    for label in ("GENERAL", "GENERAL-EWS", "SC", "ST", "OBC")
]

#df_Gen_data.show(truncate=False)
#df_Gen_Ews_data.show()
#df_Sc_data.show()
#df_St_data.show()
#df_Obc_data.show()

In [134]:
# Total graduates per department, computed separately for each category subset.
# Each result has two columns: Department and sum(total).
df_Gen, df_Gen_Ews, df_Sc, df_St, df_Obc = [
    subset.groupBy("Department").sum("total")
    for subset in (df_Gen_data, df_Gen_Ews_data, df_Sc_data, df_St_data, df_Obc_data)
]

# Spot-check the smallest category.
df_St.show(truncate=False)

+----------+----------+
|Department|sum(total)|
+----------+----------+
|SSYOGA    |1         |
|SANS      |1         |
+----------+----------+



In [135]:
# Export the GENERAL per-department totals as a single part file.
# mode("overwrite") keeps the cell re-runnable; without a save mode the write
# fails once the output path exists. The file has no header row; its columns are
# Department, sum(total).
df_Gen.coalesce(1).write.mode("overwrite").csv('out/19-20/Gen.csv')

In [136]:
# Export the remaining per-department totals, one output path per category.
# mode("overwrite") keeps the cell re-runnable; without a save mode each write
# fails once its output path exists. The files have no header row; their columns
# are Department, sum(total).
df_Gen_Ews.coalesce(1).write.mode("overwrite").csv('out/19-20/Gen_Ews.csv')
df_Sc.coalesce(1).write.mode("overwrite").csv('out/19-20/Sc.csv')
df_St.coalesce(1).write.mode("overwrite").csv('out/19-20/St.csv')
df_Obc.coalesce(1).write.mode("overwrite").csv('out/19-20/Obc.csv')