In [1]:
# Imported here so the notebook runs on a fresh kernel; the original cell relied on
# SparkSession already being present in the namespace.
from pyspark.sql import SparkSession

# Local session restricted to a single worker thread ("local[1]").
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

# The first CSV line is used as the header. No schema is supplied or inferred, so every
# column comes back as a string (see the printSchema output below).
df = spark.read.option("header", True).csv("data/Programme_Wise_Graduated_Student_Category_Stats-18-19.csv")
df.printSchema()

root
 |-- Programme: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Male: string (nullable = true)
 |-- Female: string (nullable = true)
 |-- Others: string (nullable = true)
 |-- Total: string (nullable = true)



In [2]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType,BooleanType,DateType,IntegerType
# "Total" is read from the CSV as a string; add an integer copy so it can be summed.
df2 = df.withColumn(
    colName="total_new",
    col=col("Total").cast(IntegerType()),
)

In [3]:
# Sum of total_new per programme. Rows whose Programme value is null are grouped
# together under a single null key (visible in the output below).
(
    df2
    .groupBy("programme")
    .sum("total_new")
    .show(truncate=False)
)

+---------------------------------------------------------+--------------+
|programme                                                |sum(total_new)|
+---------------------------------------------------------+--------------+
|Total                                                    |332           |
|MSc in Agriculture and Rural Development                 |14            |
|PG Diploma in Post Harvest & Food Technology             |6             |
|B.Ed in Special Education (HI)                           |2             |
|MSc in Physics                                           |11            |
|BSc in Physical Education, Health Education and Sports   |1             |
|null                                                     |161           |
|PG Diploma in Yoga                                       |4             |
|MSc in Mathematics                                       |8             |
|Integrated MPhil-PhD in Sanskrit                         |1             |
|MSc in Agricultural Biot

In [4]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
from pyspark.sql import Window
from pyspark.sql.functions import last, monotonically_increasing_id, when

# Forward-fill the Programme column into a new "Name" column: a row with a missing
# Programme inherits the most recent non-missing Programme above it.
#
# This replaces a Python UDF that remembered the previous value in a mutable global.
# That approach only produced the right answer when every row was evaluated in order
# inside one worker process, and it returned a non-string object when the very first
# row was blank. A window function makes the ordering explicit and stays inside Spark.

# Temporary column that captures the current row order.
# NOTE(review): monotonically_increasing_id() follows partition order and the row order
# within each partition; this is assumed to match the CSV line order for this
# single-file read — confirm if the input ever becomes multi-file.
_ROW_ORDER = "_row_order"

# Treat empty strings the same as nulls, matching the falsy check the UDF used.
programme_or_null = when(col("Programme") != "", col("Programme"))

# Every row from the start of the data up to and including the current row.
# An un-partitioned window pulls all rows into one partition, which is fine for a
# dataset of this size but would not scale to large inputs.
fill_window = (
    Window.orderBy(_ROW_ORDER)
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

df3 = (
    df2
    .withColumn(_ROW_ORDER, monotonically_increasing_id())
    # last(..., ignorenulls=True) yields the latest non-null programme seen so far;
    # it stays null if no programme has appeared yet.
    .withColumn("Name", last(programme_or_null, ignorenulls=True).over(fill_window))
    .drop(_ROW_ORDER)
)
df3.show(truncate=False)

+------------------------------------------+--------+----+------+------+-----+---------+------------------------------------------+
|Programme                                 |Category|Male|Female|Others|Total|total_new|Name                                      |
+------------------------------------------+--------+----+------+------+-----+---------+------------------------------------------+
|MSc in Physics                            |GENERAL |11  |0     |0     |11   |11       |MSc in Physics                            |
|Integrated 5 Year MA in Sanskrit(M)       |GENERAL |8   |0     |0     |8    |8        |Integrated 5 Year MA in Sanskrit(M)       |
|null                                      |OBC     |6   |0     |0     |6    |6        |Integrated 5 Year MA in Sanskrit(M)       |
|null                                      |SC      |2   |0     |0     |2    |2        |Integrated 5 Year MA in Sanskrit(M)       |
|MSc in Mathematics                        |GENERAL |5   |3     |0     |8   

In [5]:
# Keep only the filled-in programme name, the category and the raw (string) total.
df4 = df3.select(["Name", "Category", "Total"])
df4.show(truncate=False)

+------------------------------------------+--------+-----+
|Name                                      |Category|Total|
+------------------------------------------+--------+-----+
|MSc in Physics                            |GENERAL |11   |
|Integrated 5 Year MA in Sanskrit(M)       |GENERAL |8    |
|Integrated 5 Year MA in Sanskrit(M)       |OBC     |6    |
|Integrated 5 Year MA in Sanskrit(M)       |SC      |2    |
|MSc in Mathematics                        |GENERAL |8    |
|MSc in Mathematics                        |OBC     |2    |
|MSc in Computer Science                   |GENERAL |4    |
|MSc in Computer Science                   |SC      |1    |
|MSc in Big Data Analytics                 |GENERAL |7    |
|Integrated MPhil-PhD in Sanskrit          |GENERAL |1    |
|PhD in Physics                            |GENERAL |2    |
|PG Diploma in Yoga                        |GENERAL |4    |
|PG Diploma in Yoga                        |OBC     |3    |
|PG Diploma in Yoga                     

In [6]:
# Load the programme list, which carries the Department for each Programme.
# The first line of the file is used as the header.
df_dept = spark.read.csv("data/Programme_List.csv", header=True)
df_dept.show()

+--------------------+-------------+-------------+-------------+------------+----------+--------+--------------+--------------+
|           Programme|         Code|Duration (Yr)|Maximum Marks|      Campus|Department|  Degree|Faculty Center|No of Students|
+--------------------+-------------+-------------+-------------+------------+----------+--------+--------------+--------------+
|Integrated 5 Year...|      SAN-MA5|        5 - 6|          100|Belur Campus|      SANS|5YrIntMA|          null|            73|
|      MSc in Physics|         PHY1|        2 - 2|          100|Belur Campus|      PHYS|     MSC|          null|            63|
|MSc in Big Data A...|          BDA|        2 - 3|          100|Belur Campus|    COMPSC|     MSC|          null|            37|
|MSc in Computer S...|          CS1|        2 - 3|          100|Belur Campus|    COMPSC|     MSC|          null|            24|
|Integrated MPhil-...|SANS-Intd.PHD|       5 - 10|          100|Belur Campus|      SANS|     PhD|       

In [7]:
# Attach programme metadata to each statistics row. An inner join keeps only the
# rows whose Name has a matching Programme in the programme list.
df_dept_stat = df4.join(
    df_dept,
    on=df4["Name"] == df_dept["Programme"],
    how="inner",
)

In [8]:
# Narrow the joined frame to the columns used by the per-department breakdown.
df_data = df_dept_stat.select(["Department", "Programme", "Category", "Total"])
df_data.show()

+----------+--------------------+--------+-----+
|Department|           Programme|Category|Total|
+----------+--------------------+--------+-----+
|      SANS|Integrated 5 Year...|      SC|    2|
|      SANS|Integrated 5 Year...|     OBC|    6|
|      SANS|Integrated 5 Year...| GENERAL|    8|
|      PHYS|      MSc in Physics| GENERAL|   11|
|    COMPSC|MSc in Big Data A...| GENERAL|    7|
|    COMPSC|MSc in Computer S...|      SC|    1|
|    COMPSC|MSc in Computer S...| GENERAL|    4|
|      SANS|Integrated MPhil-...| GENERAL|    1|
|    SSYOGA|  PG Diploma in Yoga|      SC|    5|
|    SSYOGA|  PG Diploma in Yoga|     OBC|    3|
|    SSYOGA|  PG Diploma in Yoga| GENERAL|    4|
|     MATHS|  MSc in Mathematics|     OBC|    2|
|     MATHS|  MSc in Mathematics| GENERAL|    8|
|      PHYS|      PhD in Physics| GENERAL|    2|
+----------+--------------------+--------+-----+



In [9]:
# Add an integer copy of the string-typed "Total" column.
df_data2 = df_data.withColumn("total_new", col("Total").cast(IntegerType()))

# Persist the joined data as a single CSV part file and read it back.
# mode("overwrite") lets this cell be re-run: without it the write fails as soon as
# out/18-19/tmp exists from a previous run.
df_data2.coalesce(1).write.mode("overwrite").option("header", True).csv('out/18-19/tmp')
df_data3 = spark.read.option("header", True).csv("out/18-19/tmp/*.csv")

# The CSV read returns strings again, so re-cast total_new into an integer "total"
# column (the output below shows it in place of the original "Total" column).
df_trimmed = df_data3.withColumn("total", col("total_new").cast(IntegerType()))
df_trimmed.show()

+----------+--------------------+--------+-----+---------+
|Department|           Programme|Category|total|total_new|
+----------+--------------------+--------+-----+---------+
|      SANS|Integrated 5 Year...|      SC|    2|        2|
|      SANS|Integrated 5 Year...|     OBC|    6|        6|
|      SANS|Integrated 5 Year...| GENERAL|    8|        8|
|      PHYS|      MSc in Physics| GENERAL|   11|       11|
|    COMPSC|MSc in Big Data A...| GENERAL|    7|        7|
|    COMPSC|MSc in Computer S...|      SC|    1|        1|
|    COMPSC|MSc in Computer S...| GENERAL|    4|        4|
|      SANS|Integrated MPhil-...| GENERAL|    1|        1|
|    SSYOGA|  PG Diploma in Yoga|      SC|    5|        5|
|    SSYOGA|  PG Diploma in Yoga|     OBC|    3|        3|
|    SSYOGA|  PG Diploma in Yoga| GENERAL|    4|        4|
|     MATHS|  MSc in Mathematics|     OBC|    2|        2|
|     MATHS|  MSc in Mathematics| GENERAL|    8|        8|
|      PHYS|      PhD in Physics| GENERAL|    2|        

In [10]:
# One filtered frame per category, in the same order as the names on the left.
df_Gen_data, df_Gen_Ews_data, df_Sc_data, df_St_data, df_Obc_data = (
    df_trimmed.where(df_trimmed["Category"] == label)
    for label in ("GENERAL", "GENERAL-EWS", "SC", "ST", "OBC")
)

#df_Gen_data.show(truncate=False)
#df_Gen_Ews_data.show()
#df_Sc_data.show()
#df_St_data.show()
#df_Obc_data.show()

In [11]:
# Sum "total" per Department for each category frame; the result column is "sum(total)".
df_Gen, df_Gen_Ews, df_Sc, df_St, df_Obc = (
    subset.groupBy("Department").sum("total")
    for subset in (df_Gen_data, df_Gen_Ews_data, df_Sc_data, df_St_data, df_Obc_data)
)

# The ST frame is empty for this dataset (see output below).
df_St.show(truncate=False)

+----------+----------+
|Department|sum(total)|
+----------+----------+
+----------+----------+



In [12]:
# Write the GENERAL per-department totals as a single part file. Spark creates a
# directory at this path (despite the .csv suffix) and writes no header row.
# mode("overwrite") makes the cell re-runnable; without it the write fails once the
# path exists from an earlier run.
df_Gen.coalesce(1).write.mode("overwrite").csv('out/18-19/Gen.csv')

In [13]:
# Write the remaining per-department totals, one single-part output directory per
# category, without header rows. mode("overwrite") makes the cell re-runnable;
# without it each write fails once its path exists from an earlier run.
df_Gen_Ews.coalesce(1).write.mode("overwrite").csv('out/18-19/Gen_Ews.csv')
df_Sc.coalesce(1).write.mode("overwrite").csv('out/18-19/Sc.csv')
df_St.coalesce(1).write.mode("overwrite").csv('out/18-19/St.csv')
df_Obc.coalesce(1).write.mode("overwrite").csv('out/18-19/Obc.csv')