## Setup

In [1]:
import pyspark
from pyspark.sql import SparkSession, SQLContext, Row
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("WeatherAnalysis")
    .getOrCreate()
)

In [3]:
# Display the session object as the cell output
spark

## Loading WeatherStations Data

In [4]:
# Column names and types of the weather-stations CSV, in file order
weatherStationsSchema = T.StructType([
    T.StructField(columnName, columnType)
    for columnName, columnType in (
        ("State", T.StringType()),
        ("District", T.StringType()),
        ("Latitude", T.DoubleType()),
        ("Longitude", T.DoubleType()),
        ("StationID", T.StringType()),
    )
])

In [5]:
# Read the station lookup CSV (first line is a header) using the explicit schema
weatherStations = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(weatherStationsSchema)
    .load("Datasets/Output/WeatherStationsId.csv")
)

In [6]:
# Confirm the columns came through with the types declared in the schema
weatherStations.printSchema()

root
 |-- State: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- StationID: string (nullable = true)



In [7]:
# Preview the loaded station rows
weatherStations.show()

+---------------+--------------+------------------+---------+---------+
|          State|      District|          Latitude|Longitude|StationID|
+---------------+--------------+------------------+---------+---------+
|      TELANGANA|      NALGONDA|  16.3920001983643|   79.375|   164794|
|          BIHAR|CHAMPARAN.WEST|  27.3199996948242|  83.4375|   273834|
|          BIHAR|   MUZAFFARPUR|  25.4466991424561|  85.3125|   254853|
|          BIHAR|    SAMASTIPUR|  25.4466991424561|  85.3125|   254853|
|        GUJARAT|     AHMEDABAD|  22.9487991333008|     72.5|   229725|
|      KARNATAKA|     BAGALKOTE|  16.0797996520996|   75.625|   161756|
|      KARNATAKA|BANGALORE URBA|12.957500457763699|     77.5|   130775|
|         KERALA|        IDUKKI|10.147399902343802|  77.1875|   101772|
|MADHAYA PRADESH|   NARSINGHPUR|  7.02514982223511|   79.375|    70794|
|    MAHARASHTRA|    AHMEDNAGAR|   18.577600479126|  75.3125|   186753|
|    MAHARASHTRA|       YEOTMAL|  20.1387996673584|  78.4375|   

## Get list of relevant data files to read
~~~Text
The weather information for each station is in a separate file.<br>
Generating list of data files to read.<br>

Each filename is of the format:
            weatherdata-<id>.csv
where id is the unique identifier
~~~

In [8]:
# Single-column DataFrame holding each StationID exactly once
stationIDs = (
    weatherStations
    .select(F.col("StationID"))
    .dropDuplicates()
)

In [9]:
# Number of distinct StationID values found in the station table
stationIDs.count()

1340

In [10]:
# Pull the distinct IDs to the driver as a plain Python list.
# NOTE: this rebinds `stationIDs` from a DataFrame to a list, so the cell that
# builds the DataFrame must be re-run before this cell can be run again.
stationIDs = [row.StationID for row in stationIDs.collect()]

In [11]:
# Peek at the first five station IDs
stationIDs[:5]

['151772', '220725', '139744', '192750', '245841']

In [12]:
# One data file name per station ID, in the same order as stationIDs
filenameTemplate = "weatherdata-{ID}.csv"
filenames = [filenameTemplate.format(ID=sid) for sid in stationIDs]

In [13]:
# Peek at the first five generated file names
filenames[:5]

['weatherdata-151772.csv',
 'weatherdata-220725.csv',
 'weatherdata-139744.csv',
 'weatherdata-192750.csv',
 'weatherdata-245841.csv']

## Loading WeatherData files

In [14]:
#Function to format date to required format
def formatDate(date):
    """Convert a "M/D/YYYY" date string to zero-padded "YYYY-MM-DD".

    Parameters
    ----------
    date : str or None
        Date text with month, day and year separated by "/",
        e.g. "1/2/2000". Month and day may be one or two digits.

    Returns
    -------
    str or None
        The date as "YYYY-MM-DD" (e.g. "2000-01-02"), or None when the
        input is None.

    Raises
    ------
    ValueError
        If the text does not split into exactly three "/"-separated parts.
    """
    # A null cell arrives here as None; pass it through instead of failing
    # on None.split(...).
    if date is None:
        return None

    month, day, year = date.split("/")

    # zfill(2) left-pads single-digit month/day with a zero.
    return "-".join([year, month.zfill(2), day.zfill(2)])

In [15]:
# Wrap the Python date formatter as a Spark UDF that returns a string column.
# NOTE: this rebinds the name `formatDate` to the UDF, so the cell that defines
# the plain function must be re-run before this cell is run again.
formatDate = F.udf(f=formatDate, returnType=T.StringType())

In [16]:
# Try the UDF and the column transformations on the first station file only
weatherDataDF = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("Datasets/2000_2013/files/" + filenames[0])
    # keep Date as text for now; cast the four measurements to double
    .select(
        "Date",
        *[F.col(measurement).cast(T.DoubleType())
          for measurement in ("Max Temperature", "Min Temperature",
                              "Precipitation", "Relative Humidity")]
    )
    # tag every row with the station this file belongs to
    .withColumn("StationId", F.lit(stationIDs[0]))
    # normalise the date text via the UDF, then parse it into a date column
    .withColumn("Date", F.to_date(formatDate(F.col("Date")), format="yyyy-MM-dd"))
)


In [17]:
# Check the schema: Date should now be a date and the measurements doubles
weatherDataDF.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Max Temperature: double (nullable = true)
 |-- Min Temperature: double (nullable = true)
 |-- Precipitation: double (nullable = true)
 |-- Relative Humidity: double (nullable = true)
 |-- StationId: string (nullable = false)



In [18]:
# Summary statistics (count, mean, stddev, min, max) as a sanity check
weatherDataDF.describe().show()

+-------+-----------------+------------------+-----------------+-------------------+---------+
|summary|  Max Temperature|   Min Temperature|    Precipitation|  Relative Humidity|StationId|
+-------+-----------------+------------------+-----------------+-------------------+---------+
|  count|             5114|              5114|             5114|               5114|     5114|
|   mean|33.54038815017596|21.566215095815412|1.947447703802815| 0.5326649542642927| 151772.0|
| stddev|4.450241019026345|3.7775547416558575|6.534890647535881|0.18298700348943317|      0.0|
|    min|           21.061|            10.409|              0.0| 0.0889887002959665|   151772|
|    max|           46.742|            30.337|      142.3073088|  0.953042742704384|   151772|
+-------+-----------------+------------------+-----------------+-------------------+---------+



In [19]:
# Preview the transformed rows for the sample station
weatherDataDF.show()

+----------+---------------+---------------+-------------+-----------------+---------+
|      Date|Max Temperature|Min Temperature|Precipitation|Relative Humidity|StationId|
+----------+---------------+---------------+-------------+-----------------+---------+
|2000-01-01|         28.874|         14.754|          0.0|0.529431046214477|   151772|
|2000-01-02|         29.225|         14.758|          0.0| 0.57896803144991|   151772|
|2000-01-03|         29.165|         14.417|          0.0|0.619197799378162|   151772|
|2000-01-04|         29.341|         16.189|          0.0|0.604262783080777|   151772|
|2000-01-05|         29.453|         16.164|          0.0|0.590774879511934|   151772|
|2000-01-06|         30.316|         16.093|          0.0|0.553721838117377|   151772|
|2000-01-07|         30.326|         14.239|          0.0|0.299192999999843|   151772|
|2000-01-08|         29.333|         15.986|  0.291824604|0.606178057238691|   151772|
|2000-01-09|         31.004|         15.086

In [20]:
# Combine every station's weather file into one CSV dataset.
#
# All files are read in a single Spark job and written once, instead of one
# read + one append per file. Writing with mode="overwrite" makes the cell safe
# to re-run: the previous per-file "append" duplicated every row on a second run.
# Spark writes a directory of part files at the output path, not a single file.
#
# NOTE(review): reading the files together assumes they all share the same
# header/column layout - confirm against the source data.
weatherDataPaths = ["Datasets/2000_2013/files/" + file for file in filenames]

allWeatherDataDF = (
    spark.read.load(path=weatherDataPaths, format="csv", header=True)
    # recover the station ID from the name of the file each row was read from
    # (files are named weatherdata-<id>.csv)
    .withColumn(
        "StationId",
        F.regexp_extract(F.input_file_name(), r"weatherdata-([^/]+)\.csv$", 1),
    )
    # same column order as the single-file test: Date, measurements, StationId
    .select(
        "Date",
        F.col("Max Temperature").cast(T.DoubleType()),
        F.col("Min Temperature").cast(T.DoubleType()),
        F.col("Precipitation").cast(T.DoubleType()),
        F.col("Relative Humidity").cast(T.DoubleType()),
        "StationId",
    )
    # normalise the date text via the UDF, then parse it into a date column
    .withColumn("Date", F.to_date(formatDate(F.col("Date")), format="yyyy-MM-dd"))
)

# single write of the combined data
allWeatherDataDF.write.csv(path="Datasets/Output/WeatherData.csv",
                           mode="overwrite", header=True)