# ANOVOS - TS Analyzer
**The following notebook demonstrates the functions of the "ts_analyzer" module provided in the ANOVOS package**
- [ts_processed_feats](#ts_processed_feats)
- [ts_eligiblity_check](#ts_eligiblity_check)
- [ts_viz_data](#ts_viz_data)
- [ts_analyzer](#ts_analyzer)

The API specification of the **ts_analyzer** module can be found here: [API Specification](https://docs.anovos.ai/api/data_analyzer/ts_analyzer.html)

**Setting Spark Session**

In [1]:
# Execution environment selector. The Spark-session cell branches on this
# value: "ak8s" builds a Kubernetes-backed session, any other value imports
# the session from anovos.shared.spark. It is also forwarded to ts_preprocess
# and ts_analyzer.
run_type = "local" # "local", "emr", "databricks", "ak8s"

In [3]:
#For run_type Azure Kubernetes, run the following block 
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

if run_type == "ak8s":
    # Placeholders: every "<insert ...>" value must be filled in before this
    # branch can run. Do not commit a real SAS token into the notebook.
    fs_path="<insert conf spark.hadoop.fs master url here> ex: spark.hadoop.fs.azure.sas.<container>.<account_name>.blob.core.windows.net"
    auth_key="<insert value of sas_token here>"
    master_url="<insert kubernetes master url path here> ex: k8s://"
    docker_image="<insert name docker image here>"
    kubernetes_namespace ="<insert kubernetes namespace here>"

    # Create Spark config for our Kubernetes based cluster manager
    sparkConf = SparkConf()
    sparkConf.setMaster(master_url)
    sparkConf.setAppName("Anovos_pipeline")
    sparkConf.set("spark.submit.deployMode","client")
    sparkConf.set("spark.kubernetes.container.image", docker_image)
    sparkConf.set("spark.kubernetes.namespace", kubernetes_namespace)
    sparkConf.set("spark.executor.instances", "4")
    sparkConf.set("spark.executor.cores", "4")
    sparkConf.set("spark.executor.memory", "16g")
    sparkConf.set("spark.kubernetes.pyspark.pythonVersion", "3")
    sparkConf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
    # Storage credential: the config key is the fs_path placeholder above and
    # the value is the SAS token.
    sparkConf.set(fs_path,auth_key)
    sparkConf.set("spark.kubernetes.authenticate.serviceAccountName", "spark")
    sparkConf.set("spark.jars.packages", "org.apache.hadoop:hadoop-azure:3.2.0,com.microsoft.azure:azure-storage:8.6.3,io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20,org.apache.spark:spark-avro_2.12:3.2.1")

    # Initialize our Spark cluster, this will actually
    # generate the worker nodes.
    spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
    sc = spark.sparkContext

# For all other run types, reuse the session objects created by anovos.shared.spark.
else:
    # Explicit names instead of `import *`, so it is clear where `spark` and
    # `sc` come from and the notebook namespace is not polluted. This mirrors
    # the two names defined by the "ak8s" branch above.
    from anovos.shared.spark import spark, sc
    # No storage token is needed outside ak8s; downstream calls still expect
    # an auth_key argument, so pass the "NA" sentinel.
    auth_key = "NA"

**Input/Output Path**

In [4]:
# Location of the input CSV data, relative to this notebook.
inputPath = "../data/time_series_data/csv"
# Base directory for ts_analyzer results; the "daily" and "weekly" examples
# below each write into their own sub-directory of this path.
outputPath = "../output/time_series_data/ts_analyzer"

In [5]:
from anovos.data_ingest.data_ingest import read_dataset
from anovos.data_ingest.ts_auto_detection import ts_preprocess
from anovos.shared.utils import ends_with

In [6]:
# Load the raw CSV input: first row is the header, fields are comma-separated,
# and Spark infers the column types.
df = read_dataset(
    spark,
    file_path=inputPath,
    file_type="csv",
    file_configs={"header": "True", "delimiter": ",", "inferSchema": "True"},
)
# Preview only: limit on the Spark side so that just 5 rows are collected to
# the driver, instead of converting the whole dataset to pandas and then
# discarding all but the first 5 rows.
df.limit(5).toPandas()

Unnamed: 0,STATE,YR,P_CAP,HWY,WATER,UTIL,PC,GSP,EMP,UNEMP
0,ALABAMA,1970,15032.67,7325.8,1655.68,6051.2,35793.8,28418,1010.5,4.7
1,ALABAMA,1971,15501.94,7525.94,1721.02,6254.98,37299.91,29375,1021.9,5.2
2,ALABAMA,1972,15972.41,7765.42,1764.75,6442.23,38670.3,31303,1072.3,4.7
3,ALABAMA,1973,16406.26,7907.66,1742.41,6756.19,40084.01,33430,1135.5,3.9
4,ALABAMA,1974,16762.67,8025.52,1734.85,7002.29,42057.31,33749,1169.8,5.5


In [7]:
# Run the Anovos time-series auto-detection preprocessing on the raw frame.
# In the output shown below, YR changes from a plain year number to a date.
df_preprocess = ts_preprocess(
    spark,
    idf=df,
    id_col="STATE",
    output_path="../output/time_series_data/ts_autodetection",
    tz_offset="local",
    run_type=run_type,
    auth_key=auth_key,
)
df_preprocess.toPandas()

                                                                                

Unnamed: 0,STATE,P_CAP,HWY,WATER,UTIL,PC,GSP,EMP,UNEMP,YR
0,ALABAMA,15032.67,7325.80,1655.68,6051.20,35793.80,28418,1010.5,4.7,1970-01-01
1,ALABAMA,15501.94,7525.94,1721.02,6254.98,37299.91,29375,1021.9,5.2,1971-01-01
2,ALABAMA,15972.41,7765.42,1764.75,6442.23,38670.30,31303,1072.3,4.7,1972-01-01
3,ALABAMA,16406.26,7907.66,1742.41,6756.19,40084.01,33430,1135.5,3.9,1973-01-01
4,ALABAMA,16762.67,8025.52,1734.85,7002.29,42057.31,33749,1169.8,5.5,1974-01-01
...,...,...,...,...,...,...,...,...,...,...
811,WYOMING,4731.98,3060.64,408.43,1262.90,27724.96,13056,217.7,5.8,1982-01-01
812,WYOMING,4950.82,3119.98,445.59,1385.25,28586.46,11922,202.5,8.4,1983-01-01
813,WYOMING,5184.73,3195.68,476.57,1512.48,28794.80,12073,204.3,6.3,1984-01-01
814,WYOMING,5448.38,3295.92,523.01,1629.45,29326.94,12022,206.9,7.1,1985-01-01


In [8]:
from anovos.data_analyzer.ts_analyzer import daypart_cat,ts_processed_feats,ts_eligiblity_check,ts_viz_data,ts_analyzer

## ts_processed_feats
- API specification of function **ts_processed_feats** can be found <a href="https://docs.anovos.ai/api/data_analyzer/ts_analyzer.html">here</a>

In [9]:
# Number of distinct STATE ids in the raw input frame.
df.select('STATE').distinct().count()

48

In [10]:
# Derive the row count and the number of distinct ids from the frame that is
# actually passed in, rather than hard-coding 816 / 48, so this cell stays
# correct if the input data changes.
cnt_row = df_preprocess.count()
cnt_unique_id = df_preprocess.select("STATE").distinct().count()

# Add the derived date-part columns for the YR column (day of month, dow,
# week of year, month, quarter, year, daypart_cat, week_cat; see output below).
odf = ts_processed_feats(
    idf=df_preprocess,
    col="YR",
    id_col="STATE",
    tz="local",
    cnt_row=cnt_row,
    cnt_unique_id=cnt_unique_id,
)
odf.toPandas()

                                                                                

Unnamed: 0,STATE,P_CAP,HWY,WATER,UTIL,PC,GSP,EMP,UNEMP,YR,...,YR_dayofmonth,dow,YR_dayofyear,YR_weekofyear,YR_month,YR_quarter,YR_year,yyyymmdd_col,daypart_cat,week_cat
0,ALABAMA,15032.67,7325.80,1655.68,6051.20,35793.80,28418,1010.5,4.7,1970-01-01,...,1,5,1,1,1,1,1970,1970-01-01,late_hours,weekday
1,ALABAMA,15501.94,7525.94,1721.02,6254.98,37299.91,29375,1021.9,5.2,1971-01-01,...,1,6,1,53,1,1,1971,1971-01-01,late_hours,weekend
2,ALABAMA,15972.41,7765.42,1764.75,6442.23,38670.30,31303,1072.3,4.7,1972-01-01,...,1,7,1,52,1,1,1972,1972-01-01,late_hours,weekend
3,ALABAMA,16406.26,7907.66,1742.41,6756.19,40084.01,33430,1135.5,3.9,1973-01-01,...,1,2,1,1,1,1,1973,1973-01-01,late_hours,weekday
4,ALABAMA,16762.67,8025.52,1734.85,7002.29,42057.31,33749,1169.8,5.5,1974-01-01,...,1,3,1,1,1,1,1974,1974-01-01,late_hours,weekday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,WYOMING,4731.98,3060.64,408.43,1262.90,27724.96,13056,217.7,5.8,1982-01-01,...,1,6,1,53,1,1,1982,1982-01-01,late_hours,weekend
812,WYOMING,4950.82,3119.98,445.59,1385.25,28586.46,11922,202.5,8.4,1983-01-01,...,1,7,1,52,1,1,1983,1983-01-01,late_hours,weekend
813,WYOMING,5184.73,3195.68,476.57,1512.48,28794.80,12073,204.3,6.3,1984-01-01,...,1,1,1,52,1,1,1984,1984-01-01,late_hours,weekday
814,WYOMING,5448.38,3295.92,523.01,1629.45,29326.94,12022,206.9,7.1,1985-01-01,...,1,3,1,1,1,1,1985,1985-01-01,late_hours,weekday


## ts_eligiblity_check
- API specification of function **ts_eligiblity_check** can be found <a href="https://docs.anovos.ai/api/data_analyzer/ts_analyzer.html">here</a>

In [11]:
# Only the mandatory arguments are supplied; every other argument keeps its
# default value.
odf1 = ts_eligiblity_check(
    spark,
    idf=odf,
    id_col="STATE",
)

22/11/21 12:01:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:01:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.




22/11/21 12:01:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:01:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


                                                                                

22/11/21 12:01:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:01:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:01:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:01:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


                                                                                

In [12]:
# Display the result returned by ts_eligiblity_check.
odf1

Unnamed: 0,attribute,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
0,id_date_pair,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0
1,date_id_pair,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0


## ts_viz_data
- API specification of function **ts_viz_data** can be found <a href="https://docs.anovos.ai/api/data_analyzer/ts_analyzer.html">here</a>

In [13]:
# Example 1 - UTIL on the Y-axis against YR, with output_type "daily"
odf2 = ts_viz_data(
    idf=odf,
    x_col="YR",
    y_col="UTIL",
    id_col="STATE",
    tz_offset="local",
    output_mode="append",
    output_type="daily",
    n_cat=10,
)

                                                                                

In [14]:
# Display the daily ts_viz_data result for UTIL.
odf2

Unnamed: 0,YR,min,max,mean,median
0,1970-01-01,538.49,67746.79,9004.2975,5070.645
1,1971-01-01,618.38,69046.92,9450.582292,5347.75
2,1972-01-01,670.32,69592.17,9847.487917,5486.9
3,1973-01-01,675.66,71964.82,10224.325833,5625.1
4,1974-01-01,669.0,75762.25,10550.1075,5877.14
5,1975-01-01,713.06,78704.03,10910.621667,6111.205
6,1976-01-01,798.49,80642.95,11209.737917,6437.98
7,1977-01-01,794.41,80728.14,11415.08125,6818.57
8,1978-01-01,774.08,80566.85,11537.139167,7286.55
9,1979-01-01,790.48,79961.03,11699.751042,7816.545


In [15]:
# Example 2 - UTIL on the Y-axis against YR, with output_type "weekly"
odf3 = ts_viz_data(
    idf=odf,
    x_col="YR",
    y_col="UTIL",
    id_col="STATE",
    tz_offset="local",
    output_mode="append",
    output_type="weekly",
    n_cat=10,
)

                                                                                

In [16]:
# Display the weekly ts_viz_data result for UTIL (one row per dow value).
odf3

Unnamed: 0,dow,min,max,mean,median
0,1,774.08,80566.85,11825.344063,7572.94
1,2,675.66,79961.03,10962.038437,6841.47
2,3,669.0,78888.13,11520.208056,7512.32
3,4,713.06,78704.03,11600.567187,7251.84
4,5,538.49,80642.95,10731.324861,6437.98
5,6,618.38,76328.03,10754.948437,6912.56
6,7,670.32,80728.14,11116.748403,6818.57


**Note:** If the dataset contains timestamp columns (rather than dates only), setting `output_type` to "hourly" allows a finer-grained analysis.

## ts_analyzer
- API specification of function **ts_analyzer** can be found <a href="https://docs.anovos.ai/api/data_analyzer/ts_analyzer.html">here</a>

In [17]:
# Example 1 - output_type "daily" with max_days set to 3600; results go to
# the "daily" sub-directory of outputPath.
ts_analyzer(
    spark,
    idf=df_preprocess,
    id_col="STATE",
    max_days=3600,
    output_path=ends_with(outputPath) + ends_with("daily"),
    output_type="daily",
    run_type=run_type,
    auth_key=auth_key,
)


22/11/21 12:02:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.




22/11/21 12:02:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


                                                                                

22/11/21 12:02:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


                                                                                

22/11/21 12:02:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 1

                                                                                

In [18]:
# Values at different percentiles, read back from stats_YR_1.csv in the
# "daily" output directory.
df_ts = read_dataset(
    spark,
    file_path=ends_with(outputPath) + ends_with("daily") + "stats_YR_1.csv",
    file_type="csv",
    file_configs={"header": "True", "delimiter": ","},
)
df_ts.toPandas()

Unnamed: 0,attribute,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
0,id_date_pair,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0
1,date_id_pair,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0


In [19]:
# Statistical properties of the timestamp / date column, read back from
# stats_YR_2.csv in the "daily" output directory.
df_stats = read_dataset(
    spark,
    file_path=ends_with(outputPath) + ends_with("daily") + "stats_YR_2.csv",
    file_type="csv",
    file_configs={"header": "True", "delimiter": ","},
)
df_stats.toPandas()

Unnamed: 0,count_unique_dates,min_date,max_date,modal_date,date_diff,missing_date,mean,variance,stdev,cov
0,17,1970-01-01,1986-01-01,1974-01-01 [48],5844,0,365.25,0.2,0.447,0.001


In [20]:
# Daily output for the GSP column (YR_GSP_daily.csv).
df_daily = read_dataset(
    spark,
    file_path=ends_with(outputPath) + ends_with("daily") + "YR_GSP_daily.csv",
    file_type="csv",
    file_configs={"header": "True", "delimiter": ","},
)
df_daily.toPandas()

Unnamed: 0,YR,min,max,mean,median
0,1970-01-01,4354,263933,48879.645833333336,30677.5
1,1971-01-01,4438,265600,49989.66666666666,31325.5
2,1972-01-01,4611,281159,52682.125,33246.5
3,1973-01-01,4801,293735,55702.708333333336,35065.5
4,1974-01-01,4652,298408,55308.833333333336,34929.5
5,1975-01-01,4528,304518,54403.333333333336,35172.5
6,1976-01-01,4760,320160,56969.270833333336,36570.0
7,1977-01-01,4913,338040,59697.708333333336,38349.0
8,1978-01-01,5363,359603,62773.79166666666,41043.0
9,1979-01-01,5569,374928,64317.4375,43226.0


In [21]:
# Daily output for the WATER column (YR_WATER_daily.csv).
# Note: this rebinds df_daily, which previously held the GSP result.
df_daily = read_dataset(
    spark,
    file_path=ends_with(outputPath) + ends_with("daily") + "YR_WATER_daily.csv",
    file_type="csv",
    file_configs={"header": "True", "delimiter": ","},
)
df_daily.toPandas()

Unnamed: 0,YR,min,max,mean,median
0,1970-01-01,234.23,17837.26,2806.8247916666664,1703.44
1,1971-01-01,230.51,18448.04,2887.0727083333327,1724.11
2,1972-01-01,233.59,18813.82,2960.7177083333336,1784.48
3,1973-01-01,236.98,19084.52,3029.9502083333323,1783.585
4,1974-01-01,228.46,19092.78,3105.404583333333,1817.725
5,1975-01-01,229.86,19654.53,3227.6547916666677,1935.805
6,1976-01-01,249.94,20346.64,3364.4035416666684,2067.145
7,1977-01-01,262.76,21014.45,3497.091666666667,2176.655
8,1978-01-01,255.73,21556.85,3620.605625,2251.625
9,1979-01-01,261.21,22297.31,3782.025624999999,2366.515


In [22]:
# Example 2 - output_type "weekly" with max_days set to 3600; results go to
# the "weekly" sub-directory of outputPath.
ts_analyzer(
    spark,
    idf=df_preprocess,
    id_col="STATE",
    max_days=3600,
    output_path=ends_with(outputPath) + ends_with("weekly"),
    output_type="weekly",
    run_type=run_type,
    auth_key=auth_key,
)
# Shows the input frame that was passed to ts_analyzer (not its result files).
df_preprocess.toPandas()

22/11/21 12:02:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.




22/11/21 12:02:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


                                                                                

22/11/21 12:02:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


                                                                                

22/11/21 12:02:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/11/21 12:02:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


                                                                                

Unnamed: 0,STATE,P_CAP,HWY,WATER,UTIL,PC,GSP,EMP,UNEMP,YR
0,ALABAMA,15032.67,7325.80,1655.68,6051.20,35793.80,28418,1010.5,4.7,1970-01-01
1,ALABAMA,15501.94,7525.94,1721.02,6254.98,37299.91,29375,1021.9,5.2,1971-01-01
2,ALABAMA,15972.41,7765.42,1764.75,6442.23,38670.30,31303,1072.3,4.7,1972-01-01
3,ALABAMA,16406.26,7907.66,1742.41,6756.19,40084.01,33430,1135.5,3.9,1973-01-01
4,ALABAMA,16762.67,8025.52,1734.85,7002.29,42057.31,33749,1169.8,5.5,1974-01-01
...,...,...,...,...,...,...,...,...,...,...
811,WYOMING,4731.98,3060.64,408.43,1262.90,27724.96,13056,217.7,5.8,1982-01-01
812,WYOMING,4950.82,3119.98,445.59,1385.25,28586.46,11922,202.5,8.4,1983-01-01
813,WYOMING,5184.73,3195.68,476.57,1512.48,28794.80,12073,204.3,6.3,1984-01-01
814,WYOMING,5448.38,3295.92,523.01,1629.45,29326.94,12022,206.9,7.1,1985-01-01


In [23]:
# Weekly output for the GSP column (YR_GSP_weekly.csv).
df_weekly = read_dataset(
    spark,
    file_path=ends_with(outputPath) + ends_with("weekly") + "YR_GSP_weekly.csv",
    file_type="csv",
    file_configs={"header": "True", "delimiter": ","},
)
df_weekly.toPandas()

Unnamed: 0,dow,min,max,mean,median
0,1,5363,420525,66544.67708333333,43971.0
1,2,4801,374928,60010.07291666666,40421.0
2,3,4652,444082,64041.506944444445,42844.5
3,4,4528,464550,64930.91666666666,41261.5
4,5,4354,378436,56887.32638888889,36570.0
5,6,4438,372541,56753.364583333336,37202.5
6,7,4611,390528,59326.29166666666,38349.0


In [24]:
# Weekly output for the WATER column (YR_WATER_weekly.csv).
# Note: this rebinds df_weekly, which previously held the GSP result.
df_weekly = read_dataset(
    spark,
    file_path=ends_with(outputPath) + ends_with("weekly") + "YR_WATER_weekly.csv",
    file_type="csv",
    file_configs={"header": "True", "delimiter": ","},
)
df_weekly.toPandas()

Unnamed: 0,dow,min,max,mean,median
0,1,255.73,23841.32,3929.946249999999,2462.5150000000003
1,2,236.98,22297.31,3405.9879166666665,2169.46
2,3,228.46,24121.33,3775.534027777781,2359.285
3,4,229.86,24592.33,3797.104270833335,2467.11
4,5,234.23,23291.36,3410.3939583333326,2159.79
5,6,230.51,23502.43,3515.676770833332,2191.025
6,7,233.59,23828.67,3554.705694444444,2206.17


**Note:** If the dataset contains timestamp columns (rather than dates only), setting `output_type` to "hourly" allows a finer-grained analysis.