$ pip install ts-flint pyarrow

Download `flint-0.6.0.jar` and `grizzled-slf4j_2.11-1.3.4.jar` from Maven Central.


```
# Location of the local Spark 2.4.4 distribution (machine-specific path; adjust for your environment).
export SPARK_HOME=/Users/abasar/Applications/spark-2.4.4-bin-hadoop2.7
# Python interpreter for PySpark.
export PYSPARK_PYTHON=python3
# Start the driver through Jupyter with the notebook options below instead of a plain Python shell.
export PYSPARK_DRIVER_PYTHON=jupyter
# NOTE(review): ip='*' looks like it exposes the notebook server on every network interface — confirm this is intended.
export PYSPARK_DRIVER_PYTHON_OPTS="notebook --NotebookApp.ip='*' --NotebookApp.port=8888 --NotebookApp.open_browser=False"
# Launch PySpark with the two downloaded jars (Flint and grizzled-slf4j) passed via --jars.
pyspark --jars /Users/abasar/Downloads/flint-0.6.0.jar,/Users/abasar/Downloads/grizzled-slf4j_2.11-1.3.4.jar

```


Flint docs: https://ts-flint.readthedocs.io/en/latest/


Important source code: `~/anaconda3/lib/python3.7/site-packages/ts/flint/dataframe.py`

In [1]:
# `spark` is never assigned in this notebook; presumably it is the session object
# injected by the `pyspark` launcher used to start the kernel — verify.
spark

In [2]:
from pyspark.sql.functions import * 

In [3]:
# Read the stocks CSV, treating the first line as a header and letting Spark
# infer the column types, then keep only the rows for the GE ticker.
# The LIKE pattern contains no wildcards, so it matches the symbol exactly.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Users/abasar/data/stocks.csv")
    .where("symbol like 'GE'")
)

df.show()

+-------------------+-------+-------+-------+-------+---------+---------+------+
|               date|   open|   high|    low|  close|   volume| adjclose|symbol|
+-------------------+-------+-------+-------+-------+---------+---------+------+
|2000-07-17 00:00:00|51.5625|54.4375|51.5625|53.6875|1.83696E7|32.541822|    GE|
|2000-07-18 00:00:00| 53.625|  53.75|  52.25|  52.25|1.19875E7|31.670504|    GE|
|2000-07-19 00:00:00|52.9375|53.3125|52.5625|  52.75|9531300.0|31.973572|    GE|
|2000-07-20 00:00:00|53.1875|  54.75|52.8125|54.3125|1.12205E7|32.920656|    GE|
|2000-07-21 00:00:00| 54.625|  54.75| 53.625| 54.125|1.30197E7|32.807006|    GE|
|2000-07-24 00:00:00|54.1875| 54.625| 53.625|   54.0|9440000.0|32.731239|    GE|
|2000-07-25 00:00:00|   54.0|54.1875|   53.5|53.5625|8467800.0|32.466055|    GE|
|2000-07-26 00:00:00|53.9375|53.9375| 52.125| 52.125|1.67653E7|31.594738|    GE|
|2000-07-27 00:00:00| 52.625|53.0625|  52.25|   52.5|1.37866E7|31.822038|    GE|
|2000-07-28 00:00:00|   52.5

In [4]:
# Open-to-close percentage move for each row, reduced to the two columns used
# from here on: the timestamp (renamed from `date` to `time`) and the return.
df_return = (
    df
    .withColumn("return", expr("(close-open) * 100 / open"))
    .selectExpr("date as time", "return")
)
df_return.show()

+-------------------+-------------------+
|               time|             return|
+-------------------+-------------------+
|2000-07-17 00:00:00|  4.121212121212121|
|2000-07-18 00:00:00|-2.5641025641025643|
|2000-07-19 00:00:00|-0.3541912632821724|
|2000-07-20 00:00:00| 2.1151586368977675|
|2000-07-21 00:00:00|-0.9153318077803204|
|2000-07-24 00:00:00|-0.3460207612456747|
|2000-07-25 00:00:00|-0.8101851851851852|
|2000-07-26 00:00:00|-3.3603707995365006|
|2000-07-27 00:00:00|-0.2375296912114014|
|2000-07-28 00:00:00|-2.9761904761904763|
|2000-07-31 00:00:00|0.12106537530266344|
|2000-08-01 00:00:00| 1.5643802647412757|
|2000-08-02 00:00:00| -0.834326579261025|
|2000-08-03 00:00:00|-1.8561484918793503|
|2000-08-04 00:00:00|  2.891566265060241|
|2000-08-07 00:00:00| 0.9592326139088729|
|2000-08-08 00:00:00| 1.2956419316843346|
|2000-08-09 00:00:00|   3.82830626450116|
|2000-08-10 00:00:00| 2.0134228187919465|
|2000-08-11 00:00:00|  0.779510022271715|
+-------------------+-------------

In [5]:
from ts.flint import FlintContext, windows

# Flint entry point built on the existing `sqlContext`
# (not defined in this notebook; presumably supplied by the pyspark launcher — verify).
flintContext = FlintContext(sqlContext)

In [6]:
# Wrap the Spark DataFrame through the FlintContext reader; the following cell's
# output reports the result as ts.flint.dataframe.TimeSeriesDataFrame.
# NOTE(review): this rebinds `df_return`, so after this cell the name no longer
# refers to the plain DataFrame built earlier.
df_return = flintContext.read.dataframe(df_return)

In [7]:
# Sanity check: show which class `df_return` is after the Flint conversion.
type(df_return)

ts.flint.dataframe.TimeSeriesDataFrame

In [8]:
# Move every timestamp one calendar day forward so that each return lines up
# with the following date, and rename the value column accordingly.
# The shift is by calendar days, so a Friday return ends up on a Saturday timestamp.
one_day_forward = windows.future_absolute_time('1day')
df_prev_day_return = df_return.shiftTime(one_day_forward).toDF("time", "return_prev_day")

In [9]:
# Preview the shifted series: timestamps moved one day forward, values unchanged.
df_prev_day_return.show()

+-------------------+-------------------+
|               time|    return_prev_day|
+-------------------+-------------------+
|2000-07-18 00:00:00|  4.121212121212121|
|2000-07-19 00:00:00|-2.5641025641025643|
|2000-07-20 00:00:00|-0.3541912632821724|
|2000-07-21 00:00:00| 2.1151586368977675|
|2000-07-22 00:00:00|-0.9153318077803204|
|2000-07-25 00:00:00|-0.3460207612456747|
|2000-07-26 00:00:00|-0.8101851851851852|
|2000-07-27 00:00:00|-3.3603707995365006|
|2000-07-28 00:00:00|-0.2375296912114014|
|2000-07-29 00:00:00|-2.9761904761904763|
|2000-08-01 00:00:00|0.12106537530266344|
|2000-08-02 00:00:00| 1.5643802647412757|
|2000-08-03 00:00:00| -0.834326579261025|
|2000-08-04 00:00:00|-1.8561484918793503|
|2000-08-05 00:00:00|  2.891566265060241|
|2000-08-08 00:00:00| 0.9592326139088729|
|2000-08-09 00:00:00| 1.2956419316843346|
|2000-08-10 00:00:00|   3.82830626450116|
|2000-08-11 00:00:00| 2.0134228187919465|
|2000-08-12 00:00:00|  0.779510022271715|
+-------------------+-------------

In [10]:
# Left join without a tolerance: a row only receives a previous-day value when the
# shifted series has a matching timestamp, which leaves Mondays null in the output
# (Friday's return was shifted onto Saturday).
df_joined = df_return.leftJoin(df_prev_day_return, key="time")

# Add the abbreviated weekday name purely for display, to make the Monday gaps visible.
weekday = expr("date_format(time, 'E')")
df_joined.withColumn("day", weekday).show()

+-------------------+-------------------+-------------------+---+
|               time|             return|    return_prev_day|day|
+-------------------+-------------------+-------------------+---+
|2000-07-17 00:00:00|  4.121212121212121|               null|Mon|
|2000-07-18 00:00:00|-2.5641025641025643|  4.121212121212121|Tue|
|2000-07-19 00:00:00|-0.3541912632821724|-2.5641025641025643|Wed|
|2000-07-20 00:00:00| 2.1151586368977675|-0.3541912632821724|Thu|
|2000-07-21 00:00:00|-0.9153318077803204| 2.1151586368977675|Fri|
|2000-07-24 00:00:00|-0.3460207612456747|               null|Mon|
|2000-07-25 00:00:00|-0.8101851851851852|-0.3460207612456747|Tue|
|2000-07-26 00:00:00|-3.3603707995365006|-0.8101851851851852|Wed|
|2000-07-27 00:00:00|-0.2375296912114014|-3.3603707995365006|Thu|
|2000-07-28 00:00:00|-2.9761904761904763|-0.2375296912114014|Fri|
|2000-07-31 00:00:00|0.12106537530266344|               null|Mon|
|2000-08-01 00:00:00| 1.5643802647412757|0.12106537530266344|Tue|
|2000-08-0

In [11]:
# Left join with a 3-day tolerance: per the output, Mondays now pick up the value
# shifted from the preceding Friday instead of null.
df_joined = df_return.leftJoin(df_prev_day_return, tolerance="3days")

# Add the abbreviated weekday name purely for display.
weekday = expr("date_format(time, 'E')")
df_joined.withColumn("day", weekday).show()

+-------------------+-------------------+-------------------+---+
|               time|             return|    return_prev_day|day|
+-------------------+-------------------+-------------------+---+
|2000-07-17 00:00:00|  4.121212121212121|               null|Mon|
|2000-07-18 00:00:00|-2.5641025641025643|  4.121212121212121|Tue|
|2000-07-19 00:00:00|-0.3541912632821724|-2.5641025641025643|Wed|
|2000-07-20 00:00:00| 2.1151586368977675|-0.3541912632821724|Thu|
|2000-07-21 00:00:00|-0.9153318077803204| 2.1151586368977675|Fri|
|2000-07-24 00:00:00|-0.3460207612456747|-0.9153318077803204|Mon|
|2000-07-25 00:00:00|-0.8101851851851852|-0.3460207612456747|Tue|
|2000-07-26 00:00:00|-3.3603707995365006|-0.8101851851851852|Wed|
|2000-07-27 00:00:00|-0.2375296912114014|-3.3603707995365006|Thu|
|2000-07-28 00:00:00|-2.9761904761904763|-0.2375296912114014|Fri|
|2000-07-31 00:00:00|0.12106537530266344|-2.9761904761904763|Mon|
|2000-08-01 00:00:00| 1.5643802647412757|0.12106537530266344|Tue|
|2000-08-0

In [12]:
# Discard rows that still contain a null, e.g. the very first date, which has no
# earlier return to pair with.
# NOTE(review): rebinding `df_joined` to its own filtered result hides the
# unfiltered join; a distinct name would make the lineage clearer.
df_joined = df_joined.dropna()
df_joined.show()

+-------------------+-------------------+-------------------+
|               time|             return|    return_prev_day|
+-------------------+-------------------+-------------------+
|2000-07-18 00:00:00|-2.5641025641025643|  4.121212121212121|
|2000-07-19 00:00:00|-0.3541912632821724|-2.5641025641025643|
|2000-07-20 00:00:00| 2.1151586368977675|-0.3541912632821724|
|2000-07-21 00:00:00|-0.9153318077803204| 2.1151586368977675|
|2000-07-24 00:00:00|-0.3460207612456747|-0.9153318077803204|
|2000-07-25 00:00:00|-0.8101851851851852|-0.3460207612456747|
|2000-07-26 00:00:00|-3.3603707995365006|-0.8101851851851852|
|2000-07-27 00:00:00|-0.2375296912114014|-3.3603707995365006|
|2000-07-28 00:00:00|-2.9761904761904763|-0.2375296912114014|
|2000-07-31 00:00:00|0.12106537530266344|-2.9761904761904763|
|2000-08-01 00:00:00| 1.5643802647412757|0.12106537530266344|
|2000-08-02 00:00:00| -0.834326579261025| 1.5643802647412757|
|2000-08-03 00:00:00|-1.8561484918793503| -0.834326579261025|
|2000-08

### Built-in summarizer

In [13]:
from ts.flint import summarizers

In [19]:
# Summarize each row's trailing 7-day window with Flint's built-in EWMA
# summarizer over `return_prev_day`; the result appears as the new column
# `return_prev_day_ewma`.
week_lookback = windows.past_absolute_time("7days")
prev_day_ewma = summarizers.ewma("return_prev_day", alpha=0.5)

df_decayed_return = df_joined.summarizeWindows(window=week_lookback, summarizer=prev_day_ewma)

df_decayed_return.show()

+-------------------+-------------------+-------------------+--------------------+
|               time|             return|    return_prev_day|return_prev_day_ewma|
+-------------------+-------------------+-------------------+--------------------+
|2000-07-18 00:00:00|-2.5641025641025643|  4.121212121212121|   4.121212121212121|
|2000-07-19 00:00:00|-0.3541912632821724|-2.5641025641025643| -0.5034965034965038|
|2000-07-20 00:00:00| 2.1151586368977675|-0.3541912632821724| -0.6059395150304243|
|2000-07-21 00:00:00|-0.9153318077803204| 2.1151586368977675|  1.8121888793825554|
|2000-07-24 00:00:00|-0.3460207612456747|-0.9153318077803204| -0.6888081978575009|
|2000-07-25 00:00:00|-0.8101851851851852|-0.3460207612456747| -0.6904248601744252|
|2000-07-26 00:00:00|-3.3603707995365006|-0.8101851851851852| -1.1714961001208826|
|2000-07-27 00:00:00|-0.2375296912114014|-3.3603707995365006| -3.9361028239559164|
|2000-07-28 00:00:00|-2.9761904761904763|-0.2375296912114014| -2.2041975435671635|
|200

In [15]:
# Project just the previous-day return column and preview it.
df_joined.select("return_prev_day").show()

+-------------------+
|    return_prev_day|
+-------------------+
|  4.121212121212121|
|-2.5641025641025643|
|-0.3541912632821724|
| 2.1151586368977675|
|-0.9153318077803204|
|-0.3460207612456747|
|-0.8101851851851852|
|-3.3603707995365006|
|-0.2375296912114014|
|-2.9761904761904763|
|0.12106537530266344|
| 1.5643802647412757|
| -0.834326579261025|
|-1.8561484918793503|
|  2.891566265060241|
| 0.9592326139088729|
| 1.2956419316843346|
|   3.82830626450116|
| 2.0134228187919465|
|  0.779510022271715|
+-------------------+
only showing top 20 rows



In [16]:
import numpy as np

### User-defined summarizer

In [18]:
# Disabled experiment: the user-defined summarizer below is wrapped in a string
# literal, so none of it executes — the cell only evaluates (and echoes) the string.
# NOTE(review): the reason it was disabled is not recorded here; either restore it
# as real code or delete the cell.
"""
from ts.flint import udf

@udf('double', arg_type='numpy')
def decayed(columns): 
    v = columns[0]
    decay = np.power(0.5, np.arange(len(v)))[::-1]
    return (v * decay).sum()



df_decayed_return = df_joined.summarizeWindows(window= windows.past_absolute_time("7day"),  
                           summarizer= {"return_ewm": decayed(df_joined[["return_prev_day"]])})

df_decayed_return.show()

"""

'\nfrom ts.flint import udf\n\n@udf(\'double\', arg_type=\'numpy\')\ndef decayed(columns): \n    v = columns[0]\n    decay = np.power(0.5, np.arange(len(v)))[::-1]\n    return (v * decay).sum()\n\n\n\ndf_decayed_return = df_joined.summarizeWindows(window= windows.past_absolute_time("7day"),  \n                           summarizer= {"return_ewm": decayed(df_joined[["return_prev_day"]])})\n\ndf_decayed_return.show()\n\n'

In [18]:
from ts.flint import FlintContext, clocks
from ts.flint import utils

In [23]:
# Build a clock that ticks every 30 seconds across the given interval and inspect it.
# NOTE(review): the displayed times are 5h30m later than the begin/end arguments,
# so the inputs and the rendering appear to use different time zones — confirm
# which zone each side uses.
l = clocks.uniform(
    flintContext,
    '30s',
    begin_date_time='2018-8-1 5:55:35',
    end_date_time='2018-08-01 05:59:05',
)
print(type(l))
l.show()

<class 'ts.flint.dataframe.TimeSeriesDataFrame'>
+-------------------+
|               time|
+-------------------+
|2018-08-01 11:25:35|
|2018-08-01 11:26:05|
|2018-08-01 11:26:35|
|2018-08-01 11:27:05|
|2018-08-01 11:27:35|
|2018-08-01 11:28:05|
|2018-08-01 11:28:35|
|2018-08-01 11:29:05|
+-------------------+



In [24]:
# Daily clock covering calendar year 2020; only the default first 20 ticks are printed.
clocks.uniform(
    flintContext,
    '1d',
    begin_date_time='2020-1-1',
    end_date_time='2020-12-31',
).show()

+-------------------+
|               time|
+-------------------+
|2020-01-01 05:30:00|
|2020-01-02 05:30:00|
|2020-01-03 05:30:00|
|2020-01-04 05:30:00|
|2020-01-05 05:30:00|
|2020-01-06 05:30:00|
|2020-01-07 05:30:00|
|2020-01-08 05:30:00|
|2020-01-09 05:30:00|
|2020-01-10 05:30:00|
|2020-01-11 05:30:00|
|2020-01-12 05:30:00|
|2020-01-13 05:30:00|
|2020-01-14 05:30:00|
|2020-01-15 05:30:00|
|2020-01-16 05:30:00|
|2020-01-17 05:30:00|
|2020-01-18 05:30:00|
|2020-01-19 05:30:00|
|2020-01-20 05:30:00|
+-------------------+
only showing top 20 rows



In [20]:
# Re-display the EWMA-augmented frame that feeds the regression below.
df_decayed_return.show()

+-------------------+-------------------+-------------------+--------------------+
|               time|             return|    return_prev_day|return_prev_day_ewma|
+-------------------+-------------------+-------------------+--------------------+
|2000-07-18 00:00:00|-2.5641025641025643|  4.121212121212121|   4.121212121212121|
|2000-07-19 00:00:00|-0.3541912632821724|-2.5641025641025643| -0.5034965034965038|
|2000-07-20 00:00:00| 2.1151586368977675|-0.3541912632821724| -0.6059395150304243|
|2000-07-21 00:00:00|-0.9153318077803204| 2.1151586368977675|  1.8121888793825554|
|2000-07-24 00:00:00|-0.3460207612456747|-0.9153318077803204| -0.6888081978575009|
|2000-07-25 00:00:00|-0.8101851851851852|-0.3460207612456747| -0.6904248601744252|
|2000-07-26 00:00:00|-3.3603707995365006|-0.8101851851851852| -1.1714961001208826|
|2000-07-27 00:00:00|-0.2375296912114014|-3.3603707995365006| -3.9361028239559164|
|2000-07-28 00:00:00|-2.9761904761904763|-0.2375296912114014| -2.2041975435671635|
|200

In [21]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Pack the two lagged-return signals into the single vector column Spark ML expects.
feature_columns = ["return_prev_day", "return_prev_day_ewma"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Training frame: the current return becomes `label`, next to the feature vector.
assembled = assembler.transform(df_decayed_return)
output = assembled.select('return', 'features').toDF('label', 'features')

# Elastic-net regularised linear regression, fitted on the full frame (no train/test split).
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
model = lr.fit(output)

In [23]:
# Score the training rows with the fitted model.
# NOTE(review): every visible prediction in the output is the same value, which
# suggests the fitted coefficients are zero (intercept-only model) — check
# model.coefficients and the regularisation settings.
model.transform(output).show()

+-------------------+--------------------+--------------------+
|              label|            features|          prediction|
+-------------------+--------------------+--------------------+
|-2.5641025641025643|[4.12121212121212...|-0.04918970500553462|
|-0.3541912632821724|[-2.5641025641025...|-0.04918970500553462|
| 2.1151586368977675|[-0.3541912632821...|-0.04918970500553462|
|-0.9153318077803204|[2.11515863689776...|-0.04918970500553462|
|-0.3460207612456747|[-0.9153318077803...|-0.04918970500553462|
|-0.8101851851851852|[-0.3460207612456...|-0.04918970500553462|
|-3.3603707995365006|[-0.8101851851851...|-0.04918970500553462|
|-0.2375296912114014|[-3.3603707995365...|-0.04918970500553462|
|-2.9761904761904763|[-0.2375296912114...|-0.04918970500553462|
|0.12106537530266344|[-2.9761904761904...|-0.04918970500553462|
| 1.5643802647412757|[0.12106537530266...|-0.04918970500553462|
| -0.834326579261025|[1.56438026474127...|-0.04918970500553462|
|-1.8561484918793503|[-0.8343265792610..

In [29]:
# Attach to each row the rows inside its trailing 5-day window, then print the
# first 10 rows without truncating the wide window column.
(
    df_return
    .addWindows(windows.past_absolute_time('5d'))
    .show(n=10, truncate=False)
)

+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|time               |return             |window_past_5d                                                                                                                                                                                                              |
+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|2000-07-17 00:00:00|4.121212121212121  |[[2000-07-17 00:00:00, 4.121212121212121]]                                                                                                                                