In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from google.cloud import storage
from pyspark.sql.functions import lit
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql.functions import *
from datetime import datetime
from pyspark.sql.window import Window

In [5]:
# Build the Spark session with all resource settings supplied up front.
# Values written to spark.sparkContext._conf after getOrCreate() are recorded
# on the conf object but are not applied to the context that is already
# running, so they are passed to the builder instead.
# The application name stays 'Nifty50' (the name the session was actually
# started with); the later 'Spark Updated Conf' rename never took effect.
# NOTE(review): if a session already exists in this kernel, getOrCreate()
# returns it and launch-time settings are ignored — run on a fresh kernel.
# NOTE(review): spark.driver.maxResultSize (70g) is larger than
# spark.driver.memory (30g); values are kept as written — confirm intent.
spark = (
    SparkSession.builder
    .appName('Nifty50')
    .config("spark.jars", "hdfs://maskxdc/test/flint-0.6.0.jar")
    .config('spark.executor.memory', '30g')
    .config('spark.executor.cores', '4')
    .config('spark.cores.max', '4')
    .config('spark.driver.memory', '30g')
    .config("spark.driver.maxResultSize", "70g")
    .getOrCreate()
)

# Kept so that any later cell referencing `conf` still finds a SparkConf.
conf = spark.sparkContext.getConf()

# Google Cloud Storage client (uses the environment's default credentials).
gcs_client = storage.Client()

In [6]:
# Name of the GCS bucket used to build the gs:// paths in the cells below.
bucket_name = 'bigdata_nifty50'

In [7]:
# Bucket handle from the google-cloud-storage client. The Spark reads/writes
# in the visible cells address the bucket through gs:// URIs, not this handle.
bucket = gcs_client.bucket(bucket_name)

In [8]:
# Object name of the input CSV inside the bucket.
file_name = 'combined.csv'

In [9]:
# Load the combined CSV from GCS, letting Spark infer column types.
# No header option is passed, so the columns get Spark's default names and are
# renamed positionally in a later cell.
# A single '/' separates bucket and object so the URI has the same
# gs://bucket/object form as the write path used by this notebook.
df = spark.read.csv('gs://{}/{}'.format(bucket_name, file_name), inferSchema=True)

                                                                                

In [10]:
# Column names applied positionally to the headerless combined CSV.
# The order must match the column order of the file; related names are kept
# on one line purely for readability.
new_columns = [
    # price / volume
    'date', 'close', 'high', 'low', 'open', 'volume',
    # moving averages and bands
    'sma5', 'sma10', 'sma15', 'sma20',
    'ema5', 'ema10', 'ema15', 'ema20',
    'upperband', 'middleband', 'lowerband',
    'HT_TRENDLINE',
    'KAMA10', 'KAMA20', 'KAMA30',
    'SAR',
    'TRIMA5', 'TRIMA10', 'TRIMA20',
    # remaining indicator columns
    'ADX5', 'ADX10', 'ADX20',
    'APO',
    'CCI5', 'CCI10', 'CCI15',
    'macd510', 'macd520', 'macd1020', 'macd1520', 'macd1226',
    'MFI',
    'MOM10', 'MOM15', 'MOM20',
    'ROC5', 'ROC10', 'ROC20',
    'PPO',
    'RSI14', 'RSI8',
    'slowk', 'slowd', 'fastk', 'fastd', 'fastksr', 'fastdsr',
    'ULTOSC', 'WILLR',
    'ATR', 'Trange', 'TYPPRICE', 'HT_DCPERIOD', 'BETA',
    # labels
    'sector', 'company',
]

In [11]:
# Names Spark assigned when reading the headerless CSV.
old_columns = df.schema.names

# Rename all columns positionally in a single projection rather than chaining
# one withColumnRenamed call per column (reduce is no longer needed here; it
# is already imported in the notebook's import cell).
if len(old_columns) > len(new_columns):
    # Previously this surfaced as an opaque IndexError from inside the lambda.
    raise ValueError(
        'df has {} columns but new_columns defines only {} names'.format(
            len(old_columns), len(new_columns)
        )
    )

# When df has fewer columns than names, only the leading names are used,
# which matches the earlier behaviour.
df = df.toDF(*new_columns[:len(old_columns)])

In [63]:
# Folder and object name of the indicators CSV inside the bucket.
indicators_folder = 'Indicators'
indicators_file = 'Indicators.csv'

# header=True: the first row of the file supplies the column names.
# Single '/' separators keep the URI in the same gs://bucket/path form as the
# write path used by this notebook.
indicators_df = spark.read.csv(
    'gs://{}/{}/{}'.format(bucket_name, indicators_folder, indicators_file),
    inferSchema=True,
    header=True,
)

In [66]:
# One-row frame holding the mean of every column except 'DateTime'.
stats = indicators_df.agg(*[
    avg(c).alias(c) for c in indicators_df.columns if c not in ['DateTime']
])

# avg() is null for a column with no numeric values, and na.fill only accepts
# int/float/str/bool replacement values, so null means are left out of the
# fill map instead of failing the whole fill.
fill_values = {
    name: mean
    for name, mean in stats.first().asDict().items()
    if mean is not None
}

# Replace missing indicator values with the column mean.
indicators_df = indicators_df.na.fill(fill_values)

In [71]:
from pyspark.sql.types import DateType


def get_month(date):
    """Return the start of the month that ``date`` falls in.

    Args:
        date: a ``datetime.date`` / ``datetime.datetime`` (any object exposing
            ``year`` and ``month``), or ``None``.

    Returns:
        A ``datetime`` for day 1 of that year and month at 00:00, or ``None``
        when ``date`` is ``None`` so that null rows do not raise inside the
        UDF.
    """
    if date is None:
        return None
    # Build the value directly instead of formatting and re-parsing a string
    # for every row.
    return datetime(date.year, date.month, 1)


# Spark UDF wrapper around get_month, declared to return DateType.
udf_get_month = udf(get_month, DateType())

In [72]:
# Left-join each price row to the indicators row(s) of the same calendar month.
# trunc(..., 'month') is Spark's built-in month truncation: it yields the first
# day of the month, as get_month does, but is evaluated by Spark itself rather
# than through a Python UDF and returns null for null input instead of raising.
combined_df = df.join(
    indicators_df,
    trunc(df.date, 'month') == trunc(indicators_df.DateTime, 'month'),
    how='left_outer',
)

In [76]:
# Drop the 'DateTime' column (the indicators-side column used in the join condition).
combined_df = combined_df.drop('DateTime')

In [None]:
# Write the joined frame to the bucket as CSV.
# NOTE(review): no header or save-mode option is passed — presumably the output
# carries no header row and a re-run fails if the path already exists; confirm
# this is the intended convention before relying on the output.
combined_df.write.csv('gs://{}/{}'.format(bucket_name, 'df_with_indicators.csv'))

                                                                                