# Strict mode: abort on the first failing command (errexit) and treat
# expansion of an unset variable as an error (nounset).
set -o errexit -o nounset
# PySpark needs numpy. pip (to install it), tmux and htop are pulled in
# through apt in the same step.
printf '%s' "Installing numpy .. "
apt-get update
# `-y` only answers apt's own yes/no prompts. DEBIAN_FRONTEND=noninteractive
# additionally keeps package configuration (debconf) questions from blocking
# the run while waiting for input.
DEBIAN_FRONTEND=noninteractive apt-get -y install python3-pip tmux htop
pip3 install numpy
echo "Done."
# Configure Spark to run PySpark with python3, and set PYTHONHASHSEED=0
# (disables Python's hash randomization) for login shells, bash shells,
# Spark's own environment, and the executors.
#
# tee -a must be given regular files: a bare directory such as
# /etc/profile.d/ cannot be opened for appending, which makes tee exit
# non-zero. The exports therefore go to a dedicated profile.d script and to
# Spark's spark-env.sh, plus whatever /etc/*bashrc matches.
printf '%s\n' \
  "export PYSPARK_PYTHON=python3" \
  "export PYTHONHASHSEED=0" \
  | tee -a /etc/profile.d/spark_config.sh /etc/*bashrc /usr/lib/spark/conf/spark-env.sh
# Executors get the hash seed via Spark's defaults file.
echo "spark.executorEnv.PYTHONHASHSEED=0" >> /etc/spark/conf/spark-defaults.conf
# Fetch the taxi-demand-prediction project and move it under /home/atkm.
#
# Required environment:
#   TAXI_DEMAND_REPO_URL - git URL of the taxi-demand-prediction repository.
#
# `git clone` cannot run without a repository argument, so the URL is taken
# from the environment and the script stops with a clear message when it is
# missing.
: "${TAXI_DEMAND_REPO_URL:?set TAXI_DEMAND_REPO_URL to the taxi-demand-prediction git repository URL}"
printf '%s' "Setting up the prediction project .. "
# Name the clone directory explicitly so the mv below does not depend on how
# the URL happens to end.
git clone "${TAXI_DEMAND_REPO_URL}" taxi-demand-prediction
mv taxi-demand-prediction /home/atkm
# Spark jobs should get data from gs://.
#gsutil cp gs://nyc-taxi-8472/yellow_tripdata_2014-{01..12}_tiny.csv ./data/
#gsutil cp gs://nyc-taxi-8472/lga_2014-{01..12}.csv ./data/
echo "Done."