In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth, hour, minute, sin, cos, radians
from pyspark.ml.feature import StandardScaler, MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import pyspark.sql.functions as F
import os

# A function to run commands
def run(command):
    return os.popen(command).read()


team = 29

# location of your Hive database in HDFS
warehouse = "project/hive/warehouse"

spark = SparkSession.builder\
        .appName("{} - spark ML".format(team))\
        .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")\
        .config("spark.sql.warehouse.dir", warehouse)\
        .config("spark.sql.avro.compression.codec", "snappy")\
        .enableHiveSupport()\
        .getOrCreate()


table_name = "team29_projectdb.dataset_prepared_for_modeling"
df = spark.read.table(table_name)


print("Shape of DataFrame: ({}, {})".format(df.count(), len(df.columns)))

Shape of DataFrame: (771456, 39)


In [2]:
df.printSchema()

root
 |-- 3: integer (nullable = true)
 |-- 4: integer (nullable = true)
 |-- 8: integer (nullable = true)
 |-- 6: integer (nullable = true)
 |-- 5: integer (nullable = true)
 |-- 7: integer (nullable = true)
 |-- 1: integer (nullable = true)
 |-- 2: integer (nullable = true)
 |-- total_snow: decimal(10,5) (nullable = true)
 |-- sun_hour_Scaled: double (nullable = true)
 |-- uv_index_1_Scaled: double (nullable = true)
 |-- uv_index_2_Scaled: double (nullable = true)
 |-- moon_illumunation_Scaled: double (nullable = true)
 |-- dew_point_Scaled: double (nullable = true)
 |-- wind_gust_Scaled: double (nullable = true)
 |-- cloudcover_Scaled: double (nullable = true)
 |-- humidity_Scaled: double (nullable = true)
 |-- visibility_Scaled: double (nullable = true)
 |-- wind_speed_Scaled: double (nullable = true)
 |-- max_temp_Scaled: double (nullable = true)
 |-- min_temp_Scaled: double (nullable = true)
 |-- feels_like_Scaled: double (nullable = true)
 |-- heat_index_Scaled: double (nullable

In [3]:
CITY_TO_PREDICT = 1


# Get only rows with column "1" is equal to 1
city_df = df.filter(col(f"{CITY_TO_PREDICT}") == CITY_TO_PREDICT)

# Drop the city encoded columns
city_df = city_df.drop("1", "2", "3", "4", "5", "6", "7", "8")

# Create an ID column
city_df = city_df.withColumn("ID", F.monotonically_increasing_id())

# show the shape
print("Shape of DataFrame: ({}, {})".format(city_df.count(), len(city_df.columns)))

Shape of DataFrame: (96432, 32)


In [4]:
# create target and feature dataframe

# Target dataframe is all except first row
city_target = city_df.filter(city_df.ID != 0)
# Update ID to start from zero (id - 1)
city_target = city_target.withColumn("ID", city_target["ID"] - 1)
# Add "target" to all column names except "ID"
city_target = city_target.toDF(*["target_" + c if c != "ID" else c for c in city_target.columns])

# feature dataframe is all except last row
city_feature = city_df.filter(city_df.ID != city_df.count() - 1)
# Add "feature" to all column names except "ID"
city_feature = city_feature.toDF(*["feature_" + c if c != "ID" else c for c in city_feature.columns])

# show the shapes are similar
print("Shape of target DataFrame: ({}, {})".format(city_target.count(), len(city_target.columns)))
print("Shape of feature DataFrame: ({}, {})".format(city_feature.count(), len(city_feature.columns)))

Shape of target DataFrame: (96432, 32)
Shape of feature DataFrame: (96432, 32)


In [5]:
# Merge into a single dataframe on "ID"
city_merged = city_target.join(city_feature, "ID", "inner")

# Show the shape
print("Shape of DataFrame: ({}, {})".format(city_merged.count(), len(city_merged.columns)))

Shape of DataFrame: (96431, 63)


# Define ensamble class model

In [6]:
from pyspark.ml.regression import GBTRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from tqdm import tqdm



class Multitarget_Model:
    def __init__(self, feature_columns, target_columns, model_type):
        self.feature_columns = feature_columns
        self.target_columns = target_columns
        self.models = []
        self.best_model = None
        self.model_type = model_type


    def fit(self, df, cv=False, cv_folds=3):
        for target in tqdm(self.target_columns, desc="Training...", unit="model"):
            # Define the assembler
            assembler = VectorAssembler(inputCols=self.feature_columns, outputCol="features")

            # Define the primary model
            if self.model_type == "GBT":
                primary_model = GBTRegressor(featuresCol="features", labelCol=target, maxIter=10)
            elif self.model_type == "RF":
                primary_model = RandomForestRegressor(featuresCol="features", labelCol=target)
            else:
                raise ValueError("Model type not supported")

            # Define the pipeline
            pipeline = Pipeline(stages=[assembler, primary_model])


            if cv:
                # Define the evaluator
                evaluator = RegressionEvaluator(labelCol=target, predictionCol="prediction", metricName="rmse")

                # Define the parameter grid
                if self.model_type == "GBT":
                    paramGrid = ParamGridBuilder() \
                        .addGrid(primary_model.maxDepth, [1, 3]) \
                        .addGrid(primary_model.maxBins, [5, 10]) \
                        .build()
                elif self.model_type == "RF":
                    paramGrid = ParamGridBuilder() \
                        .addGrid(primary_model.maxDepth, [1, 3]) \
                        .addGrid(primary_model.numTrees, [5, 10]) \
                        .build()
                else:
                    raise ValueError("Model type not supported")

                # Define the cross-validator
                crossval = CrossValidator(estimator=pipeline,
                                        estimatorParamMaps=paramGrid,
                                        evaluator=evaluator,
                                        numFolds=cv_folds)

                # Fit the model
                model = crossval.fit(df)

            else:
                # Fit the model
                model = pipeline.fit(df)


            # Append the model to the list of models
            self.models.append(model)


    def evaluate(self, df, metric):

        # Create a dictionary to store the results and fill it with zeros
        results = {target: 0 for target in self.target_columns}

        for i, model in tqdm(enumerate(self.models), desc="Evaluating...", unit="model"):
            # Get the target column
            target_col = self.target_columns[i]

            # Get the predictions
            predictions = model.transform(df)

            # Get the evaluator
            evaluator = RegressionEvaluator(labelCol=target_col, predictionCol="prediction", metricName=metric)

            # Evaluate the model
            result = evaluator.evaluate(predictions)

            # Store the result
            results[target_col] = result

        return results


    def predict(self, df):
        # Create an empty dataframe to store the predictions
        predictions = None

        for i, model in tqdm(enumerate(self.models), desc="Predicting... ", unit="model"):

            # Get the predictions
            prediction = model.transform(df)

            # Select only the prediction column
            prediction = prediction.select("ID", "prediction")

            # Rename the prediction column to the target column
            name = self.target_columns[i].replace("target_", "prediction_")
            prediction = prediction.withColumnRenamed("prediction", name)

            # If the predictions dataframe is empty, assign the prediction dataframe to it
            if predictions is None:
                predictions = prediction

            # Otherwise, join the prediction dataframe with the predictions dataframe
            else:
                predictions = predictions.join(prediction, "ID", "inner")

        return predictions


    def save(self):
        # Save the models in location like project/models/model1/
        for i, model in enumerate(self.models):
            model.write().overwrite().save(f"project/models/{self.model_type}/model_{i}")

            # Run it from root directory of the repository
            run(f"hdfs dfs -get project/models/{self.model_type}/model_{i} models/{self.model_type}/model_{i}")

# Split Data

In [7]:
# Define the feature and target columns
feature_columns = [c for c in city_merged.columns if c.startswith("feature_")]
target_columns = [c for c in city_merged.columns if c.startswith("target_")]

In [8]:
# split the data
(train, test) = city_merged.randomSplit([0.8, 0.2], seed=10)

train.coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("json")\
    .save("project/data/train")

# Run it from root directory of the repository
run("hdfs dfs -cat project/data/train/*.json > data/train.json")

test.coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("json")\
    .save("project/data/test")

# Run it from root directory of the repository
run("hdfs dfs -cat project/data/test/*.json > data/test.json")

''

# GBT
### Train and evaluate WITHOUT grid search

In [9]:
# Initialize the model
model = Multitarget_Model(feature_columns, target_columns, model_type="GBT")

# Fit the model
model.fit(train, cv=False)

Training...:   3%|▎         | 1/31 [00:06<03:22,  6.76s/model]ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.6/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib64/python3.6/socket.py", line 586, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt
Training...:   3%|▎         | 1/31 [00:10<05:01, 10.04s/model]


KeyboardInterrupt: 

In [9]:
# Evaluate the model
results = model.evaluate(test, "rmse")

# Print the results
for target, result in results.items():
    print("RMSE for {}: {}".format(target, result))

Evaluating...: 31model [00:36,  1.19s/model]

RMSE for target_total_snow: 0.0
RMSE for target_sun_hour_Scaled: 0.026182026487622724
RMSE for target_uv_index_1_Scaled: 0.01260305811759245
RMSE for target_uv_index_2_Scaled: 0.04214502104853706
RMSE for target_moon_illumunation_Scaled: 0.016505625942626733
RMSE for target_dew_point_Scaled: 0.010921582415567522
RMSE for target_wind_gust_Scaled: 0.018378390356788513
RMSE for target_cloudcover_Scaled: 0.061528338577819856
RMSE for target_humidity_Scaled: 0.029466735717109137
RMSE for target_visibility_Scaled: 0.02925380377675453
RMSE for target_wind_speed_Scaled: 0.023030548146921806
RMSE for target_max_temp_Scaled: 0.05855009417596629
RMSE for target_min_temp_Scaled: 0.03277565515907261
RMSE for target_feels_like_Scaled: 0.10329420392968482
RMSE for target_heat_index_Scaled: 0.09341461777778245
RMSE for target_wind_chill_Scaled: 0.108053689044366
RMSE for target_precip_Scaled: 0.48386827359314555
RMSE for target_pressure_Scaled: 0.08299056304129726
RMSE for target_temp_Scaled: 0.106968




In [10]:
# Evaluate the model
results = model.evaluate(test, "r2")

# Print the results
for target, result in results.items():
    print("R2 for {}: {}".format(target, result))

Evaluating...: 31model [00:36,  1.19s/model]

R2 for target_total_snow: nan
R2 for target_sun_hour_Scaled: 0.9820915322735176
R2 for target_uv_index_1_Scaled: 0.9859657542958703
R2 for target_uv_index_2_Scaled: 0.9783897638160198
R2 for target_moon_illumunation_Scaled: 0.9972236301623096
R2 for target_dew_point_Scaled: 0.9804318620412646
R2 for target_wind_gust_Scaled: 0.955147959081337
R2 for target_cloudcover_Scaled: 0.9528700538471728
R2 for target_humidity_Scaled: 0.9824230797699031
R2 for target_visibility_Scaled: 0.9297250397436234
R2 for target_wind_speed_Scaled: 0.9636382103017538
R2 for target_max_temp_Scaled: 0.9910085205402965
R2 for target_min_temp_Scaled: 0.9951687710194171
R2 for target_feels_like_Scaled: 0.9720530041780374
R2 for target_heat_index_Scaled: 0.9765172901041342
R2 for target_wind_chill_Scaled: 0.9750301829596854
R2 for target_precip_Scaled: 0.7067012846033601
R2 for target_pressure_Scaled: 0.9770583518294219
R2 for target_temp_Scaled: 0.9747707142435412
R2 for target_wind_dir_sin: 0.9688641409438953
R2 




### Train and evaluate WITH grid search

In [11]:
# Initialize the model
model = Multitarget_Model(feature_columns, target_columns, model_type="GBT")

# Fit the model
model.fit(train, cv=True, cv_folds=2)

Training...: 100%|██████████| 31/31 [14:14<00:00, 27.58s/model]


In [12]:
# Evaluate the model
rmse_GBT = model.evaluate(test, "rmse")

# Print the results
for target, result in rmse_GBT.items():
    print("RMSE for {}: {}".format(target, result))

Evaluating...: 31model [00:35,  1.15s/model]

RMSE for target_total_snow: 0.0
RMSE for target_sun_hour_Scaled: 0.03883539025395263
RMSE for target_uv_index_1_Scaled: 0.013874858665257646
RMSE for target_uv_index_2_Scaled: 0.07710357196400472
RMSE for target_moon_illumunation_Scaled: 0.03387849735699749
RMSE for target_dew_point_Scaled: 0.018241179174760775
RMSE for target_wind_gust_Scaled: 0.024705933403559553
RMSE for target_cloudcover_Scaled: 0.07064991325281066
RMSE for target_humidity_Scaled: 0.03900802438075356
RMSE for target_visibility_Scaled: 0.04039451159840566
RMSE for target_wind_speed_Scaled: 0.033386858250770375
RMSE for target_max_temp_Scaled: 0.1108155217598517
RMSE for target_min_temp_Scaled: 0.07361221294920577
RMSE for target_feels_like_Scaled: 0.14936672106242377
RMSE for target_heat_index_Scaled: 0.1458979227638074
RMSE for target_wind_chill_Scaled: 0.14848397031248764
RMSE for target_precip_Scaled: 0.6233701209535754
RMSE for target_pressure_Scaled: 0.12024387778230382
RMSE for target_temp_Scaled: 0.1453314535




In [13]:
# Evaluate the model
r2_GBT = model.evaluate(test, "r2")

# Print the results
for target, result in r2_GBT.items():
    print("R2 for {}: {}".format(target, result))

Evaluating...: 31model [00:33,  1.08s/model]

R2 for target_total_snow: nan
R2 for target_sun_hour_Scaled: 0.9605989985627245
R2 for target_uv_index_1_Scaled: 0.9829903908832214
R2 for target_uv_index_2_Scaled: 0.9276704173310971
R2 for target_moon_illumunation_Scaled: 0.9883033471889678
R2 for target_dew_point_Scaled: 0.945413633684162
R2 for target_wind_gust_Scaled: 0.9189468683909119
R2 for target_cloudcover_Scaled: 0.9378602025830505
R2 for target_humidity_Scaled: 0.9691974524593527
R2 for target_visibility_Scaled: 0.8660074526074779
R2 for target_wind_speed_Scaled: 0.9235833829037668
R2 for target_max_temp_Scaled: 0.9677910066430013
R2 for target_min_temp_Scaled: 0.9756300517996387
R2 for target_feels_like_Scaled: 0.941562595459364
R2 for target_heat_index_Scaled: 0.9427181879666872
R2 for target_wind_chill_Scaled: 0.9528485224831669
R2 for target_precip_Scaled: 0.5132030698952297
R2 for target_pressure_Scaled: 0.9518392612382103
R2 for target_temp_Scaled: 0.9534295163859264
R2 for target_wind_dir_sin: 0.9598965607846011
R2 f




In [15]:
# Save the best model
model.save()

In [16]:
# Predict the test data and save it into csv
predictions = model.predict(test)

# merge predictions with test data
results = test.join(predictions, "ID", "inner")

# Drop everything except the target and prediction columns
results = results.select([c for c in results.columns if c.startswith("target_") or c.startswith("prediction_")])

# Save into project/output/GBT_predictions on HDFS (as a one partition)
results.coalesce(1).write.mode("overwrite").format("csv").option("sep", ",").option("header","true").save("project/output/GBT_predictions.csv")

run("hdfs dfs -cat project/output/GBT_predictions.csv/*.csv > output/GBT_predictions.csv")

Predicting... : 31model [00:07,  3.96model/s]


# RF
### Train and evaluate WITHOUT grid search

In [17]:
# Initialize the model
model = Multitarget_Model(feature_columns, target_columns, model_type="RF")

# Fit the model
model.fit(train, cv=False)

Training...: 100%|██████████| 31/31 [03:02<00:00,  5.89s/model]


In [18]:
# Evaluate the model
results = model.evaluate(test, "rmse")

# Print the results
for target, result in results.items():
    print("RMSE for {}: {}".format(target, result))

Evaluating...: 31model [00:38,  1.25s/model]

RMSE for target_total_snow: 0.0
RMSE for target_sun_hour_Scaled: 0.040916917957279635
RMSE for target_uv_index_1_Scaled: 0.02265721860837882
RMSE for target_uv_index_2_Scaled: 0.062193370185898364
RMSE for target_moon_illumunation_Scaled: 0.10054874935131365
RMSE for target_dew_point_Scaled: 0.016237745523154913
RMSE for target_wind_gust_Scaled: 0.022048847515149322
RMSE for target_cloudcover_Scaled: 0.07545785309091008
RMSE for target_humidity_Scaled: 0.042016179957568164
RMSE for target_visibility_Scaled: 0.037695924559538754
RMSE for target_wind_speed_Scaled: 0.030341742869220287
RMSE for target_max_temp_Scaled: 0.12922697629615454
RMSE for target_min_temp_Scaled: 0.08162200944318304
RMSE for target_feels_like_Scaled: 0.12059893108809781
RMSE for target_heat_index_Scaled: 0.11745325799083979
RMSE for target_wind_chill_Scaled: 0.12767866972589653
RMSE for target_precip_Scaled: 0.5469272636194055
RMSE for target_pressure_Scaled: 0.13114776787448218
RMSE for target_temp_Scaled: 0.12273




In [19]:
# Evaluate the model
results = model.evaluate(test, "r2")

# Print the results
for target, result in results.items():
    print("R2 for {}: {}".format(target, result))

Evaluating...: 31model [00:38,  1.23s/model]

R2 for target_total_snow: nan
R2 for target_sun_hour_Scaled: 0.9562621189222207
R2 for target_uv_index_1_Scaled: 0.9546423664807323
R2 for target_uv_index_2_Scaled: 0.9529396507378305
R2 for target_moon_illumunation_Scaled: 0.8969692962250133
R2 for target_dew_point_Scaled: 0.9567456461628705
R2 for target_wind_gust_Scaled: 0.9354436371022732
R2 for target_cloudcover_Scaled: 0.9291148183878689
R2 for target_humidity_Scaled: 0.9642635132509563
R2 for target_visibility_Scaled: 0.8833123966517022
R2 for target_wind_speed_Scaled: 0.9368871563310637
R2 for target_max_temp_Scaled: 0.9561991714437489
R2 for target_min_temp_Scaled: 0.9700380949562557
R2 for target_feels_like_Scaled: 0.9619048102507867
R2 for target_heat_index_Scaled: 0.9628765212087964
R2 for target_wind_chill_Scaled: 0.9651363493429739
R2 for target_precip_Scaled: 0.625272977307328
R2 for target_pressure_Scaled: 0.9427086574098622
R2 for target_temp_Scaled: 0.9667862153353415
R2 for target_wind_dir_sin: 0.9638758633944571
R2 




### Train and evaluate WITH grid search

In [20]:
# Initialize the model
model = Multitarget_Model(feature_columns, target_columns, model_type="RF")

# Fit the model
model.fit(train, cv=True, cv_folds=2)

Training...: 100%|██████████| 31/31 [09:36<00:00, 18.61s/model]


In [21]:
# Evaluate the model
rmse_RF = model.evaluate(test, "rmse")

# Print the results
for target, result in rmse_RF.items():
    print("RMSE for {}: {}".format(target, result))

Evaluating...: 31model [00:36,  1.18s/model]

RMSE for target_total_snow: 0.0
RMSE for target_sun_hour_Scaled: 0.06965649453854132
RMSE for target_uv_index_1_Scaled: 0.03922210260957141
RMSE for target_uv_index_2_Scaled: 0.09888785214952549
RMSE for target_moon_illumunation_Scaled: 0.11229055035640503
RMSE for target_dew_point_Scaled: 0.02248910846735099
RMSE for target_wind_gust_Scaled: 0.032308071346283815
RMSE for target_cloudcover_Scaled: 0.10079624282612702
RMSE for target_humidity_Scaled: 0.06180274447607928
RMSE for target_visibility_Scaled: 0.05042754313608699
RMSE for target_wind_speed_Scaled: 0.04681577997064773
RMSE for target_max_temp_Scaled: 0.20581562315279744
RMSE for target_min_temp_Scaled: 0.14255571285733232
RMSE for target_feels_like_Scaled: 0.16246067059308802
RMSE for target_heat_index_Scaled: 0.16677519140897717
RMSE for target_wind_chill_Scaled: 0.18518804445408454
RMSE for target_precip_Scaled: 0.6283612369601856
RMSE for target_pressure_Scaled: 0.17824951745845521
RMSE for target_temp_Scaled: 0.17374387892




In [22]:
# Evaluate the model
r2_RF = model.evaluate(test, "r2")

# Print the results
for target, result in r2_RF.items():
    print("R2 for {}: {}".format(target, result))

Evaluating...: 31model [00:39,  1.28s/model]

R2 for target_total_snow: nan
R2 for target_sun_hour_Scaled: 0.8732420688194871
R2 for target_uv_index_1_Scaled: 0.8640751254765312
R2 for target_uv_index_2_Scaled: 0.8810257840448485
R2 for target_moon_illumunation_Scaled: 0.8715010020642118
R2 for target_dew_point_Scaled: 0.917029664724578
R2 for target_wind_gust_Scaled: 0.8613916694253341
R2 for target_cloudcover_Scaled: 0.8735160978845073
R2 for target_humidity_Scaled: 0.9226795721955364
R2 for target_visibility_Scaled: 0.7911802652340424
R2 for target_wind_speed_Scaled: 0.8497476656600063
R2 for target_max_temp_Scaled: 0.8888952472542782
R2 for target_min_temp_Scaled: 0.9086046477994616
R2 for target_feels_like_Scaled: 0.9308679067343485
R2 for target_heat_index_Scaled: 0.9251517887072098
R2 for target_wind_chill_Scaled: 0.9266564285214425
R2 for target_precip_Scaled: 0.5053766226370127
R2 for target_pressure_Scaled: 0.8941663147941663
R2 for target_temp_Scaled: 0.9334404250265126
R2 for target_wind_dir_sin: 0.9525422126646528
R2 




In [23]:
# Save the best model
model.save()

In [24]:
# Predict the test data and save it into csv
predictions = model.predict(test)

# merge predictions with test data
results = test.join(predictions, "ID", "inner")

# Drop everything except the target and prediction columns
results = results.select([c for c in results.columns if c.startswith("target_") or c.startswith("prediction_")])

# Save into project/output/GBT_predictions on HDFS (as a one partition)
results.coalesce(1).write.mode("overwrite").format("csv").option("sep", ",").option("header","true").save("project/output/RF_predictions.csv")

run("hdfs dfs -cat project/output/RF_predictions.csv/*.csv > output/RF_predictions.csv")

Predicting... : 31model [00:08,  3.87model/s]


In [27]:
import pandas as pd

# The format of table is: target | RMSE (GBT) | R2 (GBT) | RMSE (RF) | R2 (RF)
table = pd.DataFrame(columns=["target", "RMSE (GBT)", "R2 (GBT)", "RMSE (RF)", "R2 (RF)"])

for target in rmse_GBT.keys():
    table = table.append({
        "target": target.replace("target_", ""),
        "RMSE (GBT)": rmse_GBT[target],
        "R2 (GBT)": r2_GBT[target],
        "RMSE (RF)": rmse_RF[target],
        "R2 (RF)": r2_RF[target]
    }, ignore_index=True)
    
table = table.fillna(1)

# Convert to spark
spark_table = spark.createDataFrame(table)

# Save it to HDFS
spark_table.coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .option("sep", ",")\
    .option("header","true")\
    .save("project/output/evaluation.csv")

# Run it from root directory of the repository
run("hdfs dfs -cat project/output/evaluation.csv/*.csv > output/evaluation.csv")