In [1]:
# Install PyYAML with the %pip magic, which targets the environment of the
# running kernel (a `!pip` subshell may resolve to a different interpreter).
%pip install pyyaml
import argparse
import yaml
import pandas as pd
import numpy as np
import json
import time
import logging
import os



In [2]:
import os

# Create the project folder under the current working directory.
# exist_ok=True means an already existing folder is not an error.
project_dir = "mlops-task"
os.makedirs(project_dir, exist_ok=True)
print("Folder created")

Folder created


In [3]:
# Make the project folder the working directory so every later cell writes
# its files there. The guard keeps the cell idempotent: running it a second
# time would otherwise try to descend into mlops-task/mlops-task.
if os.path.basename(os.getcwd()) != "mlops-task":
    os.chdir("mlops-task")
print(os.getcwd())

/content/mlops-task


In [4]:
# Scaffold the project files up front; later cells fill in their content.
files = ["run.py", "config.yaml", "requirements.txt", "Dockerfile", "README.md"]

for file in files:
    # Append mode creates a missing file but, unlike "w", never truncates an
    # existing one, so re-running this cell cannot wipe content that a later
    # cell has already written (run.py, config.yaml, ...).
    with open(file, "a"):
        pass

print("Files created")

Files created


In [5]:
import pandas as pd
import numpy as np

# Build a reproducible synthetic OHLCV dataset and save it as data.csv, the
# input file for run.py. The order of the np.random calls below determines the
# generated values, so keep it stable.
np.random.seed(42)
rows = 10000

data = {
    # One row per minute. "min" is the supported spelling of the minute
    # frequency; the single-letter alias "T" is deprecated in pandas and
    # triggers a FutureWarning.
    "timestamp": pd.date_range(start="2024-01-01", periods=rows, freq="min"),
    "open": np.random.uniform(100, 200, rows),
    "high": np.random.uniform(200, 300, rows),
    "low": np.random.uniform(50, 100, rows),
    "close": np.random.uniform(100, 200, rows),
    "volume_btc": np.random.uniform(1, 10, rows),
    "volume_usd": np.random.uniform(1000, 10000, rows)
}

df = pd.DataFrame(data)
df.to_csv("data.csv", index=False)

print("data.csv created")

data.csv created


  "timestamp": pd.date_range(start="2024-01-01", periods=rows, freq="T"),


In [6]:
# Settings read by run.py: RNG seed, rolling-window length and a version tag.
config_content = (
    "seed: 42\n"
    "window: 5\n"
    'version: "v1"\n'
)

with open("config.yaml", "w") as config_file:
    config_file.write(config_content)

print("config.yaml created!")

config.yaml created!


In [7]:
# Echo config.yaml back to confirm what was written to disk.
with open("config.yaml", "r") as config_file:
    config_text = config_file.read()
print(config_text)

seed: 42
window: 5
version: "v1"



In [8]:


# Step 1 of run.py: CLI argument parsing and YAML config loading.
# Later cells overwrite run.py with extended versions of this script.
code = '''
"""Pipeline entry point (step 1): parse CLI arguments and load the YAML config."""
import argparse
import yaml
import pandas as pd
import numpy as np
import json
import time
import logging
import os
import sys


def main(argv=None):
    """Load the config named by --config and echo its settings.

    Args:
        argv: Optional list of CLI arguments; argparse falls back to
            sys.argv[1:] when it is None.

    Returns:
        0 on success, 1 if the config cannot be read or lacks a required key.
    """
    parser = argparse.ArgumentParser()

    # --input, --output and --log-file are required but not used at this step.
    parser.add_argument("--input", required=True)
    parser.add_argument("--config", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--log-file", required=True)

    args = parser.parse_args(argv)

    try:
        # Load config
        with open(args.config, "r") as f:
            config = yaml.safe_load(f)

        seed = config["seed"]
        window = config["window"]
        version = config["version"]

        # Set seed
        np.random.seed(seed)

        print("Config loaded successfully")
        print(f"Seed: {seed}, Window: {window}, Version: {version}")

    except Exception as e:
        # Report on stderr and fail through the exit status so that shells,
        # CI jobs and Docker can tell a broken run from a good one.
        print("Error loading config:", str(e), file=sys.stderr)
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
'''

with open("run.py", "w") as f:
    f.write(code)

print("Config loading added!")

Config loading added!


In [9]:
# Smoke-test the config-loading version of run.py. All four flags are required by its parser.
!python run.py --input data.csv --config config.yaml --output metrics.json --log-file run.log

Config loaded successfully
Seed: 42, Window: 5, Version: v1


In [10]:
# Same invocation as the previous cell; run.py has not been rewritten in between.
!python run.py --input data.csv --config config.yaml --output metrics.json --log-file run.log

Config loaded successfully
Seed: 42, Window: 5, Version: v1


In [11]:
# Parse config.yaml inside the notebook to inspect the resulting mapping.
with open("config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)
print(config)

{'seed': 42, 'window': 5, 'version': 'v1'}


In [12]:
# Run the config-loading version of run.py once more; the script on disk is still unchanged.
!python run.py --input data.csv --config config.yaml --output metrics.json --log-file run.log

Config loaded successfully
Seed: 42, Window: 5, Version: v1


In [13]:
# Step 2 of run.py: data handling. Adds CSV loading and basic validation to
# the config loading; writing the file replaces the previous run.py on disk.
code = '''
"""Pipeline entry point (step 2): load the config, then load and validate the CSV."""
import argparse
import yaml
import pandas as pd
import numpy as np
import json
import time
import logging
import os
import sys


def main(argv=None):
    """Load the config and the input CSV, validating both.

    Args:
        argv: Optional list of CLI arguments; argparse falls back to
            sys.argv[1:] when it is None.

    Returns:
        0 on success, 1 on any failure (unreadable config, missing or empty
        input file, or a CSV without a close column).
    """
    parser = argparse.ArgumentParser()

    # --output and --log-file are required but not used at this step.
    parser.add_argument("--input", required=True)
    parser.add_argument("--config", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--log-file", required=True)

    args = parser.parse_args(argv)

    try:
        # Load config
        with open(args.config, "r") as f:
            config = yaml.safe_load(f)

        seed = config["seed"]
        window = config["window"]
        version = config["version"]

        np.random.seed(seed)

        print("Config loaded successfully")

        # Load CSV
        if not os.path.exists(args.input):
            raise FileNotFoundError("Input file not found")

        df = pd.read_csv(args.input)

        # Validation
        if df.empty:
            raise ValueError("CSV file is empty")

        if "close" not in df.columns:
            raise ValueError("Missing 'close' column")

        print(f"Data loaded successfully: {len(df)} rows")

    except Exception as e:
        # Report on stderr and fail through the exit status so that shells,
        # CI jobs and Docker can tell a broken run from a good one.
        print("Error:", str(e), file=sys.stderr)
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
'''

with open("run.py", "w") as f:
    f.write(code)

print("CSV loading + validation added!")

CSV loading + validation added!


In [14]:
# Run the version of run.py that also loads and validates the CSV.
!python run.py --input data.csv --config config.yaml --output metrics.json --log-file run.log

Config loaded successfully
Data loaded successfully: 10000 rows


In [15]:
# Same invocation as the previous cell; run.py has not been rewritten in between.
!python run.py --input data.csv --config config.yaml --output metrics.json --log-file run.log

Config loaded successfully
Data loaded successfully: 10000 rows


In [16]:
# Step 3 of run.py: rolling mean + signal generation on top of the config and
# CSV loading; writing the file replaces the previous run.py on disk.

code = '''
"""Pipeline entry point (step 3): rolling mean of close and a binary signal."""
import argparse
import yaml
import pandas as pd
import numpy as np
import json
import time
import logging
import os
import sys


def main(argv=None):
    """Load config and data, then derive the rolling_mean and signal columns.

    Args:
        argv: Optional list of CLI arguments; argparse falls back to
            sys.argv[1:] when it is None.

    Returns:
        0 on success, 1 on any failure (bad config, bad input file, or an
        invalid rolling window).
    """
    parser = argparse.ArgumentParser()

    # --output and --log-file are required but not used at this step.
    parser.add_argument("--input", required=True)
    parser.add_argument("--config", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--log-file", required=True)

    args = parser.parse_args(argv)

    try:
        # Load config
        with open(args.config, "r") as f:
            config = yaml.safe_load(f)

        seed = config["seed"]
        window = config["window"]
        version = config["version"]

        # A window below 1 would silently produce an all-NaN mean and thus an
        # all-zero signal; bool is rejected because it is a subclass of int.
        if isinstance(window, bool) or not isinstance(window, int) or window < 1:
            raise ValueError("Config 'window' must be a positive integer")

        np.random.seed(seed)

        print("Config loaded successfully")

        # Load CSV
        if not os.path.exists(args.input):
            raise FileNotFoundError("Input file not found")

        df = pd.read_csv(args.input)

        if df.empty:
            raise ValueError("CSV file is empty")

        if "close" not in df.columns:
            raise ValueError("Missing 'close' column")

        print(f"Data loaded successfully: {len(df)} rows")

        # Rolling Mean. The first window - 1 rows have no full window, so
        # their mean is NaN and the comparison below gives them signal 0.
        df["rolling_mean"] = df["close"].rolling(window=window).mean()

        # Signal Generation: 1 where close is above its rolling mean, else 0.
        df["signal"] = (df["close"] > df["rolling_mean"]).astype(int)

        print("Rolling mean and signals generated")

    except Exception as e:
        # Report on stderr and fail through the exit status so that shells,
        # CI jobs and Docker can tell a broken run from a good one.
        print("Error:", str(e), file=sys.stderr)
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
'''

with open("run.py", "w") as f:
    f.write(code)

print("Rolling mean + signal added!")

Rolling mean + signal added!


In [17]:
# Run the version of run.py that adds the rolling mean and the signal column.
!python run.py --input data.csv --config config.yaml --output metrics.json --log-file run.log

Config loaded successfully
Data loaded successfully: 10000 rows
Rolling mean and signals generated


In [18]:
# Same invocation as the previous cell; run.py has not been rewritten in between.
!python run.py --input data.csv --config config.yaml --output metrics.json --log-file run.log

Config loaded successfully
Data loaded successfully: 10000 rows
Rolling mean and signals generated


In [19]:
# Step 4 of run.py: metrics calculation + JSON output. Writing the file
# replaces the previous run.py on disk.

code = '''
"""Pipeline entry point (step 4): compute the signal rate and write it as JSON."""
import argparse
import yaml
import pandas as pd
import numpy as np
import json
import time
import logging
import os
import sys

# Version reported in the error payload when the config could not be read.
DEFAULT_VERSION = "v1"


def main(argv=None):
    """Run the pipeline and write a metrics (or error) JSON to --output.

    Args:
        argv: Optional list of CLI arguments; argparse falls back to
            sys.argv[1:] when it is None.

    Returns:
        0 when the metrics were written, 1 when an error payload was written.
    """
    parser = argparse.ArgumentParser()

    # --log-file is required but not used at this step.
    parser.add_argument("--input", required=True)
    parser.add_argument("--config", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--log-file", required=True)

    args = parser.parse_args(argv)

    start_time = time.time()

    # Replaced by the configured value as soon as the config is parsed, so an
    # error payload carries the real version instead of a hard-coded one.
    version = DEFAULT_VERSION

    try:
        # Load config
        with open(args.config, "r") as f:
            config = yaml.safe_load(f)

        if not isinstance(config, dict):
            raise ValueError("Config file must contain a YAML mapping")

        version = config.get("version", version)

        # Name the missing keys; a bare KeyError would only say e.g. 'seed'.
        missing = [key for key in ("seed", "window", "version") if key not in config]
        if missing:
            raise ValueError("Missing config keys: " + ", ".join(missing))

        seed = config["seed"]
        window = config["window"]

        # A window below 1 would silently produce an all-NaN mean and thus an
        # all-zero signal; bool is rejected because it is a subclass of int.
        if isinstance(window, bool) or not isinstance(window, int) or window < 1:
            raise ValueError("Config 'window' must be a positive integer")

        np.random.seed(seed)

        # Load CSV
        if not os.path.exists(args.input):
            raise FileNotFoundError("Input file not found")

        df = pd.read_csv(args.input)

        if df.empty:
            raise ValueError("CSV file is empty")

        if "close" not in df.columns:
            raise ValueError("Missing 'close' column")

        # Rolling Mean. The first window - 1 rows have no full window, so
        # their mean is NaN and the comparison below gives them signal 0.
        df["rolling_mean"] = df["close"].rolling(window=window).mean()

        # Signal: 1 where close is above its rolling mean, else 0.
        df["signal"] = (df["close"] > df["rolling_mean"]).astype(int)

        # Metrics
        rows_processed = len(df)
        signal_rate = float(df["signal"].mean())

        latency_ms = int((time.time() - start_time) * 1000)

        # Output JSON
        output = {
            "version": version,
            "rows_processed": rows_processed,
            "metric": "signal_rate",
            "value": round(signal_rate, 4),
            "latency_ms": latency_ms,
            "seed": seed,
            "status": "success"
        }

        with open(args.output, "w") as f:
            json.dump(output, f, indent=4)

        print("Metrics saved successfully!")
        return 0

    except Exception as e:
        error_output = {
            "version": version,
            "status": "error",
            "error_message": str(e)
        }

        with open(args.output, "w") as f:
            json.dump(error_output, f, indent=4)

        # Fail through the exit status as well, so callers that do not parse
        # the JSON (shells, CI jobs, Docker) still see the failure.
        print("Error occurred:", str(e), file=sys.stderr)
        return 1


if __name__ == "__main__":
    sys.exit(main())
'''

with open("run.py", "w") as f:
    f.write(code)

print("Metrics + JSON added!")

Metrics + JSON added!


In [20]:
# Run the version of run.py that writes its metrics to the --output JSON file.
!python run.py --input data.csv --config config.yaml --output metrics.json --log-file run.log

Metrics saved successfully!


In [21]:
# Echo metrics.json back to inspect what the script wrote.
with open("metrics.json", "r") as metrics_file:
    metrics_text = metrics_file.read()
print(metrics_text)

{
    "version": "v1",
    "rows_processed": 10000,
    "metric": "signal_rate",
    "value": 0.503,
    "latency_ms": 25,
    "seed": 42,
    "status": "success"
}


In [22]:
# Step 5 of run.py: adds file logging to the metrics pipeline. Writing the
# file replaces the previous run.py on disk.

code = '''
"""Batch job: signal rate of close versus its rolling mean.

Reads a YAML config (seed, window, version) and a CSV with a close column,
writes a metrics JSON to --output and a log to --log-file.
"""
import argparse
import yaml
import pandas as pd
import numpy as np
import json
import time
import logging
import os
import sys

# Version reported in the error payload when the config could not be read.
DEFAULT_VERSION = "v1"


def main(argv=None):
    """Run the pipeline and write a metrics (or error) JSON to --output.

    Args:
        argv: Optional list of CLI arguments; argparse falls back to
            sys.argv[1:] when it is None.

    Returns:
        0 when the metrics were written, 1 when an error payload was written.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--input", required=True)
    parser.add_argument("--config", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--log-file", required=True)

    args = parser.parse_args(argv)

    # Setup logging
    logging.basicConfig(
        filename=args.log_file,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s"
    )

    start_time = time.time()

    # Replaced by the configured value as soon as the config is parsed, so an
    # error payload carries the real version instead of a hard-coded one.
    version = DEFAULT_VERSION

    try:
        logging.info("Job started")

        # Load config
        with open(args.config, "r") as f:
            config = yaml.safe_load(f)

        if not isinstance(config, dict):
            raise ValueError("Config file must contain a YAML mapping")

        version = config.get("version", version)

        # Name the missing keys; a bare KeyError would only say e.g. 'seed'.
        missing = [key for key in ("seed", "window", "version") if key not in config]
        if missing:
            raise ValueError("Missing config keys: " + ", ".join(missing))

        seed = config["seed"]
        window = config["window"]

        # A window below 1 would silently produce an all-NaN mean and thus an
        # all-zero signal; bool is rejected because it is a subclass of int.
        if isinstance(window, bool) or not isinstance(window, int) or window < 1:
            raise ValueError("Config 'window' must be a positive integer")

        logging.info(f"Config loaded: seed={seed}, window={window}, version={version}")

        np.random.seed(seed)

        # Load CSV
        if not os.path.exists(args.input):
            raise FileNotFoundError("Input file not found")

        df = pd.read_csv(args.input)

        if df.empty:
            raise ValueError("CSV file is empty")

        if "close" not in df.columns:
            raise ValueError("Missing 'close' column")

        logging.info(f"Data loaded: {len(df)} rows")

        # Rolling Mean. The first window - 1 rows have no full window, so
        # their mean is NaN and the comparison below gives them signal 0.
        df["rolling_mean"] = df["close"].rolling(window=window).mean()
        logging.info(f"Rolling mean calculated with window={window}")

        # Signal: 1 where close is above its rolling mean, else 0.
        df["signal"] = (df["close"] > df["rolling_mean"]).astype(int)
        logging.info("Signals generated")

        # Metrics
        rows_processed = len(df)
        signal_rate = float(df["signal"].mean())

        latency_ms = int((time.time() - start_time) * 1000)

        logging.info(f"Metrics: signal_rate={round(signal_rate,4)}, rows_processed={rows_processed}")

        # Output JSON
        output = {
            "version": version,
            "rows_processed": rows_processed,
            "metric": "signal_rate",
            "value": round(signal_rate, 4),
            "latency_ms": latency_ms,
            "seed": seed,
            "status": "success"
        }

        with open(args.output, "w") as f:
            json.dump(output, f, indent=4)

        logging.info(f"Job completed successfully in {latency_ms} ms")

        print("Pipeline completed successfully!")
        return 0

    except Exception as e:
        # logging.exception records the traceback along with the message.
        logging.exception("Job failed: %s", e)

        error_output = {
            "version": version,
            "status": "error",
            "error_message": str(e)
        }

        with open(args.output, "w") as f:
            json.dump(error_output, f, indent=4)

        # Fail through the exit status as well, so callers that do not parse
        # the JSON (shells, CI jobs, Docker) still see the failure.
        print("Error occurred:", str(e), file=sys.stderr)
        return 1


if __name__ == "__main__":
    sys.exit(main())
'''

with open("run.py", "w") as f:
    f.write(code)

print("Logging added!")

Logging added!


In [23]:
# Run the logging-enabled version of run.py; it also writes a log to the --log-file path.
!python run.py --input data.csv --config config.yaml --output metrics.json --log-file run.log

Pipeline completed successfully!


In [24]:
# Echo run.log back to inspect what the script logged.
with open("run.log", "r") as log_file:
    log_text = log_file.read()
print(log_text)

2026-02-21 18:27:47,970 - INFO - Job started
2026-02-21 18:27:47,971 - INFO - Config loaded: seed=42, window=5, version=v1
2026-02-21 18:27:48,007 - INFO - Data loaded: 10000 rows
2026-02-21 18:27:48,009 - INFO - Rolling mean calculated with window=5
2026-02-21 18:27:48,010 - INFO - Signals generated
2026-02-21 18:27:48,010 - INFO - Metrics: signal_rate=0.503, rows_processed=10000
2026-02-21 18:27:48,011 - INFO - Job completed successfully in 40 ms



In [25]:
# requirements.txt
#
# Third-party packages imported by run.py. Each one is bounded to a major
# version range so a rebuilt image cannot silently pick up a breaking release.
# NOTE(review): the bounds are conservative guesses; for fully reproducible
# builds replace them with the exact versions reported by `pip freeze`.

req = """pandas>=1.5,<3
numpy>=1.23,<3
pyyaml>=6.0,<7
"""

with open("requirements.txt", "w") as f:
    f.write(req)

print("requirements.txt created!")

requirements.txt created!


In [26]:
# Dockerfile
#
# Image recipe that installs the requirements, copies run.py, config.yaml and
# data.csv into /app, and runs the pipeline with the same flags used above.

docker_content = """FROM python:3.9-slim

WORKDIR /app

# Dependencies are installed before the sources are copied so this layer is
# reused until requirements.txt changes; --no-cache-dir keeps pip's download
# cache out of the image.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY run.py .
COPY config.yaml .
COPY data.csv .

CMD ["python", "run.py", "--input", "data.csv", "--config", "config.yaml", "--output", "metrics.json", "--log-file", "run.log"]
"""

with open("Dockerfile", "w") as f:
    f.write(docker_content)

print("Dockerfile created!")

Dockerfile created!


In [27]:
# Archive the current directory recursively into mlops-task.zip.
# NOTE(review): per the listing below, generated artifacts (run.log, metrics.json) are bundled too; confirm that is intended.
!zip -r mlops-task.zip .

  adding: run.py (deflated 64%)
  adding: run.log (deflated 50%)
  adding: data.csv (deflated 55%)
  adding: Dockerfile (deflated 36%)
  adding: requirements.txt (stored 0%)
  adding: config.yaml (stored 0%)
  adding: README.md (stored 0%)
  adding: metrics.json (deflated 32%)


In [28]:
# Hand the archive to google.colab's files.download.
# NOTE(review): this import rebinds the name `files`, which an earlier cell used for a list of file names.
from google.colab import files
files.download("mlops-task.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>