In [1]:
import sys
import os
# Make the sibling scripts directory importable for the project modules
# imported in the next cell. Guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path.
SCRIPTS_DIR = os.path.abspath('../scripts')
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

In [2]:
import pandas as pd
import numpy as np
import json
import pickle
import data_preprocessing
import train_model
import predict_and_compare
import utils

def test_workflow(data_dir='../data', n_stations=5):
    """Smoke-test the pipeline end to end: preprocess, train, predict.

    Steps, in order:
      1. ``data_preprocessing.preprocess_data()`` is run.
      2. The processed parquet file is read and ``train_model.train_model`` is
         called on the first ``n_stations`` distinct station codes.
      3. The returned models and scaler are pickled to disk and loaded back,
         so prediction runs on the deserialized objects.
      4. The current-data JSON is pushed through the ``predict_and_compare``
         helpers (preprocess, nearby-station status, normalisation,
         prediction) and the results are saved.

    Args:
        data_dir: Directory holding the input files and receiving the test
            artifacts. Resolved against the current working directory.
            NOTE(review): ``preprocess_data()`` takes no path argument here,
            so this presumably has to match the directory it writes to --
            confirm before using a non-default value.
        n_stations: How many distinct station codes to train on. ``None``
            selects every station.

    Returns:
        None. Progress is reported by the called modules; a confirmation line
        is printed on success.

    Side effects:
        Writes ``test_model.pkl`` and ``test_scaler.pkl`` into ``data_dir``,
        in addition to whatever the called project functions write.
    """
    def data_path(filename):
        # Single place that turns a file name into an absolute path, so every
        # file in the workflow is resolved the same way.
        return os.path.abspath(os.path.join(data_dir, filename))

    # 1. Preprocess Data
    data_preprocessing.preprocess_data()

    # 2. Train Model
    bike_data = pd.read_parquet(data_path('processed_bike_data.parquet'))
    # Use a subset of stations to keep the test run short.
    stations = bike_data['stationcode'].unique()[:n_stations]
    # The third return value is not needed by this workflow.
    models, scaler, _nearby_station_results = train_model.train_model(bike_data, stations)

    # Save and reload models and scaler so the prediction step exercises the
    # same serialization round-trip as a real deployment would.
    model_file_path = data_path('test_model.pkl')
    scaler_file_path = data_path('test_scaler.pkl')
    with open(model_file_path, 'wb') as f:
        pickle.dump(models, f)
    with open(scaler_file_path, 'wb') as f:
        pickle.dump(scaler, f)

    # SECURITY: pickle.load executes arbitrary code from the file. This is
    # acceptable only because both files were written a few lines above.
    with open(model_file_path, 'rb') as f:
        loaded_models = pickle.load(f)
    with open(scaler_file_path, 'rb') as f:
        loaded_scaler = pickle.load(f)

    # 3. Predict and Compare
    # JSON is read as UTF-8 explicitly rather than with the platform default.
    with open(data_path('use_for_predictions.json'), 'r', encoding='utf-8') as f:
        current_data = json.load(f)
    current_bike_data = predict_and_compare.preprocess_current_data(current_data)
    current_bike_data = predict_and_compare.calculate_nearby_station_status(current_bike_data, limit=5)

    # Load feature names used during training
    with open(data_path('training_feature_names.json'), 'r', encoding='utf-8') as f:
        feature_names = json.load(f)

    current_bike_data = predict_and_compare.normalize_features(current_bike_data, feature_names, loaded_scaler)
    results_df = predict_and_compare.make_predictions(current_bike_data, loaded_models, feature_names)
    predict_and_compare.save_results(results_df, current_bike_data)

    print("Test workflow completed successfully")


In [None]:
# Run the whole workflow. This is a long-running cell: in the recorded output
# the JSON loading step took about 1m19s and training 5 stations about 4m13s.
test_workflow()

Loading cleaned bike data...


Loading JSON files: 100%|█████████████████████████████████████████████████████████████████████████| 7/7 [01:19<00:00, 11.41s/it]


Preprocessing bike data...
Extracting latitude and longitude...
Stations with duplicate coordinates found:
Empty DataFrame
Columns: [stationcode, name, is_installed, capacity, numdocksavailable, numbikesavailable, mechanical, ebike, is_renting, is_returning, duedate, coordonnees_geo, nom_arrondissement_communes, date, lat, lon]
Index: []
Duplicate coordinate groups:
Empty DataFrame
Columns: [lat, lon, count]
Index: []
Duplicate coordinates data saved to /Users/anthonybellon/Comp_Documents/VelibVisualisation/data/duplicate_coordinates.json
Removing NaN values...
Creating features...
Feature names saved to /Users/anthonybellon/Comp_Documents/VelibVisualisation/data/feature_names.json
Data saved to /Users/anthonybellon/Comp_Documents/VelibVisualisation/data/processed_bike_data.parquet


2024-06-09 21:06:19,846 - INFO - Training models...


Creating features...


Training models:   0%|                                                                               | 0/5 [00:00<?, ?station/s]

Valid indices for station 10001: [1019, 7, 6, 1005, 1, 0, 1006, 10, 1004, 9]


2024-06-09 21:07:02,319 - INFO - Cross-Validation Score for station 10001: -0.0031164632755577874
2024-06-09 21:07:02,578 - INFO - Mean Squared Error for station 10001: 0.0006502646279965751
Training models:  20%|██████████████▏                                                        | 1/5 [00:12<00:51, 12.97s/station]

Valid indices for station 10001_relais: [1019, 7, 6, 1005, 1, 0, 10, 1004, 9, 11]


2024-06-09 21:07:59,721 - INFO - Cross-Validation Score for station 10001_relais: -6.863696860913937e-07
2024-06-09 21:08:02,225 - INFO - Mean Squared Error for station 10001_relais: 9.791210155841647e-09
Training models:  40%|████████████████████████████▍                                          | 2/5 [01:12<02:01, 40.43s/station]

Valid indices for station 10003: [716, 759, 764, 798, 1444, 40, 4, 758, 44, 2, 3]


2024-06-09 21:09:00,943 - INFO - Cross-Validation Score for station 10003: -5.782926033007414e-06
2024-06-09 21:09:04,588 - INFO - Mean Squared Error for station 10003: 2.1919276035797052e-07
Training models:  60%|██████████████████████████████████████████▌                            | 3/5 [02:14<01:40, 50.44s/station]

Valid indices for station 10004: [1012, 1019, 63, 758, 44, 2, 3, 7, 5, 19]


2024-06-09 21:10:06,128 - INFO - Cross-Validation Score for station 10004: -8.935428421769649e-06
2024-06-09 21:10:08,699 - INFO - Mean Squared Error for station 10004: 1.128944890171254e-05
Training models:  80%|████████████████████████████████████████████████████████▊              | 4/5 [03:19<00:55, 55.84s/station]

Out of bounds indices for station 10005: [1474]
Max valid index: 1460
Valid indices for station 10005: [764, 1444, 40, 1443, 4, 1441, 1434, 2, 19, 21]


2024-06-09 21:11:00,109 - INFO - Cross-Validation Score for station 10005: -1.0852655380635226e-05
2024-06-09 21:11:02,720 - INFO - Mean Squared Error for station 10005: 1.1752241334299238e-09
Training models: 100%|███████████████████████████████████████████████████████████████████████| 5/5 [04:13<00:00, 50.62s/station]
  current_bike_data.fillna(0, inplace=True)
Calculating nearby station status:  60%|██████████████████████████████████▊                       | 3/5 [00:05<00:03,  1.94s/it]