## Predicting CRS scores
We now build a pipeline that, given a specific date, forecasts an estimated CRS cutoff value.

In [50]:
import os
import ast
import pickle
import mlflow
import pandas as pd
from mlflow.tracking import MlflowClient

from utils import splitting as splt
from utils import preprocessing as prep
from utils import registering as regs

# Point MLflow at the SQLite database file `mlflow.db` before any client is created.
# NOTE(review): the three-slash sqlite URI looks like a relative path, so this presumably
# depends on the notebook's working directory — confirm.
mlflow.set_tracking_uri("sqlite:///mlflow.db")

In [51]:
# Default location of the Express Entry rounds JSON used to build the history.
IRCC_ROUNDS_URL = 'https://www.canada.ca/content/dam/ircc/documents/json/ee_rounds_123_en.json'


def preprocess_query_date(query_date, url=IRCC_ROUNDS_URL):
    '''
    Build the single-row feature frame used to predict the CRS cutoff at `query_date`.

    The rounds table is loaded from `url` and cleaned with the `prep` helpers, a
    placeholder row for `query_date` is placed on top of every round dated on or
    before it, and `prep.calculate_independent_vars` is run over the combined
    frame so the placeholder row receives the same derived columns as the
    training data.

    Parameters
    ----------
    query_date : label comparable with the rounds index
        Date to predict for (the notebook passes a string such as '31-Jul-2024').
    url : str, optional
        Location of the rounds JSON; defaults to the IRCC endpoint.

    Returns
    -------
    pandas.DataFrame
        One row, indexed by `query_date`. 'invitations issued' and 'CRS cutoff'
        on that row are dummy zeros, not real observations.
    '''
    rounds = prep.create_df_from_website(url)
    rounds = prep.cleanup_df_general_rounds(rounds)

    # Keep only rounds dated on or before the query date; later rounds are dropped.
    history = rounds.loc[rounds.index <= query_date].copy()

    # Start from an empty frame with the same columns, then add the query row
    # by label so it shares the schema of the historical rows.
    query_row = rounds[:0].copy()
    query_row.loc[query_date, 'round type'] = 'General'
    query_row.loc[query_date, 'invitations issued'] = 0  # dummy data
    query_row.loc[query_date, 'CRS cutoff'] = 0  # dummy data

    # The query row goes first so it can be sliced off the top afterwards.
    # NOTE(review): presumably calculate_independent_vars expects newest-first
    # ordering — confirm against utils.preprocessing.
    combined = pd.concat([query_row, history])

    features = prep.calculate_independent_vars(combined)

    # Only the top (query) row is needed for prediction.
    return features[:1]


In [52]:
def get_prod_info_from_registry(reg_model_name="CRS_Model"):
    '''
    Load the Production model registered under `reg_model_name`, plus its metadata.

    Parameters
    ----------
    reg_model_name : str
        Name of the registered model in the MLflow model registry.

    Returns
    -------
    tuple
        (model, x_labels, model_type, run_id), where `x_labels` is the run's
        'x_labels' parameter parsed back into a Python object and `model_type`
        is the run's 'model_type' tag.

    Raises
    ------
    LookupError
        If none of the model's latest versions is in the 'Production' stage.
    '''
    client = MlflowClient()
    registered = client.get_registered_model(name=reg_model_name)

    # Find the version currently staged as Production. Without this guard a
    # missing Production stage surfaced as an UnboundLocalError on `run_id`.
    production = [
        version for version in registered.latest_versions
        if version.current_stage == 'Production'
    ]
    if not production:
        raise LookupError(
            f"Registered model '{reg_model_name}' has no version in the 'Production' stage"
        )
    # Last match wins, mirroring the original loop.
    run_id = production[-1].run_id

    # Get the details of the run that produced that version.
    run_info = client.get_run(run_id)

    # The feature labels are logged as a string parameter; parse them back
    # with literal_eval (no arbitrary code execution).
    x_labels = ast.literal_eval(run_info.data.params['x_labels'])

    # The model flavour is stored as a run tag and selects the loader.
    model_type = run_info.data.tags['model_type']

    model = regs.load_model_from_mlflow(run_id=run_id, model_type=model_type)

    return model, x_labels, model_type, run_id


In [53]:

def main_run(query_date, reg_model_name="CRS_Model"):
    '''
    Predict the CRS cutoff for `query_date` with the Production model.

    Parameters
    ----------
    query_date : str
        Date to predict for, e.g. '31-Jul-2024' (to be replaced by a JSON input later).
    reg_model_name : str, optional
        Registered model to load from the MLflow registry; defaults to "CRS_Model".

    Returns
    -------
    The first element of the first row of the model's prediction output.
    '''
    # Compute the usual feature columns for the queried date.
    features = preprocess_query_date(query_date)

    # Fetch the model together with the feature labels it was trained on.
    model, x_labels, _, _ = get_prod_info_from_registry(reg_model_name=reg_model_name)

    # Keep only the columns the model expects, in the order it expects them.
    X_vals = features[x_labels]

    # NOTE(review): [0][0] assumes predict() returns a 2-D (rows x outputs)
    # result; a 1-D output would fail here — confirm for every model_type
    # that can reach Production.
    y_pred = model.predict(X_vals)[0][0]

    return y_pred



In [54]:
# Smoke-test the full prediction pipeline for one example date.
# `query_date` and `y_pred` are left at notebook level; the commented-out
# export cells further down refer to them.
query_date = '31-Jul-2024'
y_pred =main_run(query_date)
# The `.0f` format rounds the predicted cutoff to a whole number for display.
print(f'for date {query_date} the predicted crs score is {y_pred:.0f}')

for date 31-Jul-2024 the predicted crs score is 535


### Finish
This notebook can now be exported as a Python script via:
```bash
jupyter nbconvert --to script predict.ipynb
```

In [55]:
# # creating the final results df
# df_result = pd.DataFrame()
# df_result['date'] = [query_date]
# df_result['round type'] = ['General']
# df_result['CRS cutoff predicted'] = [y_pred]
# df_result


In [56]:

# # saving as parquet file
# output_file=f'./outputs/predicted_CRS_{query_date}.parquet'
# # create any directories if does not exist
# os.makedirs(os.path.dirname(output_file), exist_ok=True)

# df_result.to_parquet(
#     output_file,
#     engine='pyarrow',
#     compression=None,
#     index=False
# )
