## Predicting CRS scores
We now build a pipeline that, given a specific date, forecasts an estimated CRS cutoff value for that date.

In [34]:
import os
import ast
import pickle
import mlflow
import pandas as pd
from mlflow.tracking import MlflowClient

from utils import splitting as splt
from utils import preprocessing as prep

In [35]:
# Date to forecast, written as DD-Mon-YYYY. Later cells use this value to
# filter the historical rounds, to label the prediction row, and to name the
# output file.
query_date = '31-Jul-2024'

In [36]:
# Source of the published rounds data (JSON feed on canada.ca).
url = 'https://www.canada.ca/content/dam/ircc/documents/json/ee_rounds_123_en.json'

# Download the feed into a DataFrame, then run the general-rounds cleanup on it.
df = prep.cleanup_df_general_rounds(prep.create_df_from_website(url))

# Keep only the rows whose index is on or before the query date; copy so that
# later edits to df_sub never write back into df.
on_or_before_query = df.index <= query_date
df_sub = df.loc[on_or_before_query].copy()


In [37]:
# Create a new row just for prediction: df[:0] is used as an empty slice so
# that df_query starts with df's columns but none of its rows; the first .loc
# assignment then adds a row labelled with the query date.
df_query = df[:0].copy()
df_query.loc[query_date,'round type'] = 'General'

# Set up dummy data: zeros are placeholders, not real observations.
# NOTE(review): presumably these only exist so the row is complete for the
# later feature calculation -- confirm they do not leak into computed features.
df_query.loc[query_date,'invitations issued'] = 0
df_query.loc[query_date,'CRS cutoff'] = 0


In [38]:
# Concatenate the query row with the older data. df_query is listed first, so
# the query row is the first row of the combined frame. Note that this rebinds
# df, replacing the full frame loaded earlier.
df = pd.concat([df_query, df_sub])

In [39]:
# Calculate independent vars on the combined frame (query row + history).
# NOTE(review): presumably this derives the model's input features for each
# row from the other rounds -- confirm against
# utils.preprocessing.calculate_independent_vars.
df = prep.calculate_independent_vars(df)

In [40]:
# Retain only the top row, which is expected to be the query row because it
# was placed first when the frames were concatenated.
# NOTE(review): assumes calculate_independent_vars preserves row order -- TODO confirm.
# This cell rebinds df, so re-running it alone cannot bring the dropped rows back.
df = df[:1]

In [41]:
# Get model: point MLflow at the SQLite tracking database file mlflow.db and
# create the client used by the following cells to query the registry and runs.
# NOTE(review): the URI is a relative path, so the notebook presumably must be
# run from the directory containing mlflow.db -- confirm.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
client = MlflowClient()


In [42]:
# Look up the registered model by name and display it.
# NOTE(review): the displayed repr includes each version's `source`, which in
# the saved output is an absolute local path -- consider clearing this output
# before sharing the notebook.
name = "CRS_Model"
mymodel = client.get_registered_model(name)
mymodel

<RegisteredModel: aliases={}, creation_timestamp=1721779326827, description=None, last_updated_timestamp=1721779971269, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1721779326832, current_stage='Archived', description=None, last_updated_timestamp=1721779971269, name='CRS_Model', run_id='95d5d96e6ef144078192f1c8a248eec3', run_link=None, source=('/Users/andreanicolas/Library/CloudStorage/GoogleDrive-andreanicolas91@gmail.com/My '
 'Drive/ASU_ComputerScience/CRScanda_MLOps/mlruns/1/95d5d96e6ef144078192f1c8a248eec3/artifacts/model'), status='READY', status_message=None, tags={}, user_id=None, version=1>,
 <ModelVersion: aliases=[], creation_timestamp=1721779971259, current_stage='Production', description=None, last_updated_timestamp=1721779971269, name='CRS_Model', run_id='24b3c7fbc4dc409499f4641c720a2b82', run_link=None, source=('/Users/andreanicolas/Library/CloudStorage/GoogleDrive-andreanicolas91@gmail.com/My '
 'Drive/ASU_ComputerScience/CRScanda_MLOps/mlruns/1/24b3c7

In [43]:
# Get the run id for the production model.
# run_id is initialised first so that a registry without a 'Production'
# version fails here with a clear message, instead of raising NameError in a
# later cell (fresh kernel) or silently reusing a stale run_id (re-run).
run_id = None
for lv in mymodel.latest_versions:
    if lv.current_stage == 'Production':
        run_id = lv.run_id
        print(run_id)

if run_id is None:
    raise LookupError(
        "No model version in the 'Production' stage was found for the registered model."
    )



24b3c7fbc4dc409499f4641c720a2b82


In [44]:
# Fetch the run behind the production model and pull out its logged
# parameters and tags.
run_info = client.get_run(run_id)
params = run_info.data.params
tags = run_info.data.tags

# 'x_labels' is stored as a string parameter; literal_eval turns it back into
# a Python literal without executing arbitrary code.
x_labels = ast.literal_eval(params['x_labels'])

# The model type is recorded as a tag on the run.
model_type = tags['model_type']

In [45]:
# Build the model inputs from df (reduced to the query row earlier) using the
# feature names read from the production run. The second return value is
# discarded.
# NOTE(review): presumably the second value is the target built from y_labels
# -- confirm in utils.splitting.create_features.
X_vals, _ = splt.create_features(df,x_labels=x_labels,y_labels=['CRS cutoff'])

In [46]:
# Artifact path of the pickled model inside the run, keyed by model type.
model_filename = f'./models/{model_type}_model.bin'

# Download the artifact into the working directory; download_artifacts returns
# the local path of the downloaded file.
local_path = client.download_artifacts(run_id, model_filename, './')

# Load the pickle model from the path MLflow actually wrote to, rather than
# assuming the artifact-relative path resolves to the same file.
# NOTE: pickle.load can execute arbitrary code -- only load artifacts from a
# tracking store you trust.
with open(local_path, "rb") as f:
    model = pickle.load(f)


In [47]:
# Evaluate the model on the query features. The result of predict is indexed
# twice, i.e. it is treated as a nested/2-D structure, and the first element
# of its first entry is taken as the predicted value.
# NOTE(review): assumes predict returns one row per input with the CRS cutoff
# in the first column -- confirm for every model_type that can be loaded.
y_pred = model.predict(X_vals)[0][0]


In [48]:
# Assemble the one-row results table: the queried date, the round type, and
# the predicted CRS cutoff.
df_result = pd.DataFrame(
    {
        'date': [query_date],
        'round type': ['General'],
        'CRS cutoff predicted': [y_pred],
    }
)
df_result


Unnamed: 0,date,round type,CRS cutoff predicted
0,31-Jul-2024,General,535.486231


In [49]:

# Save the prediction as a parquet file named after the query date.
output_file = f'./outputs/predicted_CRS_{query_date}.parquet'

# Make sure the destination directory exists before writing.
output_dir = os.path.dirname(output_file)
os.makedirs(output_dir, exist_ok=True)

# Write uncompressed with pyarrow and leave the DataFrame index out of the file.
df_result.to_parquet(output_file, engine='pyarrow', compression=None, index=False)
