# Installing packages section

In [None]:
!pip install --upgrade pip
!pip install "snowflake-connector-python[pandas]" "snowflake-snowpark-python[pandas]" snowflake-snowpark-python==1.9.0 numpy pandas matplotlib scikit-learn xgboost seaborn python-dateutil tqdm holidays faker
!pip install --upgrade --q snowflake-snowpark-python==1.9.0
!pip uninstall urllib3 -y
!pip install urllib3==1.26.15
!pip install fosforml==1.1.6

# Importing packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import joblib
# from fosforio import snowflake
from fosforml import *
from fosforml.constants import MLModelFlavours
# from fosforio import get_dataframe
from matplotlib import pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', 500)
import seaborn as sns
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
import warnings; warnings.simplefilter('ignore')
from joblib import dump, load
import requests
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
import configparser
from dateutil.relativedelta import relativedelta
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
%matplotlib inline

# Fetching the datasets

--OLD code: DO NOT RUN
from fosforio import snowflake
from fosforio import get_dataframe

snowflake.get_connection(connection_name="ME_AD_SALES_CXN")

--OLD code: DO NOT RUN
df_all = get_dataframe("DF_ALL")
df_opt = get_dataframe("DF_OPT")

In [None]:
# Obtain a session from the fosforml SDK; `my_session` is used by the next
# cells to query the DF_ALL and DF_OPT tables.
from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()

In [None]:
# Read every row of DF_ALL through the session and materialise it as a pandas DataFrame.
table_name = 'DF_ALL'
df_all = my_session.sql(f"select * from {table_name}").to_pandas()

In [None]:
# Read every row of DF_OPT through the session and materialise it as a pandas DataFrame.
table_name = 'DF_OPT'
df_opt = my_session.sql(f"select * from {table_name}").to_pandas()

In [None]:
# Working copies; these later receive the prediction columns
# (PREDICTED_REVENUE / OPT_REVENUE) and are uploaded to Snowflake.
df_all_copy, df_opt_copy = df_all.copy(), df_opt.copy()

In [None]:
# Display the loaded DF_ALL frame.
df_all

In [None]:
# Column names, dtypes and non-null counts of DF_ALL.
df_all.info()

In [None]:
# Column names, dtypes and non-null counts of DF_OPT.
df_opt.info()

# Predictive Modelling

In [None]:
# Parse AD_DATE as datetime on both source frames and on their working copies,
# so the date comparisons in the following cells work on real timestamps.
for frame in (df_all, df_opt, df_all_copy, df_opt_copy):
    frame['AD_DATE'] = pd.to_datetime(frame['AD_DATE'])

In [None]:
# Date-based hold-out: rows dated up to and including 2024-07-05 form the
# training set, rows dated after it form the test set. The same cut-off literal
# is reused further down when the scored frames are split.
training_data = df_all[df_all.AD_DATE <= '2024-07-05']
testing_data = df_all[df_all.AD_DATE > '2024-07-05']

In [None]:
# Columns removed from both splits before modelling (ids, the date, ad type /
# line-item group and the city attributes).
non_feature_cols = [
    'SITE_ID', 'ADVERTISER_ID',
    'AD_DATE', 'AD_TYPE', 'LINE_ITEM_GROUP',
    'CITY', 'POPULATION', 'CITY_LAT', 'CITY_LON',
]
training_data = training_data.drop(columns=non_feature_cols)
testing_data = testing_data.drop(columns=non_feature_cols)


In [None]:
# Separate the TOTAL_REVENUE target from the predictors in each split.
target_col = 'TOTAL_REVENUE'
X_train = training_data.drop(columns=target_col)
y_train = training_data[target_col]
X_test = testing_data.drop(columns=target_col)
y_test = testing_data[target_col]

In [None]:
# Display the training feature matrix.
X_train

In [None]:
# Impression counts: standardised, then projected onto two principal components.
pc_col = ['TOTAL_IMPRESSIONS', 'VIEWABLE_IMPRESSIONS', 'MEASURABLE_IMPRESSIONS']
impression_steps = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
])

# Categorical descriptors: one-hot encoded with handle_unknown='ignore'.
cat_col = ['DEVICE_TYPE', 'LINE_ITEM_TYPE', 'OS_TYPE', 'AD_FORMAT',
           'MONETIZATION_CHANNEL', 'AD_MEDIA_TYPE']
category_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', impression_steps, pc_col),
        ('cat', category_encoder, cat_col),
    ])

# Preprocessing followed by a regressor; the 'regressor' step is the one the
# `regressor__*` keys of the search grid refer to.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor()),
])

In [None]:
# Search grid(s) handed to GridSearchCV. Each entry is a dict whose 'name' is a
# label used in log messages and file names; every other key is a pipeline
# parameter mapped to the list of values to try.
#
# The forest is seeded so that the fitted model, the saved pickle and the
# predictions derived from it are the same on every run of the notebook.
RANDOM_STATE = 42

models = [
    {
        'name': 'RandomForestRegressor',
        'regressor': [RandomForestRegressor(random_state=RANDOM_STATE)],
        'regressor__n_estimators': [200],
        'regressor__max_depth': [10],
        'regressor__min_samples_split': [2],
        'regressor__min_samples_leaf': [3],
        'regressor__bootstrap': [True]
    },
]

# Wider search space, kept for reference (not executed):
# models = [
#     {
#         'name': 'RandomForestRegressor',
#         'regressor': [RandomForestRegressor()],
#         'regressor__n_estimators': [50, 150, 100, 175, 200],
#         'regressor__max_depth': [10, 2, 5, 7, 9, 12],
#         'regressor__min_samples_split': [2, 3, 4, 5, 6],
#         'regressor__min_samples_leaf': [1, 2, 3],
#         'regressor__bootstrap': [True, False]
#     },
# ]

In [None]:

# Run one grid search per entry of `models`, keep each best pipeline in
# `best_estimators` and persist it with joblib.
best_estimators = []
for model_params in models:
    # Read the label and build the grid WITHOUT mutating `models`: popping
    # 'name' in place made this cell raise KeyError when it was run a second time.
    model_name = model_params['name']
    param_grid = {key: values for key, values in model_params.items() if key != 'name'}

    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='r2', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # `best_estimator` stays bound after the loop and is reused by later cells
    # (prediction, scoring test, model registration).
    best_estimator = grid_search.best_estimator_
    best_estimators.append(best_estimator)
    print(f"Training completed for model {model_name}")

    # Save the best model
    model_path = f'best_model_{model_name}.pkl'
    joblib.dump(best_estimator, model_path)
    print(f"Best model {model_name} saved to {model_path}")

In [None]:
# Evaluate every tuned pipeline on the hold-out split and tabulate MSE / R².
# `y_pred_test` remains bound afterwards and is passed to register_model later;
# `y_pred_train` is computed but not used in this cell.
results = []
for estimator in best_estimators:
    y_pred_train = estimator.predict(X_train)
    y_pred_test = estimator.predict(X_test)
    mse = mean_squared_error(y_test, y_pred_test)
    r2 = r2_score(y_test, y_pred_test)

    regressor = estimator.named_steps['regressor']
    results.append(dict(
        model=type(regressor).__name__,
        best_params=regressor.get_params(),
        mse=mse,
        r2=r2,
    ))

results_df = pd.DataFrame(results)
results_df

In [None]:
# Display the working copy of DF_ALL before the prediction column is added.
df_all_copy

In [None]:
# Score every row of the working copy. `best_estimator` is the variable left
# bound by the grid-search loop. The whole frame is passed in; presumably the
# pipeline's ColumnTransformer selects only the columns it was configured with
# and ignores the rest — TODO confirm.
df_all_copy['PREDICTED_REVENUE'] = best_estimator.predict(df_all_copy)

In [None]:
# Display the working copy of DF_OPT before the prediction column is added.
df_opt_copy

In [None]:
# Same model applied to the DF_OPT rows; the result is stored as OPT_REVENUE.
df_opt_copy['OPT_REVENUE'] = best_estimator.predict(df_opt_copy)

In [None]:
# Split the scored DF_ALL copy at the training cut-off date.
# Explicit copies are taken because a later cell assigns the AD_DATE column of
# these frames in place; writing to the result of a boolean filter is a
# chained-assignment hazard that pandas flags with SettingWithCopyWarning
# (silenced in this notebook by the global warnings filter).
df_all_copy_training_data = df_all_copy[df_all_copy.AD_DATE <= '2024-07-05'].copy()
df_all_copy_testing_data = df_all_copy[df_all_copy.AD_DATE > '2024-07-05'].copy()

In [None]:
# Split the scored DF_OPT copy at the training cut-off date.
# Explicit copies are taken because later cells rename columns and assign
# AD_DATE on these frames in place; writing to the result of a boolean filter
# is a chained-assignment hazard that pandas flags with SettingWithCopyWarning
# (silenced in this notebook by the global warnings filter).
df_opt_copy_training_data = df_opt_copy[df_opt_copy.AD_DATE <= '2024-07-05'].copy()
df_opt_copy_testing_data = df_opt_copy[df_opt_copy.AD_DATE > '2024-07-05'].copy()

# Pushing Model output to Snowflake

In [None]:
import os
from snowflake.snowpark.session import Session

# Connection settings are read from (lower-case) environment variables;
# os.getenv yields None for any variable that is not set.
user, password, account = os.getenv("user"), os.getenv("password"), os.getenv("account")
warehouse, database = os.getenv("warehouse"), os.getenv("database")
schema, role = os.getenv("schema"), os.getenv("role")

connection_params = {
    "user": user,
    "password": password,
    "account": account,
    "warehouse": warehouse,
    "database": database,
    "schema": schema,
    "role": role,
}

session = Session.builder.configs(connection_params).create()

# Select the warehouse, database and schema for this session.
session.sql(f'use warehouse {warehouse};').collect()

session.sql(f'use database {database};').collect()

session.sql(f'use schema {database}.{schema};').collect()

In [None]:
# Display the scored DF_OPT copy (columns still carry their original names here).
df_opt_copy

In [None]:
# Prefix the input columns of the DF_OPT-derived frames with OPT_. The same
# mapping is applied, in place, to the full frame and to both date splits.
opt_column_names = {
    "TOTAL_IMPRESSIONS": "OPT_TOTAL_IMPRESSIONS",
    "VIEWABLE_IMPRESSIONS": "OPT_VIEWABLE_IMPRESSIONS",
    "MEASURABLE_IMPRESSIONS": "OPT_MEASURABLE_IMPRESSIONS",
    "AD_TYPE": "OPT_AD_TYPE",
    "AD_FORMAT": "OPT_AD_FORMAT",
    "AD_MEDIA_TYPE": "OPT_AD_MEDIA_TYPE",
    "LINE_ITEM_GROUP": "OPT_LINE_ITEM_GROUP",
    "LINE_ITEM_TYPE": "OPT_LINE_ITEM_TYPE",
    "MONETIZATION_CHANNEL": "OPT_MONETIZATION_CHANNEL",
    "OS_TYPE": "OPT_OS_TYPE",
    "DEVICE_TYPE": "OPT_DEVICE_TYPE",
}

for frame in (df_opt_copy, df_opt_copy_testing_data, df_opt_copy_training_data):
    frame.rename(columns=opt_column_names, inplace=True)



In [None]:
# Total actual revenue next to total predicted revenue for DF_ALL.
df_all_copy['TOTAL_REVENUE'].sum(), df_all_copy['PREDICTED_REVENUE'].sum()

In [None]:
# Total actual revenue next to the total predicted for the DF_OPT rows.
df_opt_copy['TOTAL_REVENUE'].sum(), df_opt_copy['OPT_REVENUE'].sum()

In [None]:
# Total actual revenue of DF_ALL on its own.
df_all_copy['TOTAL_REVENUE'].sum()

In [None]:
# R² between TOTAL_REVENUE and the model output stored in OPT_REVENUE.
# Note: this rebinds `r2`, which the evaluation loop earlier in the notebook
# also assigned.
r2 = r2_score(df_opt_copy['TOTAL_REVENUE'], df_opt_copy['OPT_REVENUE'])
r2

In [None]:
# df_snowflake = session.createDataFrame(df_all_copy.values.tolist(),
#         schema = df_all_copy.columns.tolist())

# df_snowflake.write.mode("overwrite").save_as_table("ME_DB.ME_AD_SALES_SCHEMA.FULL_OUTPUT")




# df_snowflake = session.createDataFrame(df_all_copy_training_data.values.tolist(),
#         schema = df_all_copy_training_data.columns.tolist())

# df_snowflake.write.mode("overwrite").save_as_table("ME_DB.ME_AD_SALES_SCHEMA.RAW_TABLE")


# df_snowflake = session.createDataFrame(df_all_copy_testing_data.values.tolist(),
#         schema = df_all_copy_testing_data.columns.tolist())

# df_snowflake.write.mode("overwrite").save_as_table("ME_DB.ME_AD_SALES_SCHEMA.FUTURE_PREDICTION_TABLE")

# df_snowflake = session.createDataFrame(df_opt_copy.values.tolist(),
#         schema = df_opt_copy.columns.tolist())

# df_snowflake.write.mode("overwrite").save_as_table("ME_DB.ME_AD_SALES_SCHEMA.OPT_FULL_OUTPUT")


# df_snowflake = session.createDataFrame(df_opt_copy_training_data.values.tolist(),
#         schema = df_opt_copy_training_data.columns.tolist())

# df_snowflake.write.mode("overwrite").save_as_table("ME_DB.ME_AD_SALES_SCHEMA.OPT_RAW_TABLE")

# df_snowflake = session.createDataFrame(df_opt_copy_testing_data.values.tolist(),
#         schema = df_opt_copy_testing_data.columns.tolist())

# df_snowflake.write.mode("overwrite").save_as_table("ME_DB.ME_AD_SALES_SCHEMA.FUTURE_OPT_TABLE")


In [None]:
# Row counts of the two date splits of the scored DF_ALL copy.
df_all_copy_training_data.shape[0], df_all_copy_testing_data.shape[0]

In [None]:
# The two splits together must account for every row of df_all_copy.
assert df_all_copy_training_data.shape[0] + df_all_copy_testing_data.shape[0] == df_all_copy.shape[0], \
    "The sum of training and testing data rows does not match the total number of rows in df_all_copy"

In [None]:
# Convert AD_DATE to its string form in every frame that is uploaded below.
for frame in (df_all_copy, df_all_copy_training_data, df_all_copy_testing_data,
              df_opt_copy, df_opt_copy_training_data, df_opt_copy_testing_data):
    frame['AD_DATE'] = frame['AD_DATE'].astype(str)

In [None]:
# Upload the six result frames; every call uses the same options
# (auto_create_table and overwrite both enabled).
write_options = dict(auto_create_table=True, overwrite=True)

session.write_pandas(df_all_copy, "FULL_OUTPUT", **write_options)
session.write_pandas(df_all_copy_training_data, "RAW_TABLE", **write_options)
session.write_pandas(df_all_copy_testing_data, "FUTURE_PREDICTION_TABLE", **write_options)

session.write_pandas(df_opt_copy, "OPT_FULL_OUTPUT", **write_options)
session.write_pandas(df_opt_copy_training_data, "OPT_RAW_TABLE", **write_options)
session.write_pandas(df_opt_copy_testing_data, "FUTURE_OPT_TABLE", **write_options)

In [None]:
# Local row count next to the row count stored in Snowflake. count() is
# evaluated by Snowflake, so the table is no longer downloaded into pandas just
# to measure its length.
df_opt_copy_testing_data.shape[0], session.table(['FUTURE_OPT_TABLE']).count()

In [None]:
# The two DF_OPT splits together must account for every row of df_opt_copy.
# (The failure message previously named df_all_copy, a copy-paste slip.)
assert df_opt_copy_training_data.shape[0] + df_opt_copy_testing_data.shape[0] == df_opt_copy.shape[0], \
    "The sum of training and testing data rows does not match the total number of rows in df_opt_copy"

In [None]:
assert df_opt_copy.shape[0] == session.table(['OPT_FULL_OUTPUT']).to_pandas().shape[0], "rows are not matching"

In [None]:
assert df_all_copy_training_data.shape[0] + df_all_copy_testing_data.shape[0] == df_all_copy.shape[0], \
    "The sum of training and testing data rows does not match the total number of rows in df_all_copy"

In [None]:
assert df_all_copy.shape[0] == session.table(['FULL_OUTPUT']).to_pandas().shape[0], "rows are not matching"

In [None]:
assert df_all_copy_training_data.shape[0] == session.table(['RAW_TABLE']).to_pandas().shape[0], "rows are not matching"

In [None]:
assert df_all_copy_testing_data.shape[0] == session.table(['FUTURE_PREDICTION_TABLE']).to_pandas().shape[0], "rows are not matching"

In [None]:
assert df_opt_copy_training_data.shape[0] == session.table(['OPT_RAW_TABLE']).to_pandas().shape[0], "rows are not matching"

In [None]:
assert df_opt_copy_testing_data.shape[0] == session.table(['FUTURE_OPT_TABLE']).to_pandas().shape[0], "rows are not matching"

In [None]:
assert df_opt_copy.shape[0] == session.table(['OPT_FULL_OUTPUT']).to_pandas().shape[0], "rows are not matching"

# Model Registration using fosforml SDK


In [None]:
# Imports for the model-registration section (most repeat imports made at the
# top of the notebook).
#Snowpark lib
from snowflake.snowpark import Session
# NOTE(review): fosforio is not explicitly installed by the setup cell, the
# earlier fosforio cells are marked "OLD code: DO NOT RUN", and nothing after
# this line uses `snowflake` — confirm this import is still needed.
from fosforio import snowflake
from sklearn.pipeline import Pipeline
from fosforml import *
from fosforml.constants import MLModelFlavours
import requests


In [None]:
@scoring_func
def score(model, request):
    """Score a request payload with the fitted model.

    Args:
        model: Fitted estimator exposing ``predict``.
        request: Object whose ``json`` attribute is a mapping with a
            ``"payload"`` entry. Only string payloads are handled; the string
            must be the textual form of a dict of feature columns that contains
            the key ``'TOTAL_IMPRESSIONS'``.

    Returns:
        * a single prediction when ``TOTAL_IMPRESSIONS`` is an ``int``
          (one record given as scalar values);
        * a list of predictions when it is a ``dict`` (the layout produced by
          ``DataFrame.to_dict()``) or a ``list`` (column -> list of values);
        * the string ``"This method is not allowed"`` for any other payload.
    """
    payload = request.json["payload"]
    if isinstance(payload, str):
        # SECURITY(review): eval() executes arbitrary code contained in the
        # request payload. It is kept so that existing callers, which send
        # Python-literal dict strings, keep working; restrict this endpoint to
        # trusted callers or move to a safe parser (ast.literal_eval /
        # json.loads) once the payload format is agreed.
        payload_data = eval(payload)
        impressions = payload_data['TOTAL_IMPRESSIONS']

        if isinstance(impressions, int):
            # One record given as scalars -> one-row frame -> one prediction.
            data = pd.DataFrame([payload_data])
            prediction = pd.DataFrame(model.predict(data))
            return prediction[0].to_list()[0]

        if isinstance(impressions, (dict, list)):
            # Several records given column-wise.
            data = pd.DataFrame(payload_data)
            prediction = pd.DataFrame(model.predict(data))
            # Column 0 holds the predictions. The list branch used to call
            # DataFrame.tolist(), which does not exist and raised AttributeError.
            return prediction[0].tolist()

    return "This method is not allowed"

In [None]:
# Local smoke test of score(): two hold-out rows (positions 1 and 2) are sent
# as the string form of DataFrame.to_dict().
import requests

payload = str(X_test.iloc[1:3].to_dict())
# requests.Request is used here only as an object to hang a `json` attribute
# on; nothing in this cell sends it over the network.
req = requests.Request()
req.json = {"payload": payload}
print(score(best_estimator, req))

In [None]:
# Show the request body that was passed to score().
req.json

In [None]:
## registering the model in Fosfor.
# Registers the tuned pipeline together with the `score` function, the hold-out
# targets/predictions and the train/test splits.
# `y_pred_test` is the variable left bound by the evaluation loop earlier in the
# notebook.
# NOTE(review): init_script contains literal backslash-n sequences ("\\n") rather
# than newline characters — confirm this is the form the platform expects.
# NOTE(review): init_script pins fosforml==1.0.1 while the setup cell installs
# fosforml==1.1.6, and it lists holidays==0.9.9 twice — confirm the pins.
model_reg = register_model(best_estimator,
               score, 
               name="Ad_Sales_Prediction_Model", 
               description="Ad_Sales_Prediction_RandomForest_Model",
               flavour=MLModelFlavours.sklearn,
               model_type="regression",
               init_script="\\n pip install scikit-learn==1.5.1 --no-deps\\n pip install joblib==1.4.2\\n pip install scipy==1.13.1\\n pip install threadpoolctl==3.5.0\\n pip install fosforml==1.0.1\\n pip install fosforio==1.0.1 --no-deps\\n pip install holidays==0.9.9\\n pip install pandas==2.2.2 --no-deps\\n pip install holidays==0.9.9\\n pip install python-dateutil==2.9.0\\n pip install pytz==2024.1\\n pip install six==1.16.0\\n pip install tzdata==2024.1\\n pip install numpy==1.26.4",
               y_true=y_test,
               y_pred=y_pred_test,
               #prob=y_prob,
               features=X_train.columns,
               input_type="json", 
               explain_ai=True,
               x_train=X_train, 
               x_test=X_test, 
               y_train=y_train,
               y_test=y_test,
               feature_names=X_train.columns.tolist(),
               original_features=X_train.columns.tolist(),
               feature_ids=X_train.columns,
               kyd=True, kyd_score = True)