In [19]:
import pickle

import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [20]:
from sklearn.pipeline import make_pipeline

In [21]:
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("green-taxi-duration")


<Experiment: artifact_location='s3://mlflow-artifacts-remote-amogh/10', creation_time=1687412973867, experiment_id='10', last_update_time=1687412973867, lifecycle_stage='active', name='green-taxi-duration', tags={}>

In [7]:
print(mlflow.environment_variables.MLFLOW_S3_ENDPOINT_URL)

s3://mlflow-artifacts-remote-amogh/


In [6]:
mlflow.environment_variables.MLFLOW_S3_ENDPOINT_URL = 's3://mlflow-artifacts-remote-amogh/'

In [8]:
def read_dataframe(filename: str):
    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.dt.total_seconds() / 60
    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df


def prepare_dictionaries(df: pd.DataFrame):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    dicts = df[categorical + numerical].to_dict(orient='records')
    return dicts

In [9]:
df_train = read_dataframe('../module-3/data/green_tripdata_2022-01.parquet')
df_val = read_dataframe('../module-3/data/green_tripdata_2022-02.parquet')

target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

dict_train = prepare_dictionaries(df_train)
dict_val = prepare_dictionaries(df_val)

In [22]:
df_train.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration,PU_DO
0,2,2022-01-01 00:14:21,2022-01-01 00:15:33,N,1.0,42,42,1.0,0.44,3.5,...,0.0,0.0,,0.3,4.8,2.0,1.0,0.0,1.2,42_42
1,1,2022-01-01 00:20:55,2022-01-01 00:29:38,N,1.0,116,41,1.0,2.1,9.5,...,0.0,0.0,,0.3,10.8,2.0,1.0,0.0,8.716667,116_41
2,1,2022-01-01 00:57:02,2022-01-01 01:13:14,N,1.0,41,140,1.0,3.7,14.5,...,4.6,0.0,,0.3,23.15,1.0,1.0,2.75,16.2,41_140
3,2,2022-01-01 00:07:42,2022-01-01 00:15:57,N,1.0,181,181,1.0,1.69,8.0,...,0.0,0.0,,0.3,9.3,2.0,1.0,0.0,8.25,181_181
4,2,2022-01-01 00:07:50,2022-01-01 00:28:52,N,1.0,33,170,1.0,6.26,22.0,...,5.21,0.0,,0.3,31.26,1.0,1.0,2.75,21.033333,33_170


In [91]:
mlflow.environment_variables.MLFLOW_S3_UPLOAD_EXTRA_ARGS='{"AWS_ACCESS_KEY_ID": "AKIA26DG3QS4VH26HJOQ", "C7OK5VNnQtLSXJSVZxd091Kq63M0lRL6Ue5I1Rhy"}'

In [92]:
print(mlflow.environment_variables.MLFLOW_S3_UPLOAD_EXTRA_ARGS)

{"AWS_ACCESS_KEY_ID": "AKIA26DG3QS4VH26HJOQ", "C7OK5VNnQtLSXJSVZxd091Kq63M0lRL6Ue5I1Rhy"}


In [23]:
with mlflow.start_run():
    params = dict(max_depth=20, n_estimators=100, min_samples_leaf=10, random_state=0)
    mlflow.log_params(params)

    pipeline = make_pipeline(
        DictVectorizer(),
        RandomForestRegressor(**params, n_jobs=-1)
    )

    pipeline.fit(dict_train, y_train)
    y_pred = pipeline.predict(dict_val)

    rmse = mean_squared_error(y_pred, y_val, squared=False)
    print(params, rmse)
    mlflow.log_metric('rmse', rmse)

    mlflow.sklearn.log_model(pipeline, artifact_path="module-4/")

{'max_depth': 20, 'n_estimators': 100, 'min_samples_leaf': 10, 'random_state': 0} 6.101201727016495


In [17]:
mlflow.end_run()

In [10]:
from mlflow.tracking import MlflowClient


In [20]:
MLFLOW_TRACKING_URI = 'http://127.0.0.1:5000'
RUN_ID = 'b4d3bca8aa8e46a6b8257fe4541b1136'

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [21]:
path = client.download_artifacts(run_id=RUN_ID, path='dict_vectorizer.bin')

In [22]:
with open(path, 'rb') as f_out:
    dv = pickle.load(f_out)

In [23]:
dv

DictVectorizer()

In [25]:
!echo $AWS_ACCESS_KEY_ID




In [10]:
import os

In [14]:
print(os.getenv('AWS_PROFILE'))

default


In [49]:
os.environ['AWS_ACCESS_KEY_ID'] = 'AKIA26DG3QS4VH26HJOQ'

In [50]:
os.environ['AWS_SECRET_ACCESS_KEY'] = 'C7OK5VNnQtLSXJSVZxd091Kq63M0lRL6Ue5I1Rhy'

In [12]:
os.environ['AWS_PROFILE'] = 'default'

In [15]:
mlflow.get_artifact_uri()

's3://mlflow-artifacts-remote-amogh/10/22c678d7ae004760a0a748f291368476/artifacts'

In [52]:
print(os.environ['AWS_ACCESS_KEY_ID'])

C7OK5VNnQtLSXJSVZxd091Kq63M0lRL6Ue5I1Rhy


zsh:1: /usr/local/bin/aws: bad interpreter: /usr/local/opt/python/bin/python3.7: no such file or directory


In [55]:
!pwd

/Users/amoghkulkarni/Documents/RS/machine-learning-zoomcamp/my-github-repo/ml-ops-zoomcamp/module-2


In [48]:
import boto3

In [84]:
client = boto3.client(
    "s3"
)

In [85]:
client.upload_file(Filename='../module-4/web-service/predict.py', Bucket='mlflow-artifacts-remote-amogh', Key='model-artifacts/')

S3UploadFailedError: Failed to upload ../module-4/web-service/predict.py to mlflow-artifacts-remote-amogh/model-artifacts/: An error occurred (InvalidAccessKeyId) when calling the PutObject operation: The AWS Access Key Id you provided does not exist in our records.

In [28]:
def count_numbers_in_array(a, k):
    number_count_dict = {}
    key_match_list = []
    for idx, each_num in enumerate(a):
        string_key = str(each_num)
        if string_key in number_count_dict:
            number_count_dict[string_key]['count'] += 1
            number_count_dict[string_key]['indices'].append(idx)
        else:

            number_count_dict.update({
            string_key: {
                    'count': 1,
                    'indices': [idx]
                }
            })
    
    print(number_count_dict)
    for each_key in number_count_dict:
        if number_count_dict[each_key]['count'] >= k:
            key_match_list.append(each_key)
    
    first_occurrence = -1
    if key_match_list:
        max_occurence_idx = max(number_count_dict[key_match_list[0]]['indices'][:k])
        first_occurrence = key_match_list[0]
        print(f'max_occurence_idx = {max_occurence_idx}, first_occurrence = {first_occurrence}')

        for each_num in key_match_list:
            if max_occurence_idx > max(number_count_dict[each_num]['indices'][:k]):
                first_occurrence = each_num
                max_occurence_idx = max(number_count_dict[each_num]['indices'][:k])
                print(f'max_occurence_idx = {max_occurence_idx}, first_occurrence = {first_occurrence}')
    
    return int(first_occurrence)

In [29]:
a = [3, 2, 2, 1, 4, 1, 4, 2, 1, 3, 4]
count_numbers_in_array(a, 2)

{'3': {'count': 2, 'indices': [0, 9]}, '2': {'count': 3, 'indices': [1, 2, 7]}, '1': {'count': 3, 'indices': [3, 5, 8]}, '4': {'count': 3, 'indices': [4, 6, 10]}}
max_occurence_idx = 9, first_occurrence = 3
max_occurence_idx = 2, first_occurrence = 2


2

In [30]:
a = [3, 2, 2, 1, 4, 1, 4, 2, 1, 3, 4]

In [33]:
ar={}
for i in a:
    ar[i]=ar.get(i,0)+1
    if ar[i]==2:
        print(i)
print(-1) 

2
1
4
3
-1


In [34]:
def find_matching_number(a, k):
    count_map = {}
    for num in a:
        if num in count_map:
            count_map[num] += 1
        else:
            count_map[num] = 1

        if count_map[num] >= k:
            return num

    return -1

In [35]:
a = [3, 2, 2, 1, 4, 1, 4, 2, 1, 3, 4]
find_matching_number(a, 2)

2

In [36]:
a = [3, 1, 4, 1, 4, 2, 1, 3, 4]
find_matching_number(a, 2)

1