In [1]:
# import libraries
import os 

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from helpers import *
from minio import Minio

In [2]:
# local data directory: the "data" folder inside the parent of the current working directory
data_path = os.path.abspath(os.pardir) + "/data/"
data_path

'/home/abdoss/mlops-project-nba--investment/data/'

In [3]:
# initialize the MinIO client.
# The access/secret keys are read from the environment (MINIO_ACCESS_KEY /
# MINIO_SECRET_KEY) so that no credentials are stored in the notebook; the
# endpoint can be overridden with MINIO_ENDPOINT and otherwise keeps its
# previous value.
minio_client = Minio(
    os.environ.get("MINIO_ENDPOINT", "20.224.70.229:9000"),
    access_key=os.environ["MINIO_ACCESS_KEY"],  # a KeyError here means the variable is not set
    secret_key=os.environ["MINIO_SECRET_KEY"],
    secure=False,  # unchanged from before: connect without TLS
)

# list all buckets
buckets = minio_client.list_buckets()
buckets

[Bucket('nba-investment-data')]

In [4]:
# upload the raw dataset to the bucket (disabled; uncomment the line below to run it)
#minio_client.fput_object("nba-investment-data", "nba_logreg_raw.csv", data_path + "nba_logreg_raw.csv")

In [5]:
# read the raw dataset from the minio bucket; if that fails (e.g. the service
# is down) fall back to the copy on the local file system
try:
    df = read_data_from_minio(minio_client, "nba-investment-data", "nba_logreg_raw.csv")
except Exception as exc:
    # the fallback is deliberately broad, but report why it was taken so a
    # failed bucket read is not silently hidden
    print(f"Could not read 'nba_logreg_raw.csv' from minio ({exc!r}); using the local file instead.")
    df = read_data(f"{data_path}nba_logreg_raw.csv")
    #upload_file_to_minio(minio_client, f"{data_path}nba_logreg_raw.csv", "nba-investment-data", "nba_logreg_raw.csv")

# show the first rows whichever source was used (a bare expression inside the
# except body is never rendered by Jupyter, so display after the try block)
display(df.head())

Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,Brandon Ingram,36.0,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,...,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0
1,Andrew Harrison,35.0,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,...,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0
2,JaKarr Sampson,74.0,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,...,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0
3,Malik Sealy,58.0,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,...,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1
4,Matt Geiger,48.0,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,...,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1


In [6]:
# inspect the dtype of every column of df
df.dtypes

Name            object
GP             float64
MIN            float64
PTS            float64
FGM            float64
FGA            float64
FG%            float64
3P Made        float64
3PA            float64
3P%            float64
FTM            float64
FTA            float64
FT%            float64
OREB           float64
DREB           float64
REB            float64
AST            float64
STL            float64
BLK            float64
TOV            float64
TARGET_5Yrs      int64
dtype: object

In [7]:
# dropping the name column.
# The result is assigned back (instead of inplace=True) and a missing column is
# ignored, so re-running this cell does not raise a KeyError.
df = df.drop(columns=["Name"], errors="ignore")

In [8]:
# count the missing entries in each column
df.isna().sum()

GP              0
MIN             0
PTS             0
FGM             0
FGA             0
FG%             0
3P Made         0
3PA             0
3P%            11
FTM             0
FTA             0
FT%             0
OREB            0
DREB            0
REB             0
AST             0
STL             0
BLK             0
TOV             0
TARGET_5Yrs     0
dtype: int64

In [9]:
# replacing the missing values with the mean of the column "3P%".
# The filled column is assigned back to df: calling fillna(..., inplace=True)
# on a selected column is chained assignment, which pandas warns about and
# which leaves df unchanged under Copy-on-Write.
# NOTE(review): the mean is computed over the full dataset, before the
# train/test split further down — confirm this is intended.
df["3P%"] = df["3P%"].fillna(df["3P%"].mean())

In [10]:
# write the preprocessed frame to a local csv file, then upload that file to the minio bucket
preprocessed_file = "nba_logreg_preprocessed.csv"
df.to_csv(data_path + preprocessed_file, index=False)
upload_file_to_minio(
    minio_client,
    data_path + preprocessed_file,
    "nba-investment-data",
    preprocessed_file,
)

True

In [11]:
# train/test split stratified on TARGET_5Yrs: 20% of the rows go to the test set, with a fixed seed
train_set, test_set = train_test_split(
    df,
    stratify=df["TARGET_5Yrs"],
    random_state=42,
    test_size=0.2,
)

In [12]:
# file names shared by the local copies and the bucket objects
train_file = "nba_logreg_processed_train.csv"
test_file = "nba_logreg_processed_test.csv"

# write both splits to local csv files
train_set.to_csv(data_path + train_file, index=False)
test_set.to_csv(data_path + test_file, index=False)

# upload both local files to the minio bucket
upload_file_to_minio(minio_client, data_path + train_file, "nba-investment-data", train_file)
upload_file_to_minio(minio_client, data_path + test_file, "nba-investment-data", test_file)

True

In [13]:
# show the contents of the "nba-investment-data" bucket (helper from helpers.py;
# presumably prints one object name per line — verify against the helper)
view_files_in_bucket(minio_client, "nba-investment-data")

nba_logreg_preprocessed.csv
nba_logreg_processed_test.csv
nba_logreg_processed_train.csv
nba_logreg_raw.csv
