In [2]:
import json
import boto3
import pickle
from io import BytesIO
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error
from sagemaker import get_execution_role
import sagemaker

In [3]:
sm_boto3 = boto3.client("sagemaker")

sess = sagemaker.Session()

region = sess.boto_session.region_name

bucket = sess.default_bucket()  # this could also be a hard-coded bucket name

print("Using bucket " + bucket)

Using bucket sagemaker-us-east-1-132067349151


In [4]:
region

'us-east-1'

In [5]:
RANDOM_STATE = 100

In [6]:
# Run this cell to import or install the Data Wrangler widget to show automatic visualization and generate code to fix data quality issues

try:
    import sagemaker_datawrangler
except ImportError:
    !pip install --upgrade sagemaker-datawrangler
    import sagemaker_datawrangler

# Display Pandas DataFrame to view the widget: df, display(df), df.sample()... 

In [7]:
source_bucket = "greenstoragetest"
data_key = "df.csv"
data_location = 's3://{}/{}'.format(source_bucket, data_key)
df = pd.read_csv(data_location, index_col = 0)

In [8]:
df.describe()

                  id          cycle             IR             QC  \
count  100501.000000  100501.000000  100501.000000  100501.000000   
mean       62.377051     490.156615       0.016629       1.031048   
std        40.362826     388.039203       0.001370       0.056753   
min         0.000000       1.000000       0.000000       0.000000   
25%        23.000000     203.000000       0.015571       1.014073   
50%        66.000000     409.000000       0.016601       1.048886   
75%        99.000000     675.000000       0.017355       1.067319   
max       123.000000    2236.000000       0.024405       2.965895   

                  QD           Tavg           Tmin           Tmax  \
count  100501.000000  100501.000000  100501.000000  100501.000000   
mean        1.030881      34.069687      31.212060      37.791954   
std         0.056205       2.012331       1.633117       2.647476   
min         0.000000       0.000000       0.000000       0.000000   
25%         1.013984      32.8351

In [9]:
def preprocess_data(df):
    df = df.dropna()
    scaler = StandardScaler()
    scaler.fit(df)
    scaler.transform(df)
    return df
    
def regression_process(df, train = False):
    irdf = []
    qdf = []
    time = []
    ir=[]
    Q = []
    T = []
    new_df = pd.DataFrame()
    if train:
        y = []
    for bat in df['id'].unique():
        ir10 = df.loc[df['id'] == bat][df['cycle']==10]['IR']
        ir100 = df.loc[df['id'] == bat][df['cycle']==100]['IR']
        dif = float(ir100) - float(ir10)
        irdf.append(dif)
        q10 = df.loc[df['id'] == bat][df['cycle']==10]['QD']
        q100 = df.loc[df['id'] == bat][df['cycle']==100]['QD']
        dif = float(q100) - float(q10)
        qdf.append(dif)
        t10 = df.loc[df['id'] == bat][df['cycle']==10]['chargetime']
        t100 = df.loc[df['id'] == bat][df['cycle']==100]['chargetime']
        dif = float(t100)- float(t10)
        time.append(dif)
        ir.append(float(df.loc[df['id'] == bat][df['cycle']==100]['IR']))
        Q.append(float(df.loc[df['id'] == bat][df['cycle']==100]['QD']))
        T.append(float(df.loc[df['id'] == bat][df['cycle']==100]['Tmax']))
        if train:
            y.append(int(df.loc[df['id'] == bat][df['cycle'] == 10]['cycle_life']))
    new_df['id'] = df['id'].unique()
    new_df['IR_var'] = irdf
    new_df['QD_var'] = qdf
    new_df['time'] = time
    new_df['ir'] = ir
    new_df['Q'] = Q
    new_df['T'] = T
    if train:
        return new_df, y
    return new_df

def classify(df):
    df = df.copy()
    df['percent'] = df['cycle']/df['cycle_life']
    df.loc[df['percent'] < 0.3, 'condition'] = '2'
    df.loc[df['percent'] >= 0.3, 'condition'] = '1'
    df.loc[df['percent'] >= 0.7, 'condition'] = '0'
    del df['percent']
    X = df.copy()
    y = df['condition']
    del X['condition']
    clf = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(32, 64), random_state=RANDOM_STATE, max_iter=300, early_stopping = False, validation_fraction=0.3)
    clf.fit(X,y)
    print("NN Classification score:")
    print(clf.score(X,y))
    return clf

def rf_classify(df):
    df = df.copy()
    df['percent'] = df['cycle']/df['cycle_life']
    df.loc[df['percent'] < 0.3, 'condition'] = '2'
    df.loc[df['percent'] >= 0.3, 'condition'] = '1'
    df.loc[df['percent'] >= 0.7, 'condition'] = '0'
    del df['percent']
    X = df.copy()
    y = df['condition']
    del X['condition']
    clf = RandomForestClassifier(max_depth = 10, random_state = RANDOM_STATE)
    clf.fit(X,y)
    print("RF Classification score:")
    print(clf.score(X,y))
    return clf


def random_regressor(X_train, X_test, y_train, y_test):
    clf = RandomForestRegressor(max_depth = 10, random_state = RANDOM_STATE)
    clf.fit(X_train, y_train)
    print(clf.score(X_train, y_train))
    
    ypred = clf.predict(X_train)
    mse = mean_squared_error(y_train, ypred)

    print("The metrics on train set:")
    print("MSE: ", mse)
    print("RMSE: ", mse**(1/2.0))

    ypred = clf.predict(X_test)
    mse = mean_squared_error(y_test, ypred)
    print("The metrics on test set:")
    print("MSE: ", mse)
    print("RMSE: ", mse**(1/2.0))
    return clf

def linear_regressor(X_train, X_test, y_train, y_test): 
    clf = LinearRegression()
    clf.fit(X_train, y_train)
    print(clf.score(X_train, y_train))
    
    ypred = clf.predict(X_train)
    mse = mean_squared_error(y_train, ypred)

    print("The metrics on train set:")
    print("MSE: ", mse)
    print("RMSE: ", mse**(1/2.0))

    ypred = clf.predict(X_test)
    mse = mean_squared_error(y_test, ypred)
    print("The metrics on test set:")
    print("MSE: ", mse)
    print("RMSE: ", mse**(1/2.0))
    return clf

In [10]:
#df = preprocess_data(df)
#X, y = regression_process(df, train = True)
X = df.copy()
y = df['cycle_life']
del X['cycle_life']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = RANDOM_STATE)


In [11]:
X_train

        id  cycle        IR        QC        QD       Tavg       Tmin  \
65801   90  817.0  0.015672  1.046837  1.046945  33.412024  31.691275   
66952   91  133.0  0.015251  1.065956  1.065956  33.217787  31.281336   
17170   14   53.0  0.019886  1.068417  1.067529  32.159572  29.959637   
8284     4  606.0  0.016995  1.063003  1.063165  31.584512  29.561604   
49721   65  288.0  0.017400  1.063145  1.063460  34.190475  30.533981   
...    ...    ...       ...       ...       ...        ...        ...   
65615   90  631.0  0.015480  1.054487  1.054618  34.221615  32.559619   
77655  101  649.0  0.015042  1.038318  1.038507  37.413149  34.230814   
79683  103  378.0  0.015261  1.053189  1.053685  34.489628  32.033016   
56088   77  446.0  0.022267  0.843200  0.840406  34.987186  30.550098   
38408   43  179.0  0.017738  1.064017  1.064562  32.875427  29.087925   

            Tmax  chargetime  
65801  35.193736   11.039118  
66952  35.466999   10.044068  
17170  35.576157    9.875475  

In [12]:
X_train["target"] = y_train
X_test["target"] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [13]:
rf_reg = random_regressor(X_train, X_test, y_train, y_test)

0.999999988654558
The metrics on train set:
MSE:  0.002472412562492243
RMSE:  0.04972336032985143
The metrics on test set:
MSE:  0.0022477089841859694
RMSE:  0.04741000932488802


In [14]:
X_train.to_csv("train.csv")
X_test.to_csv("test.csv")

In [15]:
trainpath = sess.upload_data(
    path="train.csv", bucket=bucket, key_prefix="sagemaker/sklearncontainer"
)

testpath = sess.upload_data(
    path="test.csv", bucket=bucket, key_prefix="sagemaker/sklearncontainer"
)

In [16]:
%%writefile script.py

import argparse
import joblib
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor


# inference functions ---------------
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf


if __name__ == "__main__":

    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters
    parser.add_argument("--n-estimators", type=int, default=10)
    parser.add_argument("--min-samples-leaf", type=int, default=3)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train.csv")
    parser.add_argument("--test-file", type=str, default="test.csv")
    parser.add_argument(
        "--features", type=str
    )  # in this script we ask user to explicitly name features
    parser.add_argument(
        "--target", type=str
    )  # in this script we ask user to explicitly name the target

    args, _ = parser.parse_known_args()

    print("reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("building training and testing datasets")
    X_train = train_df[args.features.split()]
    X_test = test_df[args.features.split()]
    y_train = train_df[args.target]
    y_test = test_df[args.target]

    # train
    print("training model")
    model = RandomForestRegressor(
        n_estimators=args.n_estimators, min_samples_leaf=args.min_samples_leaf, n_jobs=-1
    )

    model.fit(X_train, y_train)

    # print abs error
    print("validating model")
    abs_err = np.abs(model.predict(X_test) - y_test)

    # print couple perf metrics
    for q in [10, 50, 90]:
        print("AE-at-" + str(q) + "th-percentile: " + str(np.percentile(a=abs_err, q=q)))

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("model persisted at " + path)
    print(args.min_samples_leaf)

Overwriting script.py


In [17]:
X_train

        id  cycle        IR        QC        QD       Tavg       Tmin  \
65801   90  817.0  0.015672  1.046837  1.046945  33.412024  31.691275   
66952   91  133.0  0.015251  1.065956  1.065956  33.217787  31.281336   
17170   14   53.0  0.019886  1.068417  1.067529  32.159572  29.959637   
8284     4  606.0  0.016995  1.063003  1.063165  31.584512  29.561604   
49721   65  288.0  0.017400  1.063145  1.063460  34.190475  30.533981   
...    ...    ...       ...       ...       ...        ...        ...   
65615   90  631.0  0.015480  1.054487  1.054618  34.221615  32.559619   
77655  101  649.0  0.015042  1.038318  1.038507  37.413149  34.230814   
79683  103  378.0  0.015261  1.053189  1.053685  34.489628  32.033016   
56088   77  446.0  0.022267  0.843200  0.840406  34.987186  30.550098   
38408   43  179.0  0.017738  1.064017  1.064562  32.875427  29.087925   

            Tmax  chargetime  target  
65801  35.193736   11.039118  1836.0  
66952  35.466999   10.044068   828.0  
17170 

In [32]:
! python script.py --n-estimators 100 \
                   --min-samples-leaf 2 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \
                   --features 'cycle IR QC QD Tavg Tmin Tmax chargetime' \
                   --target target
#'IR_var QD_var time ir Q T' 

extracting arguments
reading data
building training and testing datasets
training model
validating model
AE-at-10th-percentile: 0.0
AE-at-50th-percentile: 0.7705000000000268
AE-at-90th-percentile: 12.495000000000005
model persisted at ./model.joblib
2


In [18]:
import joblib

In [19]:
loaded_model = joblib.load('model.joblib')

In [20]:
X_train.iloc[100]

id               3.000000
cycle         1253.000000
IR               0.016887
QC               0.956192
QD               0.955989
Tavg            29.518670
Tmin            28.165857
Tmax            30.773926
chargetime      12.321622
target        1434.000000
Name: 7498, dtype: float64

In [21]:
temp = X_train.iloc[100].copy()
del temp['target']
del temp['id']
y = X_train['target']
del X_train['target']
del X_train['id']

In [27]:
dictionary

{'result': array([1434.]),
 'eval': 0.9988171244181822,
 'condition': array([0.87377964]),
 'tmax': 30.773926,
 'tmin': 28.165857,
 'tavg': 29.51867037119673}

In [30]:
result = loaded_model.predict([temp])
evaluation = loaded_model.score(X_train,y)
condition = temp['cycle']/result
tmax = temp['Tmax']
tmin = temp['Tmin']
tavg = temp['Tavg']
dictionary = {
    "result": result[0],
    "eval" : evaluation,
    "condition" : condition[0],
    "tmax" : tmax,
    "tmin" : tmin,
    "tavg" : tavg
    "cycle" : temp['cycle']
}
json_object = json.dumps(dictionary)
with open("prediction_soh.json", "w") as outfile:
    outfile.write(json_object)
sess.upload_data(
    path="prediction_soh.json", bucket="greenstoragetest", key_prefix="sagemaker/sklearncontainer"
)

's3://greenstoragetest/sagemaker/sklearncontainer/prediction_soh.json'

In [17]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m4.xlarge",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="rf-scikit",
    metric_definitions=[{"Name": "median-AE", "Regex": "AE-at-50th-percentile: ([0-9.]+).*$"}],
    hyperparameters={
        "n-estimators": 100,
        "min-samples-leaf": 3,
        "features": "IR_var QD_var time ir Q T",
        "target": "target",
    },
)

In [18]:
trainpath

's3://sagemaker-us-east-1-132067349151/sagemaker/sklearncontainer/train.csv'

In [19]:
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

INFO:sagemaker:Creating training-job with name: rf-scikit-2023-05-06-01-19-25-605


2023-05-06 01:19:26 Starting - Starting the training job...
2023-05-06 01:20:01 Starting - Preparing the instances for training.........
2023-05-06 01:21:26 Downloading - Downloading input data...
2023-05-06 01:21:56 Training - Downloading the training image.....[34m2023-05-06 01:22:45,317 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2023-05-06 01:22:45,321 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-05-06 01:22:45,365 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2023-05-06 01:22:45,556 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-05-06 01:22:45,570 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-05-06 01:22:45,583 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-05-06 01:22:45,594 sagemaker-train

In [20]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2023-05-06 01:23:04 Starting - Preparing the instances for training
2023-05-06 01:23:04 Downloading - Downloading input data
2023-05-06 01:23:04 Training - Training image download completed. Training in progress.
2023-05-06 01:23:04 Uploading - Uploading generated training model
2023-05-06 01:23:04 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-132067349151/rf-scikit-2023-05-06-01-19-25-605/output/model.tar.gz


In [21]:
from sagemaker.sklearn.model import SKLearnModel

model = SKLearnModel(
    model_data=artifact,
    role=get_execution_role(),
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [26]:
predictor = model.deploy(instance_type="ml.m4.xlarge", initial_instance_count=1)

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2023-05-06-01-35-33-248
INFO:sagemaker:Creating endpoint-config with name sagemaker-scikit-learn-2023-05-06-01-35-33-945
INFO:sagemaker:Creating endpoint with name sagemaker-scikit-learn-2023-05-06-01-35-33-945


------!

In [27]:
sm_boto3.delete_endpoint(EndpointName=predictor.endpoint)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


{'ResponseMetadata': {'RequestId': '1bcfd0ec-1af1-4440-9d36-05a3eb1571cd',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1bcfd0ec-1af1-4440-9d36-05a3eb1571cd',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sat, 06 May 2023 01:48:10 GMT'},
  'RetryAttempts': 0}}