In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sagemaker
import boto3

# Data Ingestion

In [None]:
# AWS / SageMaker session setup. The region is pinned explicitly instead of
# being taken from the local AWS profile.
boto3_session = boto3.Session(region_name='us-east-1')
sm_session = sagemaker.Session(boto_session=boto3_session)
region = sm_session.boto_session.region_name
# S3 bucket that receives the train/test CSVs uploaded later in this notebook.
# NOTE(review): presumably the bucket must already exist in this account - confirm.
bucket = 'bucket-aws-classification'

In [25]:
# Load the raw training CSV (path is relative to the notebook's directory).
data = pd.read_csv('../Datasets/Mobile_Price_Classification_Train.csv')

In [26]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [27]:
# (rows, columns) of the raw data.
data.shape

(2000, 21)

In [28]:
# Column names; the cells below treat the last column as the target.
data.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [29]:
# Class balance of the target, as proportions of all rows.
data['price_range'].value_counts(normalize=True)

price_range
1    0.25
2    0.25
3    0.25
0    0.25
Name: proportion, dtype: float64

In [30]:
# Percentage of missing values per column.
data.isnull().mean() * 100

battery_power    0.0
blue             0.0
clock_speed      0.0
dual_sim         0.0
fc               0.0
four_g           0.0
int_memory       0.0
m_dep            0.0
mobile_wt        0.0
n_cores          0.0
pc               0.0
px_height        0.0
px_width         0.0
ram              0.0
sc_h             0.0
sc_w             0.0
talk_time        0.0
three_g          0.0
touch_screen     0.0
wifi             0.0
price_range      0.0
dtype: float64

In [31]:
# All column names, target included (it is separated out in the next step).
features = list(data.columns)
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [32]:
# Derive the target and the feature list directly from `data` so this cell is
# idempotent: a bare `features.pop()` removes one more (real) feature every time
# the cell is re-run. The target is the last column of the frame.
label = data.columns[-1]
features = [col for col in data.columns if col != label]
label

'price_range'

In [33]:
# Feature matrix (columns listed in `features`) and target vector.
X = data[features]
y = data[label]

In [34]:
# Preview the feature matrix.
X.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [35]:
# Preview the target vector.
y.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [36]:
# Absolute number of rows per target class.
y.value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

## Data Splitting 

In [37]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs. The split is not stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [38]:
# Report the dimensions of every piece of the split.
for split_name, split in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'{split_name}.shape: ', split.shape)



X_train.shape:  (1600, 20)
y_train.shape:  (1600,)
X_test.shape:  (400, 20)
y_test.shape:  (400,)


In [40]:
# Re-attach the target as the last column of each split so every CSV is
# self-contained (script.py takes the last column as the label).
# assign() always returns a new frame aligned on the index, so X_train / X_test
# are never aliased or mutated and the cell can be re-run safely.
# pd.DataFrame(X_train) does not necessarily copy, so inserting a column into
# the wrapper could leak into the original split on some pandas versions.
train_X = X_train.assign(**{label: y_train})

test_X = X_test.assign(**{label: y_test})


In [43]:
# Each frame should now have one more column (the label) than its feature split.
for labelled_split in (train_X, test_X):
    print(labelled_split.shape)

(1600, 21)
(400, 21)


In [44]:
# Preview the training frame (features plus label column).
train_X.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
582,1232,0,2.9,1,1,1,24,0.3,169,5,...,361,809,1257,16,10,16,1,0,0,0
159,1840,0,0.5,1,12,0,34,0.7,142,1,...,311,1545,1078,8,0,10,0,0,0,1
1827,1692,0,2.1,0,4,1,2,0.9,106,1,...,1899,1904,3779,9,3,7,1,1,1,3
318,508,0,0.8,0,7,1,42,0.3,94,1,...,39,557,663,13,12,7,1,0,0,0
708,977,1,2.8,1,2,0,35,0.6,165,2,...,1502,1862,3714,19,3,10,0,1,1,3


In [45]:
# Count of missing values per column in the training frame.
train_X.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [46]:
# Count of missing values per column in the test frame.
test_X.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

## Save Splits

In [None]:
# Persist both splits locally. index=False keeps the row index out of the file,
# so the label stays the last column, which is what script.py relies on when it
# pops the final column name as the target.
train_X.to_csv('../Datasets/Mobile_Price_Classification_Train_V1.csv', index=False)
test_X.to_csv('../Datasets/Mobile_Price_Classification_Test_V1.csv', index=False)

## Save data to S3

In [49]:
# Key prefix (folder) inside the bucket under which both CSVs are stored.
sk_prefix = "datasets"

# Upload the training split; the returned value is printed in the next cell.
trainpath = sm_session.upload_data(
    path='../Datasets/Mobile_Price_Classification_Train_V1.csv', 
    bucket=bucket, 
    key_prefix=sk_prefix
)

# Upload the test split to the same bucket and prefix.
testpath = sm_session.upload_data(
    path='../Datasets/Mobile_Price_Classification_Test_V1.csv', 
    bucket=bucket, 
    key_prefix=sk_prefix
)

In [50]:
# Show where the two uploads landed.
print(trainpath)
print(testpath)

s3://bucket-aws-classification/datasets/Mobile_Price_Classification_Train_V1.csv
s3://bucket-aws-classification/datasets/Mobile_Price_Classification_Test_V1.csv


In [53]:
%%writefile ../script.py

import argparse
import os
import pathlib
from io import StringIO

import boto3
import joblib
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score


def model_fn(model_dir):
    """Load and return the model stored as ``model.joblib`` inside ``model_dir``.

    This is the same file name the training section of this script writes.
    NOTE(review): presumably called by the SageMaker scikit-learn serving
    container at endpoint start-up - confirm against the deployment setup.

    Args:
        model_dir: Directory that contains ``model.joblib``.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    model_file = os.path.join(model_dir, 'model.joblib')
    return joblib.load(model_file)

if __name__ == "__main__":
    # Training entry point: parse arguments, fit a RandomForest on the training
    # CSV, persist it as model.joblib, then report metrics on the test CSV.
    print('[INFO] Extracting arguments')
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the scripts.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories. Defaults are read from the SM_*
    # environment variables (presumably set by the SageMaker training container -
    # confirm); when running locally, pass --model-dir/--train/--test explicitly.
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="Mobile_Price_Classification_Train_V1.csv")
    parser.add_argument("--test-file", type=str, default="Mobile_Price_Classification_Test_V1.csv")

    # parse_known_args ignores any extra arguments instead of failing on them.
    args, _ = parser.parse_known_args()

    print("SKLearn Version", sklearn.__version__)
    print("Joblib Version", joblib.__version__)

    print('[INFO] Reading Data')
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The label is the last column of the training CSV; everything before it is a feature.
    features = list(train_df.columns)
    label = features.pop()

    print('Building training and testing datasets')
    X_train = train_df[features]
    y_train = train_df[label]
    # Select test columns by the training column names so both frames share one order.
    X_test = test_df[features]
    y_test = test_df[label]

    print('Column order: ', features)
    print('Label:', label)

    print('Data shape')
    print('Training Data shape: ')
    print(X_train.shape)
    print(y_train.shape)

    print('Test Data shape: ')
    print(X_test.shape)
    print(y_test.shape)
    print()

    print('Training RandomForest Model ...')
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=True,
    )
    model.fit(X_train, y_train)
    print()

    # Persist under the file name that model_fn() loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print('Model persisted at', model_path)
    print()

    # Evaluate on the held-out test split.
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_report = classification_report(y_test, y_pred_test)

    print()
    print('------ Metrics Results for Testing Data --------')
    print()
    print('Total Rows are: ', X_test.shape[0])
    print('[TESTING] Model Accuracy is :', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_report)

    














Writing ../script.py
