# Load Libraries

In [3]:
import boto3
import sagemaker
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import sagemaker
from sagemaker import get_execution_role
from sagemaker.automl.automl import AutoML
import tarfile
import os
import time
from datetime import datetime
from time import strftime
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sklearn.neural_network import MLPClassifier
from sagemaker.sklearn import SKLearn
import joblib

from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
from sklearn.metrics import confusion_matrix

# Open S3 Session

In [4]:
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

# Read in Training

In [8]:
# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_train_prep.csv'

# Read CSV file from S3 bucket into DataFrame
X_train_prep = pd.read_csv(f's3://{bucket_name}/{file_path}')

X_train_prep.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,state_id_TX,state_id_UK,state_id_UT,state_id_VA,state_id_VT,state_id_WA,state_id_WI,state_id_WV,state_id_WY,outcome
0,-0.085612,-0.192749,0.032051,0.016743,-0.010959,-0.20829,-0.152666,0.29764,-0.075811,-0.289037,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Biden_nega
1,-0.212201,-0.275853,0.037405,0.060544,-0.318471,-0.185872,-0.159038,0.012011,0.002532,-0.199857,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Biden_nega
2,0.086982,0.119298,0.000721,0.070849,0.07804,-0.039077,-0.05584,0.138417,-0.041773,-0.203287,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Biden_nega
3,-0.049801,-0.02338,0.220462,0.187448,0.186836,-0.236208,-0.012966,0.230729,0.106897,-0.068943,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Biden_nega
4,0.043111,-0.095598,0.186926,0.210929,0.124437,-0.180557,0.122059,0.101528,-0.059285,-0.112962,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Biden_nega


# Read in Validation

In [9]:
# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_val_prep.csv'

# Read CSV file from S3 bucket into DataFrame
X_val_prep = pd.read_csv(f's3://{bucket_name}/{file_path}')

X_val_prep.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,state_id_TX,state_id_UK,state_id_UT,state_id_VA,state_id_VT,state_id_WA,state_id_WI,state_id_WV,state_id_WY,outcome
0,-0.020926,0.038315,-0.015478,0.129784,-0.072584,-0.225882,0.116951,0.300709,-0.293697,-0.131742,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Biden_posi
1,-0.119196,-0.174804,-0.049492,0.200089,0.020939,-0.033172,-0.113429,0.294107,-0.243626,0.038364,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trump_posi
2,0.023994,0.048203,0.132835,0.11642,0.221791,-0.14496,-0.026205,0.083955,0.105637,-0.224725,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trump_neut
3,0.075427,0.040858,0.099023,-0.004378,0.03457,-0.170367,0.052725,0.375484,0.045429,-0.236863,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trump_posi
4,0.12747,-0.034008,0.087723,0.135007,0.221004,-0.149747,0.064742,0.26064,0.016842,0.024366,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Trump_posi


# Read in Testing

In [None]:
# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_test_prep.csv'

# Read CSV file from S3 bucket into DataFrame
X_test_prep = pd.read_csv(f's3://{bucket_name}/{file_path}')

X_test_prep.head()

# Baseline Model

In [None]:
# Initialize the Logistic Regression model
log_reg2 = LogisticRegression(max_iter=1000, solver='lbfgs')

# Fit the model on the balanced text training data, excluding outcome variable and 1 dummy variable
# Outcome variable is last column of training data
log_reg2.fit(X_train_prep.iloc[:, :-2], X_train_prep.iloc[:, -1])

# Predict on the validation set
y_val_pred = log_reg2.predict(X_val_prep.iloc[:, :-2])

# Evaluate the model
print("Baseline: Validation Set Accuracy:", accuracy_score(X_val_prep.iloc[:, -1], y_val_pred))
print("\nBaselin: Validation Set Classification Report:\n", classification_report(X_val_prep.iloc[:, -1], y_val_pred))

In [None]:
# Predict on the test set
y_test_pred = log_reg2.predict(X_test_prep.iloc[:, :-2])
print("Baseline: Test Set Accuracy:", accuracy_score(X_test_prep.iloc[:, -1], y_test_pred))
print("\nBaselin: Test Set Classification Report:\n", classification_report(X_test_prep.iloc[:, -1], y_test_pred))

# SageMaker AutoPilot

In [None]:
role = get_execution_role()
session = sagemaker.Session()

# Replace 'your_dataset.csv' with the name of your CSV file
input_data = 's3://508group/ADS508_project/cleandata/df_final.csv'

autopilot_job = AutoML(
    role=role,
    target_attribute_name='sentimentoutcome',  # This is the column you're predicting
    output_path='s3://508group/ADS508_project/output/autopilot/',
    max_candidates=20,
    sagemaker_session=session,
    problem_type='MulticlassClassification',  # Assuming sentimentoutcome is multiclass
    job_objective={'MetricName': 'Accuracy'}  # You can choose another metric if it fits better
)

autopilot_job.fit(inputs=input_data, wait=False, logs=True)

In [None]:
# Initialize the S3 client
s3 = boto3.client('s3')

# S3 bucket and object key
bucket_name = '508group'
object_key = 'ADS508_project/output/autopilot/automl-2024-03-30-19-51-00-981/data-processor-models/automl-2024-03-30-19-51-00-981-dpp9-1-89acfb1b856c4754873c58ddd/output/model.tar.gz'

# Local directory to extract the contents
extract_dir = "ADS508_project/output/autopilot/automl-2024-03-30-19-51-00-981/data-processor-models/automl-2024-03-30-19-51-00-981-dpp9-1-89acfb1b856c4754873c58ddd/output/model.tar.gz"

# Create the directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Download the file from S3
local_file_path = os.path.join(extract_dir, 'model.tar.gz')
s3.download_file(bucket_name, object_key, local_file_path)

# Open the tar file and extract the contents
with tarfile.open(local_file_path, "r:gz") as tar:
    tar.extractall(path=extract_dir)

# After extraction, you can further process or use the contents of the extracted files
# For example, list the extracted files
extracted_files = os.listdir(extract_dir)
print("Extracted files:", extracted_files)

In [None]:
# Download the file from S3
local_file_path = os.path.join(extract_dir, 'model.tar.gz')
s3.download_file(bucket_name, object_key, local_file_path)

# Extract the contents of the tar.gz file
with tarfile.open(local_file_path, "r:gz") as tar:
    tar.extractall(path=extract_dir)


In [None]:
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(f"start: {timestamp}\n")
role = get_execution_role()
session = sagemaker.Session()
# Replace 'your_dataset.csv' with the name of your CSV file
#input_data = 's3://508group/ADS508_project/cleandata/df_final.csv'
input_data = 's3://sagemaker-us-east-1-851725336500/ADS508_project/cleandata/final_data.csv'
autopilot_job = AutoML(
    role=role,
    target_attribute_name='sentimentoutcome',  # This is the column you're predicting
    #output_path='s3://508group/ADS508_project/output/autopilot/',
    output_path= 's3://sagemaker-us-east-1-851725336500/ADS508_project/output/autopilot/',
    max_candidates=20,
    sagemaker_session=session,
    problem_type='MulticlassClassification',  # Assuming sentimentoutcome is multiclass
    job_objective={'MetricName': 'Accuracy'}  # You can choose another metric if it fits better
)
autopilot_job.fit(inputs=input_data, wait=True, logs=True)
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(f"start: {timestamp}\n")


# Using SageMaker for Ensemble Models

In [5]:
# Define S3 prefix
prefix = 'ADS508_project/cleandata/'

In [7]:
# Specify the training data location in S3
train_input = TrainingInput(f's3://{bucket}/{prefix}/X_train_prep.csv', content_type='text/csv')

# Get the XGBoost container image
container = get_image_uri(sess.boto_region_name, 'xgboost', repo_version='1.2-1')

# Define hyperparameters
hyperparameters = {
    "max_depth": "6",
    "lambda": "0.015482990568956176",
    "min_child_weight": "0.0016695027194522027",
    "objective": "multi:softprob",
    "num_round": "967"
}

# Create an estimator object
xgb_estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.12xlarge',
    hyperparameters=hyperparameters
)

# Set the output location for model artifacts
output_path = f's3://{bucket}/{prefix}/output'
xgb_estimator.output_path = output_path

# Train the model
xgb_estimator.fit({'train': train_input})

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


ClientError: An error occurred (AccessDeniedException) when calling the CreateTrainingJob operation: User: arn:aws:sts::293275163623:assumed-role/LabRole/SageMaker is not authorized to perform: sagemaker:CreateTrainingJob on resource: arn:aws:sagemaker:us-east-1:293275163623:training-job/sagemaker-xgboost-2024-04-15-22-11-09-105 with an explicit deny in an identity-based policy

## Get Performance Predictions

In [None]:
# Define S3 bucket and prefix for training data and output
prefix = 'ADS508_project/cleandata/'

# Specify the path to the training data in S3
train_data_uri = f's3://{bucket}/{prefix}/X_train_prep.csv'

# Specify the hyperparameters for the XGBoost model
# Define hyperparameters
hyperparameters = {
    "max_depth": "6",
    "lambda": "0.015482990568956176",
    "min_child_weight": "0.0016695027194522027",
    "objective": "multi:softprob",
    "num_round": "967"
}

# Create an instance of the XGBoost estimator
xgb_estimator = Estimator(
    image_uri=sagemaker.image_uris.retrieve('xgboost', region),
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m4.12xlarge'
)

# Set hyperparameters
xgb_estimator.set_hyperparameters(**hyperparameters)

# Start the training job
xgb_estimator.fit({'train': train_data_uri})

In [None]:
# Create a predictor object
predictor = Predictor(
    endpoint_name=xgb_estimator.latest_training_job.name,  # Use the training job name as the endpoint name
    sagemaker_session=sess,
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer()
)

# Define the validation data location in S3
val_data_location = f's3://{bucket}/{prefix}/X_val_prep.csv'

# Evaluate on validation data
validation_predictions = predictor.predict(val_data_location)

# Define the testing data location in S3
test_data_location = f's3://{bucket}/{prefix}/X_test_prep.csv'

# Evaluate on testing data
test_predictions = predictor.predict(test_data_location)


In [None]:
# Convert predictions to binary labels (if needed)
val_pred_labels = [1 if pred > 0.5 else 0 for pred in validation_predictions['predictions']]
test_pred_labels = [1 if pred > 0.5 else 0 for pred in test_predictions['predictions']]

# Define true labels for validation and testing data (assuming they are available)
# Replace these with the true labels from your data
y_val_true = [1, 0, 1, 0, ...]  # Example true labels for validation data
y_test_true = [1, 0, 1, 0, ...]  # Example true labels for testing data

# Calculate confusion matrix for validation and testing data
val_conf_matrix = confusion_matrix(y_val_true, val_pred_labels)
test_conf_matrix = confusion_matrix(y_test_true, test_pred_labels)

print("Validation Data Confusion Matrix:")
print(val_conf_matrix)
print("\nTesting Data Confusion Matrix:")
print(test_conf_matrix)

# Multilayer Perceptrons

In [None]:
# Define the training logic directly in your Python code
def train_model():
    # Load training data (replace this with your actual data loading code)
    X_train, y_train = load_training_data()

    # Define the MLPClassifier model
    mlp_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, activation='relu', solver='adam', verbose=True, tol=0.001, alpha=0.0001)

    # Train the model
    mlp_model.fit(X_train, y_train)

    # Save the model to disk
    model_path = 'model.joblib'
    joblib.dump(mlp_model, model_path)

    return model_path

# Train the model
model_path = train_model()

# Define S3 bucket and prefix for training data and output
bucket = 'your-s3-bucket'
prefix = 'your-s3-prefix'

# Specify the path to the training data in S3
train_data_uri = f's3://{bucket}/{prefix}/train.pkl'

# Create an instance of the SKLearn estimator
sklearn_estimator = SKLearn(
    entry_point=train_model,  # Pass the train_model function directly
    source_dir='.',  # Directory containing the Python code
    instance_type='ml.m4.12xlarge',
    role=sagemaker.get_execution_role(),
    framework_version='0.23-1',
    py_version='py3',
    output_path=f's3://{bucket}/{prefix}/output'
)

# Start the training job
sklearn_estimator.fit({'train': train_data_uri})
