In [None]:
import boto3
import sagemaker
import pandas as pd

# Notebook-wide SageMaker handles: session, execution role and the session's default bucket.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()

# Region as resolved by boto3, and a low-level SageMaker API client bound to it.
region = boto3.Session().region_name
sm = boto3.Session().client(region_name=region, service_name="sagemaker")

In [None]:
# Training split, stored as CSV under the project prefix of the default bucket.
# NOTE(review): this key ends in "_pre.csv" while the validation/test keys end in
# "_prep.csv" — confirm the object name in S3.
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_train_pre.csv'

# Load the object straight from its s3:// URI into a DataFrame.
X_train_prep = pd.read_csv('s3://{}/{}'.format(bucket_name, file_path))

# Preview the first rows.
X_train_prep.head()

In [None]:
# Validation split, stored as CSV under the project prefix of the default bucket.
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_val_prep.csv'

# Load the object straight from its s3:// URI into a DataFrame.
X_val_prep = pd.read_csv('s3://' + bucket_name + '/' + file_path)

# Preview the first rows.
X_val_prep.head()

In [None]:
# Test split, stored as CSV under the project prefix of the default bucket.
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_test_prep.csv'

# Load the object straight from its s3:// URI into a DataFrame.
X_test_prep = pd.read_csv('s3://' + '/'.join([bucket_name, file_path]))

# Preview the first rows.
X_test_prep.head()

# Baseline Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Column convention for the baseline: the label is the last column and the
# features are every column except the last two.
# NOTE(review): the second-to-last column is used as neither feature nor label —
# confirm that excluding it is intentional.
X_train, y_train = X_train_prep.iloc[:, :-2], X_train_prep.iloc[:, -1]
X_val, y_val = X_val_prep.iloc[:, :-2], X_val_prep.iloc[:, -1]

# Baseline classifier: logistic regression (lbfgs solver, up to 1000 iterations).
log_reg2 = LogisticRegression(max_iter=1000, solver='lbfgs')
log_reg2.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = log_reg2.predict(X_val)

# Evaluate the model
print("Baseline: Validation Set Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nBaseline: Validation Set Classification Report:\n", classification_report(y_val, y_val_pred))

In [None]:
# Score the fitted baseline on the held-out test split, using the same column
# convention as training (features: all but the last two columns, label: last column).
X_test, y_test = X_test_prep.iloc[:, :-2], X_test_prep.iloc[:, -1]
y_test_pred = log_reg2.predict(X_test)

print("Baseline: Test Set Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nBaseline: Test Set Classification Report:\n", classification_report(y_test, y_test_pred))

# SageMaker Autopilot

In [None]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.automl.automl import AutoML

role = get_execution_role()
session = sagemaker.Session()

# Training table for the Autopilot job; it must contain the target column named below.
input_data = 's3://508group/ADS508_project/cleandata/df_final.csv'

autopilot_job = AutoML(
    role=role,
    target_attribute_name='sentimentoutcome',  # This is the column you're predicting
    output_path='s3://508group/ADS508_project/output/autopilot/',
    max_candidates=20,
    sagemaker_session=session,
    problem_type='MulticlassClassification',  # Assuming sentimentoutcome is multiclass
    job_objective={'MetricName': 'Accuracy'}  # You can choose another metric if it fits better
)

# Launch the job without blocking the notebook. Log streaming is only meaningful
# when waiting for the job, so logs is disabled together with wait.
autopilot_job.fit(inputs=input_data, wait=False, logs=False)

In [None]:
import boto3
import tarfile
import os

# Initialize the S3 client
s3 = boto3.client('s3')

# S3 bucket and object key of the Autopilot data-processor model artifact
bucket_name = '508group'
object_key = 'ADS508_project/output/autopilot/automl-2024-03-30-19-51-00-981/data-processor-models/automl-2024-03-30-19-51-00-981-dpp9-1-89acfb1b856c4754873c58ddd/output/model.tar.gz'

# Local directory to extract into: mirror the key's parent prefix so the archive is
# saved as <extract_dir>/model.tar.gz rather than inside a directory that is itself
# named "model.tar.gz".
extract_dir = os.path.dirname(object_key)

# Create the directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Download the file from S3
local_file_path = os.path.join(extract_dir, os.path.basename(object_key))
s3.download_file(bucket_name, object_key, local_file_path)

# Open the tar file and extract the contents next to the archive
with tarfile.open(local_file_path, "r:gz") as tar:
    tar.extractall(path=extract_dir)

# List what is now in the extraction directory (this includes the downloaded archive itself)
extracted_files = os.listdir(extract_dir)
print("Extracted files:", extracted_files)

In [None]:
# Fetch the model archive again into the extraction directory. This cell relies on
# s3, bucket_name, object_key and extract_dir defined by the previous cell.
local_file_path = os.path.join(extract_dir, 'model.tar.gz')
s3.download_file(bucket_name, object_key, local_file_path)

# Unpack the gzip-compressed tarball in place.
with tarfile.open(name=local_file_path, mode="r:gz") as archive:
    archive.extractall(path=extract_dir)


In [None]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.automl.automl import AutoML
import time
from datetime import datetime, timezone
from time import strftime

# The "Z" suffix denotes UTC, so the timestamps are taken in UTC rather than local time.
TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

timestamp = datetime.now(timezone.utc).strftime(TIMESTAMP_FORMAT)
print(f"start: {timestamp}\n")

role = get_execution_role()
session = sagemaker.Session()

# Training table for the Autopilot job; it must contain the target column named below.
input_data = 's3://sagemaker-us-east-1-851725336500/ADS508_project/cleandata/final_data.csv'

autopilot_job = AutoML(
    role=role,
    target_attribute_name='sentimentoutcome',  # This is the column you're predicting
    output_path='s3://sagemaker-us-east-1-851725336500/ADS508_project/output/autopilot/',
    max_candidates=20,
    sagemaker_session=session,
    problem_type='MulticlassClassification',  # Assuming sentimentoutcome is multiclass
    job_objective={'MetricName': 'Accuracy'}  # You can choose another metric if it fits better
)

# Blocks until the job finishes, streaming its logs into the cell output.
autopilot_job.fit(inputs=input_data, wait=True, logs=True)

# Report the completion time (previously mislabelled as a second "start").
timestamp = datetime.now(timezone.utc).strftime(TIMESTAMP_FORMAT)
print(f"end: {timestamp}\n")


# Using SageMaker for Ensemble Models

In [None]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator

# IAM role the training job runs under
role = get_execution_role()

# SageMaker session (also supplies the region for the image lookup below)
sess = sagemaker.Session()

# Placeholder S3 location — replace both values before running this cell.
# NOTE: this rebinds the notebook-wide `bucket` set in the first cell, so any later
# cell that reads `bucket` sees this value.
bucket = 'your-s3-bucket-name'
prefix = 'your-prefix'

# Specify the training data location in S3
train_input = TrainingInput(f's3://{bucket}/{prefix}/train.csv', content_type='text/csv')

# Resolve the built-in XGBoost container with image_uris.retrieve, the replacement
# for the deprecated get_image_uri helper.
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=sess.boto_region_name,
    version='1.2-1',
)

# Define hyperparameters
# NOTE(review): this is a regression objective, while the other modelling cells treat
# the target as a class label — confirm the intended objective.
hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",  # current name of the deprecated "reg:linear" alias
    "num_round": "50"
}

# Output location for model artifacts
output_path = f's3://{bucket}/{prefix}/output'

# Create an estimator object; the output location is passed at construction time
# instead of being patched onto the instance afterwards.
xgb_estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=output_path,
    sagemaker_session=sess,
    hyperparameters=hyperparameters
)

# Train the model
xgb_estimator.fit({'train': train_input})

In [None]:
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator

# Resolve the bucket explicitly: the preceding template cell rebinds `bucket` to a
# placeholder, and the project data is read from the session's default bucket.
bucket = sagemaker.Session().default_bucket()

# No trailing slash here — the f-string below adds the separator, and a doubled
# "//" would address a different S3 key.
prefix = 'ADS508_project/cleandata'

# Specify the path to the training data in S3
train_data_uri = f's3://{bucket}/{prefix}/X_train_prep.csv'

# Declare the channel as CSV so the container does not fall back to its default format.
# NOTE(review): the built-in XGBoost CSV mode expects the label in the first column
# and no header row, whereas this notebook reads the label from the last column —
# confirm the file layout before launching.
train_input = TrainingInput(train_data_uri, content_type='text/csv')

# Specify the hyperparameters for the XGBoost model
hyperparameters = {
    'objective': 'binary:logistic',
    'max_depth': 5,
    'eta': 0.1,
    'gamma': 1,
    'num_round': 50,  # required by the built-in algorithm; same value as the template cell
}

# Create an instance of the XGBoost estimator. The image lookup needs an explicit
# version because several XGBoost container versions exist.
xgb_estimator = Estimator(
    image_uri=sagemaker.image_uris.retrieve('xgboost', region, version='1.2-1'),
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m4.xlarge'
)

# Set hyperparameters
xgb_estimator.set_hyperparameters(**hyperparameters)

# Start the training job
xgb_estimator.fit({'train': train_input})

# Multilayer Perceptrons

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sagemaker.sklearn import SKLearn
import joblib
import sagemaker

from pathlib import Path


def train_model(X_train=None, y_train=None, model_path='model.joblib'):
    """Fit the MLP classifier locally and persist it with joblib.

    Args:
        X_train: Feature matrix. When omitted (together with or instead of
            ``y_train``), the notebook's ``X_train_prep`` frame is used with the
            baseline column convention: every column except the last two.
        y_train: Labels. When omitted, the last column of ``X_train_prep``.
        model_path: File the fitted model is written to.

    Returns:
        The path the model was saved to.
    """
    if X_train is None or y_train is None:
        # Same feature/label slicing as the baseline logistic regression.
        X_train = X_train_prep.iloc[:, :-2]
        y_train = X_train_prep.iloc[:, -1]

    # Define the MLPClassifier model
    mlp_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, activation='relu', solver='adam', verbose=True, tol=0.001, alpha=0.0001)

    # Train the model
    mlp_model.fit(X_train, y_train)

    # Save the model to disk
    joblib.dump(mlp_model, model_path)

    return model_path


# Train the model locally
model_path = train_model()

# --- Same model as a SageMaker training job ---------------------------------
# The SKLearn estimator's entry_point must be the path of a Python script (a
# function object cannot be shipped to the training container), so the training
# logic is written to a small script in its own source directory.
TRAIN_SCRIPT = '''\
"""SageMaker entry point: fit the MLP on the CSV found in the train channel."""
import glob
import os

import joblib
import pandas as pd
from sklearn.neural_network import MLPClassifier

if __name__ == "__main__":
    train_dir = os.environ["SM_CHANNEL_TRAIN"]
    model_dir = os.environ["SM_MODEL_DIR"]

    csv_files = sorted(glob.glob(os.path.join(train_dir, "*.csv")))
    if not csv_files:
        raise FileNotFoundError("no CSV file found in " + train_dir)
    frame = pd.read_csv(csv_files[0])

    # Same column convention as the notebook: label is the last column,
    # features are every column except the last two.
    features, labels = frame.iloc[:, :-2], frame.iloc[:, -1]

    model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, activation="relu",
                          solver="adam", verbose=True, tol=0.001, alpha=0.0001)
    model.fit(features, labels)
    joblib.dump(model, os.path.join(model_dir, "model.joblib"))
'''

# A dedicated source directory keeps the upload limited to the training script
# instead of the whole working directory.
source_dir = Path('mlp_training')
source_dir.mkdir(exist_ok=True)
(source_dir / 'train_mlp.py').write_text(TRAIN_SCRIPT)

# Project data lives in the session's default bucket (the same object the notebook
# loads into X_train_prep).
bucket = sagemaker.Session().default_bucket()
prefix = 'ADS508_project'

# Specify the path to the training data in S3
train_data_uri = f's3://{bucket}/{prefix}/cleandata/X_train_pre.csv'

# Create an instance of the SKLearn estimator
sklearn_estimator = SKLearn(
    entry_point='train_mlp.py',
    source_dir=str(source_dir),
    instance_type='ml.m4.xlarge',
    role=sagemaker.get_execution_role(),
    framework_version='0.23-1',
    py_version='py3',
    output_path=f's3://{bucket}/{prefix}/output/mlp'
)

# Start the training job
sklearn_estimator.fit({'train': train_data_uri})
