In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, its default S3 bucket, and the notebook's execution role.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# A single boto3 session supplies both the active region and the SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name="sagemaker", region_name=region)

In [None]:
# Location of the cleaned dataset inside the session's default bucket
bucket_name = bucket
file_path = 'ADS508_project/cleandata/final_data.csv'

# Load the CSV straight from S3 into a DataFrame
s3_uri = f's3://{bucket_name}/{file_path}'
df_final = pd.read_csv(s3_uri)

# Preview the first rows
df_final.head()

In [None]:
# Dimensions of the loaded dataset as (rows, columns)
df_final.shape

In [None]:
# Feature columns used as model inputs
feature_columns = ['clean_text', 'state_id']
X = df_final[feature_columns]

# Target column to predict
target_column = 'sentimentoutcome'
y = df_final[target_column]

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 10% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
)

# Step 2: carve the validation set out of the remaining 90%.
# 1/9 of that remainder equals 10% of the full dataset, so the final
# proportions are roughly 80% train / 10% validation / 10% test.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=1/9,
    random_state=1,
)

In [None]:
# Requires the imbalanced-learn package (provides the `imblearn` module): %pip install imbalanced-learn
from imblearn.under_sampling import RandomUnderSampler

# Balance the outcome classes by randomly undersampling the training split only;
# the validation and test splits are not passed in and keep their original
# class distribution. sampling_strategy='all' resamples every class.
undersampler = RandomUnderSampler(sampling_strategy='all', random_state=1)
resampled = undersampler.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled

In [None]:
# Column dtypes and non-null counts of the resampled training features (before dtype fixes)
X_train_resampled.info()

In [None]:
# Update datatypes: text as str, state_id as categorical.
# One astype call returns a new frame rather than assigning into the columns of a
# frame derived from another one; this avoids SettingWithCopyWarning and keeps
# the cell safe to re-run.
# NOTE: astype(str) turns missing values into the literal string 'nan', and the
# categories of state_id are inferred from this split alone.
X_train_resampled = X_train_resampled.astype({'clean_text': str, 'state_id': 'category'})

In [None]:
# Re-check dtypes of the training features after the conversion
X_train_resampled.info()

In [None]:
# Column dtypes and non-null counts of the test features
X_test.info()

In [None]:
# Update datatypes: text as str, state_id as categorical.
# X_test comes out of train_test_split, so assigning into its columns can raise
# SettingWithCopyWarning; a single astype call returns a new frame instead and
# keeps the cell safe to re-run.
# NOTE: astype(str) turns missing values into the literal string 'nan', and the
# categories of state_id are inferred from this split alone.
X_test = X_test.astype({'clean_text': str, 'state_id': 'category'})

In [None]:
# Column dtypes and non-null counts of the validation features
X_val.info()

In [None]:
# Update datatypes: text as str, state_id as categorical.
# X_val comes out of train_test_split, so assigning into its columns can raise
# SettingWithCopyWarning; a single astype call returns a new frame instead and
# keeps the cell safe to re-run.
# NOTE: astype(str) turns missing values into the literal string 'nan', and the
# categories of state_id are inferred from this split alone.
X_val = X_val.astype({'clean_text': str, 'state_id': 'category'})

In [None]:
# (rows, columns) of the resampled training, test, and validation feature frames
X_train_resampled.shape, X_test.shape, X_val.shape

In [None]:
# Preview the first rows of the resampled training features
X_train_resampled.head()

In [None]:
# Preview the first rows of the validation features
X_val.head()

In [None]:
# Preview the first rows of the test features
X_test.head()

In [None]:
# Number of training rows per outcome class after undersampling
y_train_resampled.value_counts()

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm
import numpy as np

class DistilBERTTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that embeds texts with pretrained DistilBERT.

    Each text is tokenized (truncated to 64 tokens), passed through
    ``distilbert-base-uncased`` and reduced to one vector by averaging the
    final-layer token states over the real (non-padding) tokens.
    """

    def __init__(self):
        # Load the pretrained DistilBERT tokenizer and encoder
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.model.eval()  # Set model to evaluation mode (inference only)

    def fit(self, X, y=None):
        """No-op: the encoder is pretrained, so nothing is learned from X."""
        return self

    def transform(self, X, batch_size=32):
        """Return one mean-pooled DistilBERT embedding per input text.

        Args:
            X: Sequence of strings (list, pandas Series or NumPy array).
            batch_size: Number of texts encoded per forward pass.

        Returns:
            2-D NumPy array with one row per text, in input order. An empty
            input yields an array with zero rows.
        """
        # The tokenizer expects a plain list of str; Series/ndarray slices are not accepted.
        texts = X.tolist() if hasattr(X, "tolist") else list(X)
        if not texts:
            # np.vstack([]) would raise; return a well-formed empty result instead.
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)
        embeddings = []
        for i in tqdm(range(0, len(texts), batch_size)):
            batch = texts[i:i+batch_size]
            inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=64).to(device)
            with torch.no_grad():
                outputs = self.model(**inputs)
            token_states = outputs[0]  # last hidden state: (batch, seq_len, hidden)
            # Mask-weighted mean pooling: padding positions must not contribute,
            # otherwise a text's embedding depends on the longest text in its batch.
            mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
            summed = (token_states * mask).sum(dim=1)
            token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            embeddings.append((summed / token_counts).cpu().numpy())
        return np.vstack(embeddings)

In [None]:
# Instantiate once and reuse for all splits: the constructor loads the pretrained tokenizer and model
distilbert_transformer = DistilBERTTransformer()

In [None]:
# Training set: pull 'clean_text' out as a plain Python list, then embed it
# with DistilBERT (one embedding row per text).
train_text_column = X_train_resampled['clean_text']
texts_train = train_text_column.tolist()
distilbert_embeddings_train = distilbert_transformer.transform(texts_train)

In [None]:
# Validation set: pull 'clean_text' out as a plain Python list, then embed it
# with DistilBERT (one embedding row per text).
val_text_column = X_val['clean_text']
texts_val = val_text_column.tolist()
distilbert_embeddings_val = distilbert_transformer.transform(texts_val)

In [None]:
# Test set: pull 'clean_text' out as a plain Python list, then embed it
# with DistilBERT (one embedding row per text).
test_text_column = X_test['clean_text']
texts_test = test_text_column.tolist()
distilbert_embeddings_test = distilbert_transformer.transform(texts_test)

In [None]:
# Wrap the training embeddings in a DataFrame (same name is rebound; columns get default integer labels)
distilbert_embeddings_train = pd.DataFrame(distilbert_embeddings_train)

In [None]:
# Wrap the validation embeddings in a DataFrame (same name is rebound; columns get default integer labels)
distilbert_embeddings_val = pd.DataFrame(distilbert_embeddings_val)

In [None]:
# Wrap the test embeddings in a DataFrame (same name is rebound; columns get default integer labels)
distilbert_embeddings_test = pd.DataFrame(distilbert_embeddings_test)

In [None]:
# S3 client used for the upload
s3 = boto3.client('s3')

# Key prefix (folder) inside the bucket and the name of the file to store there
prefix = "ADS508_project/cleandata/"
file_name = "X_train_bert.csv"
s3_key = prefix + file_name

# Write the training embeddings to a local CSV (without the index column),
# then copy that file to s3://<bucket>/<prefix><file_name>.
distilbert_embeddings_train.to_csv(file_name, index=False)
s3.upload_file(file_name, bucket, s3_key)