In [2]:
import boto3
import sagemaker
import pandas as pd

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

In [3]:
# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/final_data.csv'

# Read CSV file from S3 bucket into DataFrame
df_final = pd.read_csv(f's3://{bucket_name}/{file_path}')

df_final.head()

Unnamed: 0,clean_text,state_id,sentimentoutcome
0,No one attacks David Attenborough yet for bein...,UK,Biden_nega
1,The Adnams and the White Wine is tasting that ...,UK,Biden_posi
2,philipaklein Biden VoteBidenHarris2020,NJ,Biden_neut
3,GOP realDonaldTrump without Biden we won t ha...,NY,Biden_neut
4,Washington Reader Many of us get your dissati...,CA,Biden_posi


In [4]:
df_final.shape

(367831, 3)

In [5]:
X = df_final[['clean_text', 'state_id']]
y = df_final['sentimentoutcome']

In [6]:
from sklearn.model_selection import train_test_split

# Split data into 90% train and 10% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

# Split remaining data (90% train) into 90% train and 10% validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=1/9, random_state=1)

In [7]:
#!pip install imblearn
from imblearn.under_sampling import RandomUnderSampler

# Undersample the training set to balance the six categorical outcome levels
undersampler = RandomUnderSampler(sampling_strategy='all', random_state=1)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

In [8]:
X_train_resampled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53382 entries, 13593 to 359016
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   clean_text  53382 non-null  object
 1   state_id    53382 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


In [9]:
# Update Datatypes
X_train_resampled['clean_text'] = X_train_resampled['clean_text'].astype(str)
X_train_resampled['state_id'] = X_train_resampled['state_id'].astype('category')

In [10]:
X_train_resampled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53382 entries, 13593 to 359016
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   clean_text  53382 non-null  object  
 1   state_id    53382 non-null  category
dtypes: category(1), object(1)
memory usage: 888.7+ KB


In [11]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36784 entries, 252769 to 318746
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   clean_text  36784 non-null  object
 1   state_id    36784 non-null  object
dtypes: object(2)
memory usage: 862.1+ KB


In [12]:
# Update Datatypes
X_test['clean_text'] = X_test['clean_text'].astype(str)
X_test['state_id'] = X_test['state_id'].astype('category')

In [13]:
X_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36783 entries, 139254 to 290197
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   clean_text  36783 non-null  object
 1   state_id    36783 non-null  object
dtypes: object(2)
memory usage: 862.1+ KB


In [14]:
# Update Datatypes
X_val['clean_text'] = X_val['clean_text'].astype(str)
X_val['state_id'] = X_val['state_id'].astype('category')

In [15]:
X_train_resampled.shape, X_test.shape, X_val.shape

((53382, 2), (36784, 2), (36783, 2))

In [16]:
X_train_resampled.head()

Unnamed: 0,clean_text,state_id
13593,Andy Excuse me It is was the Dem leadership t...,CA
120356,Biden was a few one liners away from a decisive,TX
11253,MeOregon How dare you tell him that he would ...,CA
222260,Biden must use selected surrogates like Pete ...,MA
110164,Joe Biden s insane war on oil t co zoYQtDM0IS ...,NY


In [17]:
X_val.head()

Unnamed: 0,clean_text,state_id
139254,It s one of the great ironies of the impeachm...,CA
25211,I think that some large number of people vote...,CA
16680,Hope Donald Trump reads this profile in wisdom...,MA
45661,Trump is good at Branding if nothing else He ...,NY
332714,Looking for some corroboration that Trump and...,WY


In [18]:
X_test.head()

Unnamed: 0,clean_text,state_id
252769,redwave trump Iowa Poll Trump takes lead from...,CA
5129,I hope Mr Krugman is right but let us not forg...,VA
311264,The Republican party had planted the seeds fo...,IL
144058,Note to anyone moving into rural areas functio...,VA
325429,ReaganBabe RudyGiuliani realDonaldTrump God i...,OH


In [19]:
y_train_resampled.value_counts()

Trump_posi    8897
Biden_neut    8897
Biden_nega    8897
Trump_nega    8897
Trump_neut    8897
Biden_posi    8897
Name: sentimentoutcome, dtype: int64

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm
import numpy as np

class DistilBERTTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Initialize the DistilBERT tokenizer and model
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.model.eval()  # Set model to evaluation mode
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, batch_size=32):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)
        embeddings = []
        for i in tqdm(range(0, len(X), batch_size)):
            batch = X[i:i+batch_size]
            inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=64).to(device)
            with torch.no_grad():
                outputs = self.model(**inputs)
            # Adjust based on the actual structure of the outputs
            embeddings.append(outputs[0].mean(dim=1).detach().cpu().numpy())  # Use mean pooling
        return np.vstack(embeddings)

In [None]:
distilbert_transformer = DistilBERTTransformer()

In [None]:
# Transform 'clean_text' into DistilBERT embeddings for the training set
texts_train = X_train_resampled['clean_text'].tolist()  # Extract texts as a list
distilbert_embeddings_train = distilbert_transformer.transform(texts_train)

In [None]:
# Transform 'clean_text' into DistilBERT embeddings for the validation set
texts_val = X_val['clean_text'].tolist()  # Extract texts as a list
distilbert_embeddings_val = distilbert_transformer.transform(texts_val)

In [None]:
# Transform 'clean_text' into DistilBERT embeddings for the test set
texts_test = X_test['clean_text'].tolist()  # Extract texts as a list
distilbert_embeddings_test = distilbert_transformer.transform(texts_test)

In [None]:
distilbert_embeddings_train = pd.DataFrame(distilbert_embeddings_train)

In [None]:
distilbert_embeddings_val = pd.DataFrame(distilbert_embeddings_val)

In [None]:
distilbert_embeddings_test = pd.DataFrame(distilbert_embeddings_test)

In [None]:
# Specify the S3 bucket name
prefix = "ADS508_project/cleandata/"

# List of corresponding file names
file_name = "X_train_bert.csv"

# Create an S3 client
s3 = boto3.client('s3')

distilbert_embeddings_train.to_csv(file_name, index=False)
s3.upload_file(file_name, bucket, prefix + file_name)

In [None]:
# Specify the S3 bucket name
prefix = "ADS508_project/cleandata/"

# List of corresponding file names
file_name = "X_val_bert.csv"

# Create an S3 client
s3 = boto3.client('s3')

distilbert_embeddings_val.to_csv(file_name, index=False)
s3.upload_file(file_name, bucket, prefix + file_name)

In [None]:
# Specify the S3 bucket name
prefix = "ADS508_project/cleandata/"

# List of corresponding file names
file_name = "X_test_bert.csv"

# Create an S3 client
s3 = boto3.client('s3')

distilbert_embeddings_test.to_csv(file_name, index=False)
s3.upload_file(file_name, bucket, prefix + file_name)

In [20]:
# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_train_bert.csv'

# Read CSV file from S3 bucket into DataFrame
X_train_bert = pd.read_csv(f's3://{bucket_name}/{file_path}')

X_train_bert.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.085612,-0.192749,0.032051,0.016743,-0.010959,-0.20829,-0.152666,0.29764,-0.075811,-0.289037,...,-0.051246,-0.05732,0.021458,-0.139488,0.22383,-0.117648,0.026695,0.062894,0.122807,0.16778
1,-0.212201,-0.275853,0.037405,0.060544,-0.318471,-0.185872,-0.159038,0.012011,0.002532,-0.199857,...,0.087239,0.060687,0.062437,-0.106997,0.098862,0.013423,-0.174617,0.141263,0.14134,-0.053384
2,0.086982,0.119298,0.000721,0.070849,0.07804,-0.039077,-0.05584,0.138417,-0.041773,-0.203287,...,0.111256,0.139934,0.032308,-0.068434,0.078629,0.111907,0.060399,-0.001812,0.050431,0.13146
3,-0.049801,-0.02338,0.220462,0.187448,0.186836,-0.236208,-0.012966,0.230729,0.106897,-0.068943,...,-0.259642,-0.02483,-0.060597,-0.137388,-0.05015,0.119264,0.116246,-0.047168,0.374222,-0.127046
4,0.043111,-0.095598,0.186926,0.210929,0.124437,-0.180557,0.122059,0.101528,-0.059285,-0.112962,...,-0.077179,0.12032,0.029574,-0.136716,0.225667,-0.055894,0.025724,-0.03809,0.036344,0.152835


In [21]:
# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_val_bert.csv'

# Read CSV file from S3 bucket into DataFrame
X_val_bert = pd.read_csv(f's3://{bucket_name}/{file_path}')

X_val_bert.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.020926,0.038315,-0.015478,0.129784,-0.072584,-0.225882,0.116951,0.300709,-0.293697,-0.131742,...,-0.101473,0.071148,-0.027571,-0.178902,0.10473,-0.13727,0.19023,0.047491,-0.023825,-0.075363
1,-0.119196,-0.174804,-0.049492,0.200089,0.020939,-0.033172,-0.113429,0.294107,-0.243626,0.038364,...,-0.145654,0.171014,0.051357,0.006363,0.185543,-0.155665,0.167436,-0.028334,0.032127,0.038196
2,0.023994,0.048203,0.132835,0.11642,0.221791,-0.14496,-0.026205,0.083955,0.105637,-0.224725,...,0.116835,0.203131,0.046252,-0.027101,0.172244,0.098827,0.036564,0.036217,0.111981,0.082348
3,0.075427,0.040858,0.099023,-0.004378,0.03457,-0.170367,0.052725,0.375484,0.045429,-0.236863,...,-0.018941,-0.022619,0.02549,-0.11198,0.260225,0.115471,0.041441,-0.069634,0.114876,-0.070295
4,0.12747,-0.034008,0.087723,0.135007,0.221004,-0.149747,0.064742,0.26064,0.016842,0.024366,...,-0.134856,0.003863,-0.123585,-0.110707,0.044464,0.125721,-0.036738,-0.023276,0.15841,0.098184


In [22]:
# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_test_bert.csv'

# Read CSV file from S3 bucket into DataFrame
X_test_bert = pd.read_csv(f's3://{bucket_name}/{file_path}')

X_test_bert.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.148748,-0.152604,0.41074,0.202116,0.01805,-0.217776,-0.022319,0.141656,0.01117,-0.119315,...,-0.04948,0.372685,-0.040204,-0.064451,0.203771,-0.123107,0.046268,0.038076,0.113842,0.21568
1,-0.028749,-0.105872,0.180524,0.218901,0.207626,-0.304348,-0.144124,0.339819,0.003144,-0.105429,...,0.006992,0.158822,-0.007336,-0.092584,0.222685,-0.048937,0.046246,-0.08752,0.137091,0.061183
2,-0.045923,-0.106403,-0.000612,0.198213,0.093984,-0.156729,-0.039995,0.291425,-0.124625,-0.062035,...,0.057361,0.214756,0.046441,-0.093943,0.152749,-0.130637,0.149418,-0.123415,0.063259,-0.038831
3,0.203205,-0.088243,0.242572,-0.091901,0.203277,-0.030754,0.085939,0.190463,-0.022583,-0.07925,...,-0.014764,0.016855,-0.00733,-0.096256,0.076464,-0.211883,0.041285,-0.305218,-0.115182,0.086422
4,-0.05351,0.01892,0.234078,0.105144,0.229481,-0.106339,-0.051445,0.502994,-0.241178,-0.332092,...,-0.159063,-0.02154,0.13638,-0.066743,0.057919,-0.100449,-0.008937,-0.04925,0.198628,0.050045


In [23]:
from sklearn.preprocessing import OneHotEncoder

# Initialize the OneHotEncoder
onehot_encoder = OneHotEncoder(sparse = False) 

# Assuming X_train, X_val, X_test are your datasets
state_ids_train = X_train_resampled[['state_id']] # Reshape for the encoder
state_id_encoded_train = onehot_encoder.fit_transform(state_ids_train)

# Get the feature names (levels of the categorical variable)
feature_names = onehot_encoder.get_feature_names_out(['state_id'])

# Convert the encoded matrix into a DataFrame
state_id_encoded_train_df = pd.DataFrame(state_id_encoded_train, columns=feature_names)



In [24]:
# Assuming X_train, X_val, X_test are your datasets
state_ids_val = X_val[['state_id']] # Reshape for the encoder
state_id_encoded_val = onehot_encoder.fit_transform(state_ids_val)

# Get the feature names (levels of the categorical variable)
feature_names = onehot_encoder.get_feature_names_out(['state_id'])

# Convert the encoded matrix into a DataFrame
state_id_encoded_val_df = pd.DataFrame(state_id_encoded_val, columns=feature_names)



In [25]:
# Assuming X_train, X_val, X_test are your datasets
state_ids_test = X_test[['state_id']] # Reshape for the encoder
state_id_encoded_test = onehot_encoder.fit_transform(state_ids_test)

# Get the feature names (levels of the categorical variable)
feature_names = onehot_encoder.get_feature_names_out(['state_id'])

# Convert the encoded matrix into a DataFrame
state_id_encoded_test_df = pd.DataFrame(state_id_encoded_test, columns=feature_names)



In [26]:
import numpy as np
#Combine with sate ID that is one hot encoded
X_train_prep = pd.concat([X_train_bert, state_id_encoded_train_df], axis=1)

In [27]:
X_val_prep = pd.concat([X_val_bert, state_id_encoded_val_df], axis=1)

In [28]:
X_test_prep = pd.concat([X_test_bert, state_id_encoded_test_df], axis=1)

In [29]:
# Specify the S3 bucket name
prefix = "ADS508_project/cleandata/"

# List of corresponding file names
file_name = "X_train_prep.csv"

# Create an S3 client
s3 = boto3.client('s3')

X_train_prep.to_csv(file_name, index=False)
s3.upload_file(file_name, bucket, prefix + file_name)

In [None]:
# Add outcome column to training set
import pandas as pd
import boto3
from io import StringIO

# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_train_prep.csv'

# Read CSV file from S3 bucket into DataFrame
s3 = boto3.client('s3')
response = s3.get_object(Bucket=bucket_name, Key=file_path)
X_train_prep = pd.read_csv(response['Body'])

# Add a new column to the DataFrame
X_train_prep['outcome'] = pd.Categorical(y_train_resampled)

# Write the modified DataFrame to a CSV file
csv_buffer = StringIO()
X_train_prep.to_csv(csv_buffer, index=False)

# Upload the updated CSV file to the S3 bucket, replacing the original file
s3.put_object(Bucket=bucket_name, Key=file_path, Body=csv_buffer.getvalue())

In [30]:
# Specify the S3 bucket name
prefix = "ADS508_project/cleandata/"

# List of corresponding file names
file_name = "X_val_prep.csv"

# Create an S3 client
s3 = boto3.client('s3')

X_val_prep.to_csv(file_name, index=False)
s3.upload_file(file_name, bucket, prefix + file_name)

In [None]:
# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_val_prep.csv'

# Read CSV file from S3 bucket into DataFrame
s3 = boto3.client('s3')
response = s3.get_object(Bucket=bucket_name, Key=file_path)
X_val_prep = pd.read_csv(response['Body'])

# Add a new column to the DataFrame
X_val_prep['outcome'] = pd.Categorical(y_val)

# Write the modified DataFrame to a CSV file
csv_buffer = StringIO()
X_val_prep.to_csv(csv_buffer, index=False)

# Upload the updated CSV file to the S3 bucket, replacing the original file
s3.put_object(Bucket=bucket_name, Key=file_path, Body=csv_buffer.getvalue())

In [31]:
# Specify the S3 bucket name
prefix = "ADS508_project/cleandata/"

# List of corresponding file names
file_name = "X_test_prep.csv"

# Create an S3 client
s3 = boto3.client('s3')

X_test_prep.to_csv(file_name, index=False)
s3.upload_file(file_name, bucket, prefix + file_name)

In [None]:
# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_test_prep.csv'

# Read CSV file from S3 bucket into DataFrame
s3 = boto3.client('s3')
response = s3.get_object(Bucket=bucket_name, Key=file_path)
X_test_prep = pd.read_csv(response['Body'])

# Add a new column to the DataFrame
X_test_prep['outcome'] = pd.Categorical(y_test)

# Write the modified DataFrame to a CSV file
csv_buffer = StringIO()
X_test_prep.to_csv(csv_buffer, index=False)

# Upload the updated CSV file to the S3 bucket, replacing the original file
s3.put_object(Bucket=bucket_name, Key=file_path, Body=csv_buffer.getvalue())