# Data Ingestion

## Copy CSV files to S3

In [11]:
# Assume all the pre-requisites were set up 
%store -r setup_instance_check_passed
%store -r setup_dependencies_passed
%store -r setup_s3_bucket_passed
%store -r setup_iam_roles_passed


In [12]:
# Load Libraries
import boto3
import sagemaker
import pandas as pd

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

In [50]:
# Public Data
!aws s3 ls s3://ads508team5/

# Copy Datasets from :
# Source of data
s3_public_path_tweeter = "s3://ads508team5/tweeter"
s3_public_path_nyt = "s3://ads508team5/nyt"
s3_public_path_cities = "s3://ads508team5/cities"

# Destination Data:
s3_private_path_tweeter = "s3://{}/ADS508_project/tweeter".format(bucket)
s3_private_path_nyt = "s3://{}/ADS508_project/nyt".format(bucket)
s3_private_path_cities = "s3://{}/ADS508_project/cities".format(bucket)

# Copy datasets
!aws s3 cp --recursive $s3_public_path_tweeter/ $s3_private_path_tweeter/ --exclude "*" --include "hashtag_donaldtrump.csv"
!aws s3 cp --recursive $s3_public_path_tweeter/ $s3_private_path_tweeter/ --exclude "*" --include "hashtag_joebiden.csv"
!aws s3 cp --recursive $s3_public_path_nyt/ $s3_private_path_nyt/ --exclude "*" --include "nyt-comments-2020.csv"
!aws s3 cp --recursive $s3_public_path_cities/ $s3_private_path_cities/ --exclude "*" --include "uscities.csv"

                           PRE cities/
                           PRE nyt/
                           PRE tweeter/
copy: s3://ads508team5/tweeter/hashtag_donaldtrump.csv to s3://sagemaker-us-east-1-851725336500/ADS508_project/tweeter/hashtag_donaldtrump.csv
copy: s3://ads508team5/tweeter/hashtag_joebiden.csv to s3://sagemaker-us-east-1-851725336500/ADS508_project/tweeter/hashtag_joebiden.csv
copy: s3://ads508team5/nyt/nyt-comments-2020.csv to s3://sagemaker-us-east-1-851725336500/ADS508_project/nyt/nyt-comments-2020.csv
copy: s3://ads508team5/cities/uscities.csv to s3://sagemaker-us-east-1-851725336500/ADS508_project/cities/uscities.csv


## Create Database, Tables and Parquets
### Database

In [None]:
# Setup and check pre-requisites to create Database
ingest_create_athena_db_passed = False

!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect

In [None]:

# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

# Create Connection
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [None]:
# Create Database
database_name = "dbpoliticpulsecomment"

statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)

pd.read_sql(statement, conn)

# Verify DB successfully created
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)


In [None]:
# End of Create Database
if database_name in df_show.values:
    ingest_create_athena_db_passed = True

### Tables

In [None]:
# Create Tweeter Tables
table_name_csv = "tweeter"
s3_private_path_tweeter = "s3://{}/ADS508_project/tweeter".format(bucket)

statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
  created_at TIMESTAMP,
  tweet_id FLOAT,
  tweet VARCHAR(250),
  likes INT,
  retweet_count INT,
  source VARCHAR(45),
  user_id INT,
  user_name VARCHAR(250),
  user_screen_name VARCHAR(45),
  user_description VARCHAR(250),
  user_join_date TIMESTAMP,
  user_followers_count INT,
  user_location VARCHAR(45),
  lat FLOAT,
  long FLOAT,
  city VARCHAR(45),
  country VARCHAR(45),
  continent VARCHAR(45),
  state VARCHAR(45),
  state_code VARCHAR(45),
  collected_at VARCHAR(45)
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv, s3_private_path_tweeter
)

pd.read_sql(statement, conn)


In [None]:
# testing hashtag_donaldtrump.csv tweet
tweet = "You get a tie! And you get a tie! #Trump ‘s rally #Iowa https://t.co/jJalUUmh5D"
# testing hashtag_joebiden.csv tweet
tweet = "@chrislongview Watching and setting dvr. Let’s give him bonus ratings!! #JoeBiden"


statement = """SELECT * FROM {}.{}
    WHERE tweet = '{}' LIMIT 100""".format(
    database_name, table_name_csv, tweet
)

df = pd.read_sql(statement, conn)
df.head(5)


In [None]:
# Create NYT comment Table
table_name_csv = "nyt_comment"
s3_private_path_nyt = "s3://{}/ADS508_project/nyt".format(bucket)


statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{} (
  commentID INT,
  status VARCHAR(20),
  commentSequence INT,
  userID INT,
  userDisplayName VARCHAR(45),
  userLocation VARCHAR(45),
  userTitle VARCHAR(10),
  commentBody VARCHAR(500),
  createDate TIMESTAMP,
  updateDate TIMESTAMP,
  approveDate TIMESTAMP,
  recommendation INT,
  replyCount INT,
  editorsSelection VARCHAR(20),
  parentID INT,
  parentUserDisplayName VARCHAR(45),
  depth INT,
  commentType VARCHAR(20),
  trusted VARCHAR(20),
  recommendedFlag VARCHAR(20),
  permID INT,
  isAnonymous VARCHAR(20),
  articleID VARCHAR(150)
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv, s3_private_path_nyt
)

pd.read_sql(statement, conn)


In [None]:
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

In [None]:
commentBody = "@Philip Brown Agree 110%."

statement = """SELECT * FROM {}.{}
    WHERE commentBody = '{}' LIMIT 100""".format(
    database_name, table_name_csv, commentBody
)

df = pd.read_sql(statement, conn)
df.head(5)


In [None]:
# Create uscities table
table_name = "uscities"
s3_private_path_cities = "s3://{}/ADS508_project/cities".format(bucket)

statement = """
CREATE EXTERNAL TABLE IF NOT EXISTS {}.{} (
    city VARCHAR(45),
    state_id VARCHAR(2),
    state_name VARCHAR(30)
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')
""".format(
    database_name, table_name, s3_private_path_cities
)

pd.read_sql(statement, conn)


### Parquets
#### Feature Selection and Creation
Prior creating a parquet table, tweeter and NYT comment tables are going to be combined under Comment Table in Athena where a feature creation is going to be performed. When the table is combined, we selected some features that would be benefit to our goal.

Selected features which could help on train the model are comment_body, user_location, candidatepoll

In [None]:
table_comment = "comment"
table_tweeter = "tweeter"
table_nyt_comment = "nyt_comment"

# SQL Statement combine tweeter and NYT comment in the preparation to create parquet
statement = """
CREATE TABLE IF NOT EXISTS {}.{} AS (
SELECT DISTINCT
    tweet_id AS comment_id,
    tweet AS comment_body,
    user_id AS user_id,
    user_name AS user_name,
    user_location AS user_location,
    u.state_id AS user_state,
    created_at AS create_date,
    retweet_count AS reply_retweet,
    likes AS recommendation_like,
    'tweet' as source,
    (LENGTH(tweet) - LENGTH(REPLACE(LOWER(tweet), 'trump', ''))) / LENGTH('trump') AS trump_count,
    (LENGTH(tweet) - LENGTH(REPLACE(LOWER(tweet), 'biden', ''))) / LENGTH('biden') AS biden_count,
    CASE 
        WHEN 
            (LENGTH(tweet) - LENGTH(REPLACE(LOWER(tweet), 'biden', ''))) / LENGTH('biden') > 
            (LENGTH(tweet) - LENGTH(REPLACE(LOWER(tweet), 'trump', ''))) / LENGTH('trump') 
        THEN 'Biden'
        WHEN              
            (LENGTH(tweet) - LENGTH(REPLACE(LOWER(tweet), 'biden', ''))) / LENGTH('biden') < 
            (LENGTH(tweet) - LENGTH(REPLACE(LOWER(tweet), 'trump', ''))) / LENGTH('trump') 
        THEN 'Trump'
        ELSE NULL
    END AS candidatepoll
FROM
    {}.{}
LEFT JOIN
    dbpoliticpulsecomment.uscities u ON lower(user_location) = lower(u.city) OR 
    lower(user_location) = lower(u.state_name) OR 
    lower(user_location) = lower(u.state_id)
WHERE
    LENGTH(tweet) > 3
UNION
SELECT DISTINCT
    commentid AS comment_id,
    commentbody AS comment_body,
    userID AS user_id,
    userDisplayName AS user_name,
    userLocation AS user_location,
    u.state_id as user_state,
    createDate AS create_date,
    replyCount AS reply_retweet,
    recommendation AS recommendation_like,
    'nyt_comment' as source,
    (LENGTH(commentbody) - LENGTH(REPLACE(LOWER(commentbody), 'trump', ''))) / LENGTH('trump') AS trump_count,
    (LENGTH(commentbody) - LENGTH(REPLACE(LOWER(commentbody), 'biden', ''))) / LENGTH('biden') AS biden_count,
    CASE
        WHEN
            (LENGTH(commentbody) - LENGTH(REPLACE(LOWER(commentbody), 'biden', ''))) / LENGTH('biden') > 
            (LENGTH(commentbody) - LENGTH(REPLACE(LOWER(commentbody), 'trump', ''))) / LENGTH('trump') 
        THEN 'Biden'
        WHEN
            (LENGTH(commentbody) - LENGTH(REPLACE(LOWER(commentbody), 'biden', ''))) / LENGTH('biden') < 
            (LENGTH(commentbody) - LENGTH(REPLACE(LOWER(commentbody), 'trump', ''))) / LENGTH('trump') 
        THEN 'Trump'
        ELSE NULL
    END AS candidatepoll
FROM 
    {}.{}
LEFT JOIN
    dbpoliticpulsecomment.uscities u ON lower(userLocation) = lower(u.city) OR 
    lower(userLocation) = lower(u.state_name) OR 
    lower(userLocation) = lower(u.state_id)
WHERE LENGTH(commentbody) > 3
)""".format(database_name, table_comment,database_name, table_tweeter,database_name, table_nyt_comment)

pd.read_sql(statement, conn)

In [None]:
commentbody = "Joe Biden"
source = "nyt_comment"

statement = """SELECT * FROM {}.{}
    WHERE comment_body like '{}%' and source = '{}' LIMIT 100""".format(
    database_name, table_comment, commentbody, source
)

df = pd.read_sql(statement, conn)
df.head(5)


In [None]:
# Setup to create Parquet
ingest_create_athena_table_parquet_passed = False

# Set S3 path to Parquet data
s3_path_parquet = "s3://{}/ADS508_project/parquet".format(bucket)

table_parquet = "comment_parquet"

In [None]:
# SQL statement to execute (remove comment_id, user_name,)
statement = """CREATE TABLE IF NOT EXISTS {}.{}
WITH (format = 'PARQUET', external_location = '{}', partitioned_by = ARRAY['candidatepoll']) AS
SELECT DISTINCT user_location,
         user_state,
         comment_body,
         source,
         candidatepoll
FROM {}.{}
where candidatepoll is not null AND Length(comment_body)>1 AND length(user_location)>1""".format(
    database_name, table_parquet, s3_path_parquet, database_name, table_comment
)
pd.read_sql(statement, conn)

In [None]:
#load the Parquet partitions
statement = "MSCK REPAIR TABLE {}.{}".format(database_name, table_parquet)

df = pd.read_sql(statement, conn)
df.head(5)

In [None]:
statement = "SHOW PARTITIONS {}.{}".format(database_name, table_parquet)
df_partitions = pd.read_sql(statement, conn)
df_partitions.head(5)

In [None]:
statement = "SHOW TABLES in {}".format(database_name)
df_partitions = pd.read_sql(statement, conn)
df_partitions.head(5)

In [None]:
candidatepoll = "Trump"

statement = """SELECT * FROM {}.{}
    Where candidatepoll = '{}' LIMIT 10""".format(
    database_name, table_parquet, candidatepoll
)
df_parquet = pd.read_sql(statement, conn)
df_parquet.head(5)

## Query Data

In [None]:
# Setup and check pre-requisites to create Database

!pip install --disable-pip-version-check -q awswrangler
import awswrangler as wr

In [None]:
t_filter = lambda x: x["candidatepoll"] == "Trump"
b_filter = lambda x: x["candidatepoll"] == "Biden"

In [None]:
path =  "s3://{}/ADS508_project/parquet".format(bucket)

df_parquet_results = wr.s3.read_parquet(
    path, columns=["comment_body", "source", "user_location", "candidatepoll"], partition_filter=t_filter, dataset=True
)

#"user_state", 


In [None]:
df_parquet_results_trump = df_parquet_results.drop_duplicates(subset = ['comment_body'])
df_parquet_results_trump.shape
df_parquet_results_trump.head(5)

In [None]:
path =  "s3://{}/ADS508_project/parquet".format(bucket)

# Call all public discourse data from 
df_parquet_results = wr.s3.read_parquet(
    path, columns=["comment_body", "source", "user_location", "candidatepoll"], partition_filter=b_filter, dataset=True
)

# "user_state",
df_parquet_results_biden = df_parquet_results.drop_duplicates(subset = ['comment_body'])
df_parquet_results_biden.shape
df_parquet_results_biden.head(5)

# Data Exploration before Any Feature Transformation

In [None]:
# Checking for missing values from trump data
df_parquet_results_trump.info()

In [None]:
# Check for missing values from  biden data
df_parquet_results_biden.info()

In [None]:
# Data Distributions for Trump and Biden 
df_parquet_results_trump['source'].value_counts()


In [None]:
# Data Distributions for Trump and Biden 
df_parquet_results_biden['source'].value_counts()

In [None]:
# Filtering Data by Location 
df_parquet_results_trump['user_location'].value_counts()

In [None]:
# Top 30 locations of social media activity
df_parquet_results_trump['user_location'].value_counts().head(30)

In [None]:
df_parquet_results_biden['user_location'].value_counts()

In [None]:
# Top 30 locations of social media activity
df_parquet_results_biden['user_location'].value_counts().head(30)

# Create EDA Overview Data Visualization

In [None]:
# Data Sources for Each Candidate
# Code from ChatGPT

import matplotlib.pyplot as plt

candidate = ['Trump','Biden']
twitter_n = [df_parquet_results_trump['source'].value_counts()[0],df_parquet_results_biden['source'].value_counts()[0]]
nyt_n = [df_parquet_results_trump['source'].value_counts()[1],df_parquet_results_biden['source'].value_counts()[1]]

x = range(len(candidate))

# Plotting the bars
plt.bar(x, twitter_n, width=0.4, label='Twitter Mentions', align='center')
plt.bar(x, nyt_n, width=0.4, label='NYT Mentions', align='edge')

# Adding labels
plt.xlabel('Candidates')
plt.ylabel('Number of Mentions')
plt.title('Twitter and NYT Mentions by Candidate')
plt.xticks(x, candidate)
plt.legend()

# Show plot
plt.savefig('DataDist.png')

## Standardizing Text Entries and Location Data

In [None]:
!pip install emoji

In [None]:
import re
import emoji

# Function to preprocess text data
def preprocess_text(text):
    # Remove punctuation
    text = re.sub(r'[^\w\s]', ' ', text)
    # Replace emojis with text labels
    text = emoji.demojize(text)
    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)
    # Remove "#" from hashtags
    text = re.sub(r'#', '', text)
    # Remove "@" from mentions
    text = re.sub(r'@', '', text)
    # Remove extra spaces while preserving at least one space between words
    text = re.sub(r'\s+', ' ', text)
    # Check if the text consists only of numbers
    if text.isdigit():
        text = "NA"
    return text



In [None]:
df_parquet_results_biden['user_location'] = df_parquet_results_biden['user_location'].astype(str)
df_parquet_results_trump['user_location'] = df_parquet_results_trump['user_location'].astype(str)

In [None]:
# Apply preprocessing to text columns in the dataframe
df_parquet_results_biden['clean_text'] = df_parquet_results_biden['comment_body'].apply(preprocess_text)

In [None]:
df_parquet_results_biden['clean_location'] = df_parquet_results_biden['user_location'].apply(preprocess_text)

In [None]:
df_parquet_results_trump['clean_text'] = df_parquet_results_trump['comment_body'].apply(preprocess_text)

In [None]:
df_parquet_results_trump['clean_location'] = df_parquet_results_trump['user_location'].apply(preprocess_text)

In [None]:
df_parquet_results_trump.head()

In [None]:
df_parquet_results_biden.head()

# Word Clouds with Comment Body Text

In [None]:
!pip install WordCloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re

# Sample a subset of comments for generating the word cloud
sampled_trump_comments = df_parquet_results_trump['clean_text'].dropna().sample(n=10000, random_state=42)

# Concatenate sampled comments for Trump
trump_comments_text = ' '.join(sampled_trump_comments)

# Define words and patterns to exclude
words_to_exclude = ['trump', 'donald', 'donaldtrump', 'https', 'co', 'st'] 
patterns_to_exclude = [r'\btrump\b', r'\bdonald\b', r'\bdonaldtrump\b', r'https?://\S+']

# Combine words and patterns to exclude
exclude_patterns = '|'.join(words_to_exclude + patterns_to_exclude)

# Preprocess text to remove specific words and patterns
trump_comments_text_cleaned = re.sub(exclude_patterns, '', trump_comments_text, flags=re.IGNORECASE)

# Generate word cloud for Trump comments with a limit on the number of words
wordcloud_trump = WordCloud(width=800, height=400, background_color='white', max_words=100).generate(trump_comments_text_cleaned)

# Plot the word cloud for Trump
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud_trump, interpolation='bilinear')
plt.title('Word Cloud for Trump Comments (Excluding "Trump" and "Donald")')
plt.axis('off')
plt.show()

In [None]:
# Sample a subset of comments for Biden
sampled_biden_comments = df_parquet_results_biden['clean_text'].dropna().sample(n=40000, random_state=42)

# Concatenate sampled comments for Biden
biden_comments_text = ' '.join(sampled_biden_comments)

# Define words and patterns to exclude
words_to_exclude = ['biden', 'joe', 'joebiden', 'president', 'election', 'https', 'co','amp']  # Exclude common words and URLs
patterns_to_exclude = [r'\bbiden\b', r'\bjoe\b', r'\bjoebiden\b', r'https?://\S+']

# Combine words and patterns to exclude
exclude_patterns = '|'.join(words_to_exclude + patterns_to_exclude)

# Preprocess text to remove specific words and patterns
biden_comments_text_cleaned = re.sub(exclude_patterns, '', biden_comments_text, flags=re.IGNORECASE)

# Generate word cloud for Biden comments with a limit on the number of words
wordcloud_biden = WordCloud(width=800, height=400, background_color='white', max_words=100).generate(biden_comments_text_cleaned)

# Plot the word cloud for Biden
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud_biden, interpolation='bilinear')
plt.title('Word Cloud for Biden Comments (Excluding "Biden" and "Joe")')
plt.axis('off')
plt.show()

# Feature Transformation and Data Cleaning

### Sentiment Labels Using Textblob and Stripping Out Non-English Text Entries with Fasttext.

### This code must be ran on Mac/Linux only. Windows is not compatible. Clean data from this code is stored in an S3 bucket and called below.

In [None]:
!pip install textblob
from textblob import TextBlob
def analyze_tweet_sentiment(tweet_text):
    analysis = TextBlob(tweet_text)
    # Return sentiment polarity (ranging from -1 to 1)
    return analysis.sentiment.polarity

In [None]:
# Apply sentiment labels to Trump Data
if __name__ == "__main__":
    # Assuming you have a DataFrame named df_tweets with a column 'text' containing tweet text
    df_tweets_trump = df_parquet_results_trump[['clean_text']]
    # Analyze sentiment for each tweet in the DataFrame
    df_tweets_trump['sentiment_score'] = df_tweets_trump['clean_text'].apply(analyze_tweet_sentiment)

# Add sentiment output to parent dataframe
df_parquet_results_trump['sentiment'] = df_tweets_trump['sentiment_score']

In [None]:
 # Apply Sentiment labels to Biden Data       
if __name__ == "__main__":
    # Assuming you have a DataFrame named df_tweets with a column 'text' containing tweet text
    df_tweets_biden = df_parquet_results_biden[['clean_text']]
    # Analyze sentiment for each tweet in the DataFrame
    df_tweets_biden['sentiment_score'] = df_tweets_biden['clean_text'].apply(analyze_tweet_sentiment)

# Add sentiment output to parent dataframe
df_parquet_results_biden['sentiment'] = df_tweets_biden['sentiment_score']


In [None]:
df_parquet_results_trump.head(5)

In [None]:
df_parquet_results_biden.head(5)

## Combine candidate data

In [None]:
df_combined = pd.concat([df_parquet_results_biden, df_parquet_results_trump], ignore_index=True)

### Create sentiment labels

In [None]:
df_combined['sentiment_category'] = df_combined['sentiment'].apply(lambda x: 'negative' if x < 0 else ('positive' if x > 0 else 'neutral'))

In [None]:

df_combined.head()


In [None]:
df_combined.shape

# Additional EDA with Clean Language Data

## Trump Data

In [None]:
# Examine Trump Sentiment distribution
df_combined[df_combined['candidatepoll'] == 'Trump']['sentiment'].value_counts()

In [None]:
df_combined[(df_combined['candidatepoll'] == 'Trump') & (df_combined['sentiment'] < 0)].head(5)

In [None]:
df_combined[(df_combined['candidatepoll'] == 'Trump') & (df_combined['sentiment'] == 0)].head(5)

In [None]:
df_combined[(df_combined['candidatepoll'] == 'Trump') & (df_combined['sentiment'] > 0)].head(5)

In [None]:
plt.hist(df_combined[df_combined['candidatepoll'] == 'Trump']['sentiment'], bins=30, edgecolor='black')

# Adding labels
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.title('Distribution of Sentiment Scores for Trump')

# Show plot
plt.savefig('TrumpSentDist.png')

### Biden Data

In [None]:
df_combined[df_combined['candidatepoll'] == 'Biden']['sentiment'].value_counts()

In [None]:
df_combined[(df_combined['candidatepoll'] == 'Biden') & (df_combined['sentiment'] < 0)].head(5)

In [None]:
df_combined[(df_combined['candidatepoll'] == 'Biden') & (df_combined['sentiment'] == 0)].head(5)

In [None]:
df_combined[(df_combined['candidatepoll'] == 'Biden') & (df_combined['sentiment'] > 0)].head(5)

In [None]:
plt.hist(df_combined[df_combined['candidatepoll'] == 'Biden']['sentiment'], bins=30, edgecolor='black')

# Adding labels
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.title('Distribution of Sentiment Scores for Biden')

# Show plot
plt.savefig('BidenSentDist.png')

## Data Source Proportions with Clean Data

In [None]:
candidate = ['Trump','Biden']
twitter_n = [df_combined[df_combined['candidatepoll'] == 'Trump']['source'].value_counts()[0], 
             df_combined[df_combined['candidatepoll'] == 'Biden']['source'].value_counts()[0]]
nyt_n = [df_combined[df_combined['candidatepoll'] == 'Trump']['source'].value_counts()[1], 
         df_combined[df_combined['candidatepoll'] == 'Biden']['source'].value_counts()[1]]

x = range(len(candidate))

# Plotting the bars
plt.bar(x, twitter_n, width=0.4, label='Twitter Mentions', align='center')
plt.bar(x, nyt_n, width=0.4, label='NYT Mentions', align='edge')

# Adding labels
plt.xlabel('Candidates')
plt.ylabel('Number of Mentions')
plt.title('Twitter and NYT Mentions by Candidate')
plt.xticks(x, candidate)
plt.legend()

# Show plot
plt.savefig('DataDist.png')

# Additional Feature Engineering

## Create Outcome Variable

In [None]:
df_combined['SentimentOutcome'] = df_combined['candidatepoll']+"_"+df_combined['sentiment_category']

In [None]:
import matplotlib.pyplot as plt
category_counts = df_combined['SentimentOutcome'].value_counts()
plt.bar(category_counts.index, category_counts.values)
plt.xlabel('Candidate Sentiment Categories')
plt.xticks(rotation=45)  # Rotate x-axis labels by 45 degrees
plt.ylabel('Count')
plt.title('Frequency of Each Possible Sentiment Outcome')
plt.show()

## Smallest category is Biden_negative. When using train/test/split with 90/5/5 ratio, we will need balancing.

In [None]:
 df_combined[df_combined['candidatepoll'] == 'Biden']['SentimentOutcome'].value_counts()

In [None]:
 df_combined[df_combined['candidatepoll'] == 'Trump']['SentimentOutcome'].value_counts()

In [None]:
df_combined.head(5)


# Save Data to S3 bucket

In [None]:
# Specify the S3 bucket name
prefix = "ADS508_project/cleandata/"

# List of corresponding file names
file_name = "cleandata.csv"

# Create an S3 client
s3 = boto3.client('s3')

#refine df_combined
df_combined_ref = df_combined[['source', 'clean_text', 'clean_location', 'candidatepoll', 'SentimentOutcome']]
df_combined_ref.head()

In [None]:
# Upload each file to S3
df_combined_ref.to_csv(file_name, index=False)
s3.upload_file(file_name, bucket, prefix + file_name)

In [None]:
# Create temp table of df_combined
temp_table = "temp_table"
s3_private_path_cleandata = "s3://{}/ADS508_project/cleandata".format(bucket)

statement = """
CREATE EXTERNAL TABLE IF NOT EXISTS {}.{} (
    source VARCHAR(20),
    clean_text VARCHAR(250),
    clean_location VARCHAR(25),
    candidatepoll VARCHAR(10),
    SentimentOutcome VARCHAR(10)
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')
""".format(
    database_name, temp_table, s3_private_path_cleandata
)

pd.read_sql(statement, conn)

In [None]:
# map df_combined/ cleandata with uscities by city
temp_table = "temp_table_city"
import time
from datetime import datetime
from time import strftime
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(f"start: {timestamp}\n")

statement = """
CREATE TABLE IF NOT EXISTS {}.{} AS (
SELECT DISTINCT 
    t.source, 
    t.clean_text, 
    t.clean_location, 
    t.candidatepoll, 
    t.SentimentOutcome, 
    u.state_id
    FROM  dbpoliticpulsecomment.temp_table t
    LEFT JOIN dbpoliticpulsecomment.uscities u
        ON lower(t.clean_location) = lower(u.city)
    where state_id <> ''
)
""".format(database_name, temp_table)
pd.read_sql(statement, conn)

timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(f"end: {timestamp}\n")

In [None]:
# map df_combined/ cleandata with uscities by statename
temp_table = "temp_table_statename"
import time
from datetime import datetime
from time import strftime
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(f"start: {timestamp}\n")

statement = """
CREATE TABLE IF NOT EXISTS {}.{} AS (
SELECT DISTINCT 
    t.source, 
    t.clean_text, 
    t.clean_location, 
    t.candidatepoll, 
    t.SentimentOutcome, 
    u.state_id
    FROM  dbpoliticpulsecomment.temp_table t
    LEFT JOIN dbpoliticpulsecomment.uscities u
        ON lower(t.clean_location) = lower(u.state_name)
    where state_id <> ''
)
""".format(database_name, temp_table)
pd.read_sql(statement, conn)

timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(f"end: {timestamp}\n")

In [None]:
# map df_combined/ cleandata with uscities by stateid
temp_table = "temp_table_stateid"
import time
from datetime import datetime
from time import strftime
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(f"start: {timestamp}\n")

statement = """
CREATE TABLE IF NOT EXISTS {}.{} AS (
SELECT DISTINCT 
    t.source, 
    t.clean_text, 
    t.clean_location, 
    t.candidatepoll, 
    t.SentimentOutcome, 
    u.state_id
    FROM  dbpoliticpulsecomment.temp_table t
    LEFT JOIN dbpoliticpulsecomment.uscities u
        ON lower(t.clean_location) = lower(u.state_id)
    where state_id <> ''
)
""".format(database_name, temp_table)
pd.read_sql(statement, conn)

timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(f"end: {timestamp}\n")

In [None]:
# Create a combine of temp_table_state_id, _statename, _city
table_name = "cleandata"
import time
from datetime import datetime
from time import strftime
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(f"start: {timestamp}\n")

statement = """
CREATE TABLE IF NOT EXISTS {}.{} AS (
SELECT DISTINCT 
    source, 
    clean_text, 
    clean_location, 
    candidatepoll, 
    SentimentOutcome, 
    state_id
    FROM  dbpoliticpulsecomment.temp_table_stateid
UNION
SELECT DISTINCT 
    source, 
    clean_text, 
    clean_location, 
    candidatepoll, 
    SentimentOutcome, 
    state_id
    FROM  dbpoliticpulsecomment.temp_table_city
UNION
SELECT DISTINCT 
    source, 
    clean_text, 
    clean_location, 
    candidatepoll, 
    SentimentOutcome, 
    state_id
    FROM  dbpoliticpulsecomment.temp_table_statename
)
""".format(database_name, table_name)
pd.read_sql(statement, conn)
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(f"end: {timestamp}\n")

In [None]:
# Drop all unused tables from Athena
drop_table_names = ["temp_table", "temp_table_statename", "temp_table_stateid", "temp_table_city"]

for table_name in drop_table_names:
    table_name = table_name

    statement = """
    DROP TABLE {}.{}
    """.format(database_name, table_name)
    
    pd.read_sql(statement, conn)

In [None]:
#==> SHOULD CREATE ANOTHER PARQUET OR READING FROM THIS? 
table_name = "cleandata"
statement = """SELECT * FROM {}.{}""".format(
    database_name, table_name
)
df_combined_clean = pd.read_sql(statement, conn)
df_combined_clean.head(5)

In [None]:
df_combined_clean.shape

# Examine State IDs by Outcome

In [None]:
df_final = df_combined_clean[['clean_text', 'state_id', 'sentimentoutcome']]

In [None]:
df_final['state_id'].value_counts()

In [None]:
# Step 1: Filter the DataFrame for the top 10 states
top_5_states = df_final['state_id'].value_counts().head(5).index
filtered_df = df_final[df_final['state_id'].isin(top_5_states)]

# Step 2: Create a cross-tabulation (crosstab) with 'state_id' and 'sentimentoutcome'
cross_tab = pd.crosstab(filtered_df['state_id'], filtered_df['sentimentoutcome'])

# Display the cross-tabulation
print(cross_tab)

In [None]:
df_final.head()

# Upload Final Data to S3

In [None]:
# Specify the S3 bucket name
prefix = "ADS508_project/cleandata/"

# List of corresponding file names
file_name = "final_data.csv"

# Create an S3 client
s3 = boto3.client('s3')

df_final.to_csv(file_name, index=False)
s3.upload_file(file_name, bucket, prefix + file_name)

# Load Final Data from S3 Bucket for Modeling

In [4]:
# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/final_data.csv'

# Read CSV file from S3 bucket into DataFrame
df_final = pd.read_csv(f's3://{bucket_name}/{file_path}')

df_final.head()

Unnamed: 0,clean_text,state_id,sentimentoutcome
0,H A Hyde Agreed I wonder how many trumpers kn...,NY,Trump_posi
1,FunkyIrishman I would love that but if SCOTUS...,LA,Trump_posi
2,Remember when Trump said What have you got to...,CT,Trump_neut
3,Seems to me that current events mean that Tru...,CA,Trump_nega
4,One of the reasons we have Donald Trump in off...,NJ,Trump_posi


In [5]:
df_final.shape

(330248, 3)

## Autopilot 

In [None]:
# # Sample about 10% of the data without replacement ---just to test------
# df_sampled = df_final.sample(frac=0.1, random_state=42)
# # Save the sampled DataFrame to a CSV file, without the index
# df_sampled.to_csv("df_sampled.csv", index=False)

In [None]:
# Save the DataFrame to a CSV file, without the index
df_final.to_csv("df_final.csv", index=False)

In [None]:
import boto3
from botocore.exceptions import NoCredentialsError

bucket_name = '508group'  # Replace with your bucket name
file_name = "df_final.csv"  
key = 'ADS508_project/cleandata/df_final.csv'  

try:
    s3.upload_file(file_name, bucket_name, key)
    print(f"File uploaded successfully to s3://{bucket_name}/{key}")
except FileNotFoundError:
    print("The file was not found")
except NoCredentialsError:
    print("Credentials not available")

In [None]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.automl.automl import AutoML

role = get_execution_role()
session = sagemaker.Session()

# Replace 'your_dataset.csv' with the name of your CSV file
input_data = 's3://508group/ADS508_project/cleandata/df_final.csv'

autopilot_job = AutoML(
    role=role,
    target_attribute_name='sentimentoutcome',  # This is the column you're predicting
    output_path='s3://508group/ADS508_project/output/autopilot/',
    max_candidates=20,
    sagemaker_session=session,
    problem_type='MulticlassClassification',  # Assuming sentimentoutcome is multiclass
    job_objective={'MetricName': 'Accuracy'}  # You can choose another metric if it fits better
)

autopilot_job.fit(inputs=input_data, wait=False, logs=True)


In [None]:
import boto3
import tarfile
import os

# Initialize the S3 client
s3 = boto3.client('s3')

# S3 bucket and object key
bucket_name = '508group'
object_key = 'ADS508_project/output/autopilot/automl-2024-03-30-19-51-00-981/data-processor-models/automl-2024-03-30-19-51-00-981-dpp9-1-89acfb1b856c4754873c58ddd/output/model.tar.gz'

# Local directory to extract the contents
extract_dir = "ADS508_project/output/autopilot/automl-2024-03-30-19-51-00-981/data-processor-models/automl-2024-03-30-19-51-00-981-dpp9-1-89acfb1b856c4754873c58ddd/output/model.tar.gz"

# Create the directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Download the file from S3
local_file_path = os.path.join(extract_dir, 'model.tar.gz')
s3.download_file(bucket_name, object_key, local_file_path)

# Open the tar file and extract the contents
with tarfile.open(local_file_path, "r:gz") as tar:
    tar.extractall(path=extract_dir)

# After extraction, you can further process or use the contents of the extracted files
# For example, list the extracted files
extracted_files = os.listdir(extract_dir)
print("Extracted files:", extracted_files)


In [None]:
# pip install -U scikit-learn

In [None]:
# Download the file from S3
local_file_path = os.path.join(extract_dir, 'model.tar.gz')
s3.download_file(bucket_name, object_key, local_file_path)

# Extract the contents of the tar.gz file
with tarfile.open(local_file_path, "r:gz") as tar:
    tar.extractall(path=extract_dir)



In [17]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.automl.automl import AutoML
import time
from datetime import datetime
from time import strftime
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(f"start: {timestamp}\n")
role = get_execution_role()
session = sagemaker.Session()
# Replace 'your_dataset.csv' with the name of your CSV file
#input_data = 's3://508group/ADS508_project/cleandata/df_final.csv'
input_data = 's3://sagemaker-us-east-1-851725336500/ADS508_project/cleandata/final_data.csv'
autopilot_job = AutoML(
    role=role,
    target_attribute_name='sentimentoutcome',  # This is the column you're predicting
    #output_path='s3://508group/ADS508_project/output/autopilot/',
    output_path= 's3://sagemaker-us-east-1-851725336500/ADS508_project/output/autopilot/',
    max_candidates=20,
    sagemaker_session=session,
    problem_type='MulticlassClassification',  # Assuming sentimentoutcome is multiclass
    job_objective={'MetricName': 'Accuracy'}  # You can choose another metric if it fits better
)
autopilot_job.fit(inputs=input_data, wait=True, logs=True)
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(f"start: {timestamp}\n")


start: 2024-04-01T18:34:14Z

.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
..start: 2024-04-01T20:16:24Z



# Top Model Object Keys

In [None]:
# XGBoost 1: Key Term "R-002"
683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.3-1-cpu-py3
s3://sagemaker-us-east-1-851725336500/ADS508_project/output/autopilot/automl-2024-04-01-18-34-15-211/tuning/automl-202-dpp6-xgb/automl-2024-04-01-18-34-15-211xR-002-6a1fdd29/output/model.tar.gz

In [None]:
# R-001
Name
automl-2024-04-01-18-34-15-211xR-001-416b37f8-aws-training-job
ARN
arn:aws:sagemaker:us-east-1:851725336500:experiment-trial-component/automl-2024-04-01-18-34-15-211xR-001-416b37f8-aws-training-job

#Image URI
683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.3-1-cpu-py3

# Instance Info - 1 instance - 50GB
ml.m5.12xlarge

#Lambda
0.015482990568956176

#Max depth
6

#Min child weight
0.0016695027194522027

#Num class
6

#Num round
967

#Objective
multi:softprob

# Model artifact
s3://sagemaker-us-east-1-851725336500/ADS508_project/output/autopilot/automl-2024-04-01-18-34-15-211/tuning/automl-202-dpp7-xgb/automl-2024-04-01-18-34-15-211xR-001-416b37f8/output/model.tar.gz

In [None]:
# pip install --upgrade numpy scipy

# Data Partition

### Assign input and outcome variables

In [6]:
X = df_final[['clean_text', 'state_id']]
y = df_final['sentimentoutcome']

### Create a 90/5/5 data split

In [7]:
from sklearn.model_selection import train_test_split

# Split data into 90% train and 10% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

# Split remaining data (90% train) into 90% train and 10% validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=1/9, random_state=1)

### Further undersample the training set so that each level of the outcome variable is equal

In [8]:
#!pip install imblearn
from imblearn.under_sampling import RandomUnderSampler

# Undersample the training set to balance the six categorical outcome levels
undersampler = RandomUnderSampler(sampling_strategy='all', random_state=1)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

## Verify split datasets size and datatypes

In [9]:
X_train_resampled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43860 entries, 53437 to 97982
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   clean_text  43860 non-null  object
 1   state_id    43860 non-null  object
dtypes: object(2)
memory usage: 1.0+ MB


In [10]:
# Update Datatypes
X_train_resampled['clean_text'] = X_train_resampled['clean_text'].astype(str)
X_train_resampled['state_id'] = X_train_resampled['state_id'].astype('category')

In [11]:
X_train_resampled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43860 entries, 53437 to 97982
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   clean_text  43860 non-null  object  
 1   state_id    43860 non-null  category
dtypes: category(1), object(1)
memory usage: 730.6+ KB


In [12]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33025 entries, 124161 to 206753
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   clean_text  33025 non-null  object
 1   state_id    33025 non-null  object
dtypes: object(2)
memory usage: 774.0+ KB


In [13]:
# Update Datatypes
X_test['clean_text'] = X_test['clean_text'].astype(str)
X_test['state_id'] = X_test['state_id'].astype('category')

In [14]:
X_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33025 entries, 237346 to 197012
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   clean_text  33025 non-null  object
 1   state_id    33025 non-null  object
dtypes: object(2)
memory usage: 774.0+ KB


In [15]:
# Update Datatypes
X_val['clean_text'] = X_val['clean_text'].astype(str)
X_val['state_id'] = X_val['state_id'].astype('category')

In [16]:
X_train_resampled.shape, X_test.shape, X_val.shape

((43860, 2), (33025, 2), (33025, 2))

In [17]:
X_train_resampled.head()

Unnamed: 0,clean_text,state_id
53437,If his base can still hate Biden after all th...,NY
113899,Biden didn t run against socialists There wer...,PA
97700,RepDougCollins Oh screw off PresidentElect Jo...,IL
106233,Lynn Biden voted for the bankruptcy bill that...,PA
160694,Joe Biden I believe being accountable means h...,OH


In [18]:
X_test.head()

Unnamed: 0,clean_text,state_id
124161,As Trump rids himself and the country of resp...,MA
3351,Joe Biden needs to sign up for anti racism tra...,NY
291818,I saw a Trump banner on a lawn the other day,MA
98845,Trump was caught by the virus and his incompet...,GA
207321,Trump s lawyers saw this coming Now we know wh...,VA


In [19]:
X_val.head()

Unnamed: 0,clean_text,state_id
237346,The attempt to cover up the latest covid 19 v...,RI
57321,In defense of at least some Trump supporters,CO
276637,The only remote chance of this happening is i...,TX
10263,Trump is seemingly having a meltdown which wi...,CO
313099,I filled out my ballot for Bloomberg but then...,CO


## Verify balance of training set

In [20]:
y_train_resampled.value_counts()

Biden_posi    7310
Trump_posi    7310
Biden_neut    7310
Trump_neut    7310
Trump_nega    7310
Biden_nega    7310
Name: sentimentoutcome, dtype: int64

## Average length of comments. 

In [None]:
# Function to count string word length
def count_word_length(text):
    words = text.split()
    return len(words)

# Add column to DataFrame
df_final['word_length'] = df_final['clean_text'].apply(lambda x: count_word_length(x))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
median_word_length = np.median(df_final['word_length'])

# Plot histogram of word lengths
plt.hist(df_final['word_length'], bins=range(min(df_final['word_length']), max(df_final['word_length']) + 1), edgecolor='black')
plt.axvline(median_word_length, color='red', linestyle='dashed', linewidth=1)  # Add vertical line at median
plt.text(median_word_length, plt.ylim()[1] * 0.9, f'Median: {median_word_length:.2f}', color='red')  # Add label for median
plt.xlabel('Word Length')
plt.ylabel('Frequency')
plt.title('Histogram of Word Lengths')
plt.grid(True)
plt.show()

# Text Entry Preprocessing and Data Transformation Pipeline: Will Need to Add Categorical One-Hot Encoding if we use Location Data

In [None]:
# Fit and transform on the training set
#X_train_transformed_text = pd.DataFrame(text_pipeline.fit_transform(X_train_resampled['clean_text']))

In [None]:
#X_train_transformed_state = pd.DataFrame(cat_pipeline.fit_transform(X_train_resampled))
#X_train_transformed_state.columns = ['state_id']

In [None]:
#X_train_transformed_text.head(2)

In [None]:
#X_train_transformed_state.head()

In [None]:
#X_train = pd.concat([X_train_transformed_text, X_train_transformed_state], axis=1)

## Modeling 

In [46]:
import boto3
import sagemaker
import pandas as pd

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

In [None]:
# pip install transformers

### Trying a different Preprocess, since I was having trouble with Word2Vec (This step ignores anything after word2vec)

### First function using "last_hidden_state" which did not work well with my PC

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm
import numpy as np

class DistilBERTTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Initialize the DistilBERT tokenizer and model
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.model.eval()  # Set model to evaluation mode
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, batch_size=32):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)
        embeddings = []
        for i in tqdm(range(0, len(X), batch_size)):
            batch = X[i:i+batch_size]
            inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
            with torch.no_grad():
                outputs = self.model(**inputs)
            embeddings.append(outputs.last_hidden_state.mean(dim=1).detach().cpu().numpy())  # Use mean pooling
        return np.vstack(embeddings)

### Second function directly appends outputs to embeddings without using last_hidden_state. Also shorted max_length of input characters to 64 to reduce computational overload.

In [25]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm
import numpy as np

class DistilBERTTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Initialize the DistilBERT tokenizer and model
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.model.eval()  # Set model to evaluation mode
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, batch_size=32):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)
        embeddings = []
        for i in tqdm(range(0, len(X), batch_size)):
            batch = X[i:i+batch_size]
            inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=64).to(device)
            with torch.no_grad():
                outputs = self.model(**inputs)
            # Adjust based on the actual structure of the outputs
            embeddings.append(outputs[0].mean(dim=1).detach().cpu().numpy())  # Use mean pooling
        return np.vstack(embeddings)

### BERT transformations

In [26]:
distilbert_transformer = DistilBERTTransformer()

In [21]:
# Transform 'clean_text' into DistilBERT embeddings for the training set
texts_train = X_train_resampled['clean_text'].tolist()  # Extract texts as a list
distilbert_embeddings_train = distilbert_transformer.transform(texts_train)

100%|██████████| 1371/1371 [38:14<00:00,  1.67s/it]


In [49]:
# Transform 'clean_text' into DistilBERT embeddings for the validation set
texts_val = X_val['clean_text'].tolist()  # Extract texts as a list
distilbert_embeddings_val = distilbert_transformer.transform(texts_val)

100%|██████████| 1033/1033 [29:03<00:00,  1.69s/it]


In [27]:
# Transform 'clean_text' into DistilBERT embeddings for the test set
texts_test = X_test['clean_text'].tolist()  # Extract texts as a list
distilbert_embeddings_test = distilbert_transformer.transform(texts_test)


100%|██████████| 1033/1033 [28:51<00:00,  1.68s/it]


In [22]:
distilbert_embeddings_train = pd.DataFrame(distilbert_embeddings_train)

In [50]:
distilbert_embeddings_val = pd.DataFrame(distilbert_embeddings_val)

In [28]:
distilbert_embeddings_test = pd.DataFrame(distilbert_embeddings_test)

### Store Transformed BERT Dats

In [23]:
# Specify the S3 bucket name
prefix = "ADS508_project/cleandata/"

# List of corresponding file names
file_name = "X_train_bert.csv"

# Create an S3 client
s3 = boto3.client('s3')

distilbert_embeddings_train.to_csv(file_name, index=False)
s3.upload_file(file_name, bucket, prefix + file_name)

In [51]:
# Specify the S3 bucket name
prefix = "ADS508_project/cleandata/"

# List of corresponding file names
file_name = "X_val_bert.csv"

# Create an S3 client
s3 = boto3.client('s3')

distilbert_embeddings_val.to_csv(file_name, index=False)
s3.upload_file(file_name, bucket, prefix + file_name)

In [29]:
# Specify the S3 bucket name
prefix = "ADS508_project/cleandata/"

# List of corresponding file names
file_name = "X_test_bert.csv"

# Create an S3 client
s3 = boto3.client('s3')

distilbert_embeddings_test.to_csv(file_name, index=False)
s3.upload_file(file_name, bucket, prefix + file_name)

# Modeling

## Create Datasets with just BERT Text Data and Outcome Variable: Retrieve Data From S3

In [4]:
# Add outcome column to training set
import pandas as pd
import boto3
from io import StringIO

# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_train_bert.csv'

# Read CSV file from S3 bucket into DataFrame
s3 = boto3.client('s3')
response = s3.get_object(Bucket=bucket_name, Key=file_path)
X_train_bert = pd.read_csv(response['Body'])

# Add a new column to the DataFrame
X_train_bert['sentimentoutcome'] = pd.Categorical(y_train_resampled)

# Write the modified DataFrame to a CSV file
csv_buffer = StringIO()
X_train_bert.to_csv(csv_buffer, index=False)

# Upload the updated CSV file to the S3 bucket, replacing the original file
s3.put_object(Bucket=bucket_name, Key=file_path, Body=csv_buffer.getvalue())

NameError: name 'y_train_resampled' is not defined

In [6]:
bucket

'sagemaker-us-east-1-851725336500'

In [5]:
# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_train_bert.csv'

# Read CSV file from S3 bucket into DataFrame
X_train_bert = pd.read_csv(f's3://{bucket_name}/{file_path}')

X_train_bert.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,sentimentoutcome
0,0.095015,0.009549,0.009942,0.037959,0.117014,-0.110813,0.070921,0.102977,-0.001028,-0.204761,...,0.135476,0.210159,-0.067416,0.237532,0.094727,0.022339,0.05976,0.073708,-0.043815,Biden_nega
1,-0.030366,0.01138,-0.126178,0.100002,-0.143618,-0.003824,-0.047498,0.257969,-0.085199,-0.151125,...,0.081773,-0.080077,-0.029285,0.260058,-0.097784,0.064009,-0.055928,0.078552,0.03535,Biden_nega
2,-0.102536,-0.057877,0.31423,0.242468,-0.050349,0.041889,0.005117,0.176137,-0.220375,-0.076621,...,0.025393,0.044405,-0.114249,0.018909,-0.033712,-0.06428,0.033023,0.009731,0.182539,Biden_nega
3,-0.12393,-0.242957,-0.052864,0.018666,0.195159,-0.103762,-0.161701,0.194002,-0.289946,0.069238,...,-0.057393,0.021898,-0.096951,0.259217,-0.167901,0.003082,0.08263,0.058202,-0.008902,Biden_nega
4,-0.030342,0.037465,0.045135,0.089331,0.134947,-0.083918,0.220986,0.149148,0.01307,-0.272415,...,-0.028953,-0.048934,-0.170933,0.067988,0.075548,0.262833,0.062096,-0.076311,-0.024085,Biden_nega


In [31]:
# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_val_bert.csv'

# Read CSV file from S3 bucket into DataFrame
s3 = boto3.client('s3')
response = s3.get_object(Bucket=bucket_name, Key=file_path)
X_val_bert = pd.read_csv(response['Body'])

# Add a new column to the DataFrame
X_val_bert['sentimentoutcome'] = pd.Categorical(y_val)

# Write the modified DataFrame to a CSV file
csv_buffer = StringIO()
X_val_bert.to_csv(csv_buffer, index=False)

# Upload the updated CSV file to the S3 bucket, replacing the original file
s3.put_object(Bucket=bucket_name, Key=file_path, Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'P19XDWJKAZMRECFR',
  'HostId': 'svhXrZLGLIyDebehAB+nx8SlhwVuBC1QKj94HNeZECksI0r0f7pNczJ7AGUgoW6SVLGYEIR0veY/fwgREAwprpy1Z7pF6TKe',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'svhXrZLGLIyDebehAB+nx8SlhwVuBC1QKj94HNeZECksI0r0f7pNczJ7AGUgoW6SVLGYEIR0veY/fwgREAwprpy1Z7pF6TKe',
   'x-amz-request-id': 'P19XDWJKAZMRECFR',
   'date': 'Mon, 01 Apr 2024 03:00:17 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"7fdd686062d1e69f314a96c19a6b6284"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"7fdd686062d1e69f314a96c19a6b6284"',
 'ServerSideEncryption': 'AES256'}

In [7]:
# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_val_bert.csv'

# Read CSV file from S3 bucket into DataFrame
X_val_bert = pd.read_csv(f's3://{bucket_name}/{file_path}')

X_val_bert.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,sentimentoutcome
0,-0.087282,0.015699,0.163972,-0.02312,0.160004,-0.31749,0.082573,0.487979,-0.132335,-0.163486,...,0.030014,-0.100873,-0.190175,0.11193,-0.114476,0.084147,-0.032507,0.089214,0.02656,Trump_posi
1,-0.072381,-0.14475,-0.076681,0.140001,0.18423,-0.107117,-0.071758,-0.061391,-0.001518,0.027018,...,0.139179,-0.009953,-0.039035,0.309489,-0.120442,0.107665,0.025695,-0.09399,0.230828,Trump_nega
2,-0.072684,-0.090894,0.43454,-0.000376,0.148673,-0.341517,0.052663,0.359947,-0.205352,-0.199319,...,0.145761,0.048037,-0.124676,0.020781,0.037728,0.027285,-0.238239,0.168148,0.078734,Biden_nega
3,-0.067866,0.101516,0.343076,0.094347,0.133979,-0.237478,0.147569,0.246141,0.057791,-0.107479,...,0.078337,0.029585,-0.024304,0.10173,0.087064,-0.085877,0.05797,0.013239,-0.048011,Trump_neut
4,-0.076768,-0.25942,0.07952,0.138926,0.019723,-0.047109,-0.144468,0.395104,-0.054171,-0.049444,...,0.271528,-0.114425,-0.26837,0.205715,0.008526,-0.055535,-0.014211,0.056284,0.278866,Biden_posi


In [33]:
# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_test_bert.csv'

# Read CSV file from S3 bucket into DataFrame
s3 = boto3.client('s3')
response = s3.get_object(Bucket=bucket_name, Key=file_path)
X_test_bert = pd.read_csv(response['Body'])

# Add a new column to the DataFrame
X_test_bert['sentimentoutcome'] = pd.Categorical(y_test)

# Write the modified DataFrame to a CSV file
csv_buffer = StringIO()
X_test_bert.to_csv(csv_buffer, index=False)

# Upload the updated CSV file to the S3 bucket, replacing the original file
s3.put_object(Bucket=bucket_name, Key=file_path, Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'QMBVAW4NA3HKZN5E',
  'HostId': 'eUlGAvB1Nq+R5W4RswzNTh8gKRq1D/n3do3uSgiOS09/kn7w/dKt38ECyHeyyvWJGZAZwoemaWapG+bNS5YAQOWvQb2uHHFl',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'eUlGAvB1Nq+R5W4RswzNTh8gKRq1D/n3do3uSgiOS09/kn7w/dKt38ECyHeyyvWJGZAZwoemaWapG+bNS5YAQOWvQb2uHHFl',
   'x-amz-request-id': 'QMBVAW4NA3HKZN5E',
   'date': 'Mon, 01 Apr 2024 03:02:24 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"01bddb3a97a16df883f728d1d924fdb7"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"01bddb3a97a16df883f728d1d924fdb7"',
 'ServerSideEncryption': 'AES256'}

In [8]:
# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_test_bert.csv'

# Read CSV file from S3 bucket into DataFrame
X_test_bert = pd.read_csv(f's3://{bucket_name}/{file_path}')

X_test_bert.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,sentimentoutcome
0,0.038298,0.018444,0.140237,0.191257,0.191615,-0.263421,0.006985,-0.03152,-0.095174,-0.165052,...,0.337145,0.058811,-0.195464,0.168021,-0.040112,0.135232,-0.01793,0.122643,-0.00439,Trump_neut
1,0.192483,-0.083949,0.087277,-0.0194,0.208303,-0.224665,0.218658,0.155049,0.102606,-0.06323,...,0.037749,-0.052559,-0.053232,0.04576,0.028098,-0.132751,0.040613,-0.162681,0.134521,Biden_neut
2,0.167566,-0.217359,0.209112,0.152782,0.266532,-0.304838,-0.209045,0.306407,-0.087472,-0.098689,...,-0.022195,-0.006585,0.054734,0.29709,0.044051,-0.023526,0.083975,0.111322,0.0926,Trump_nega
3,-0.033481,-0.084976,-0.095893,0.171605,0.064353,-0.187916,-0.159794,0.2202,0.068491,-0.063123,...,0.092172,-0.012548,-0.290175,0.09739,0.006686,0.110492,0.078122,-0.05434,0.211628,Trump_neut
4,-0.002182,0.003377,0.365349,0.220605,0.20591,-0.351755,-0.186874,0.361799,-0.048657,-0.218578,...,-0.088876,0.063544,-0.233847,-0.013211,-0.09774,-0.11284,-0.019173,0.103474,-0.174167,Trump_nega


In [None]:
all_bert = pd.concat([X_tra

# Modeling

## Create Datasets with both BERT Text Data and One-Hot Encoded Location Data

In [35]:
from sklearn.preprocessing import OneHotEncoder

# Initialize the OneHotEncoder
onehot_encoder = OneHotEncoder(sparse=False) 

# Assuming X_train, X_val, X_test are your datasets
state_ids_train = X_train['state_id'].values.reshape(-1, 1)  # Reshape for the encoder
state_id_encoded_train = onehot_encoder.fit_transform(state_ids_train)



In [43]:
state_ids_val = X_val['state_id'].values.reshape(-1, 1)
state_id_encoded_val = onehot_encoder.fit_transform(state_ids_val)

state_ids_test = X_test['state_id'].values.reshape(-1, 1)
state_id_encoded_test = onehot_encoder.fit_transform(state_ids_test)

In [None]:
#Combine with sate ID that is one hot encoded
X_train_prepared = np.hstack((X_train_bert, state_id_encoded_train))
X_val_prepared = np.hstack((X_val_bert, state_id_encoded_val))
X_test_prepared = np.hstack((X_test_bert, state_id_encoded_test))



In [None]:
X_train_prepared.head()

In [None]:
X_val_prepared.head()

In [None]:
X_test_prepared.head()

### Drop C-1 Dummies for Baseline Linear Models (i.e., logistic regression)

In [None]:
X_train_both = X_train_prepared

In [None]:
X_val_both = X_val_prepared

In [None]:
X_test_both = X_test_prepared

# Test Whether Location Data Improves Performance 

## Baseline 1: Logistic Regression with only BERT Text Data

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize the Logistic Regression model
log_reg1 = LogisticRegression(max_iter=1000, solver='lbfgs')

# Fit the model on the balanced text training data
log_reg1.fit(X_train_bert, y_train_resampled)

# Predict on the validation set
y_val_pred = log_reg1.predict(X_val_bert)

# Evaluate the model
print("Baseline 1: Validation Set Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nBasline 1: Validation Set Classification Report:\n", classification_report(y_val, y_val_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Baseline 1: Validation Set Accuracy: 0.5997880393641181

Basline 1: Validation Set Classification Report:
               precision    recall  f1-score   support

  Biden_nega       0.34      0.61      0.44       908
  Biden_neut       0.70      0.70      0.70      1768
  Biden_posi       0.70      0.60      0.65      2425
  Trump_nega       0.46      0.58      0.51      7220
  Trump_neut       0.66      0.61      0.64      8599
  Trump_posi       0.68      0.59      0.63     12105

    accuracy                           0.60     33025
   macro avg       0.59      0.62      0.59     33025
weighted avg       0.62      0.60      0.61     33025



## Baseline 2: Logistic Regression with Text Data and Location (c-1 dummies)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize the Logistic Regression model
log_reg2 = LogisticRegression(max_iter=1000, solver='lbfgs')

# Fit the model on the balanced text training data
log_reg2.fit(X_train_both, y_train_resampled)

# Predict on the validation set
y_val_pred = log_reg2.predict(X_val_both)

# Evaluate the model
print("Baseline 2: Validation Set Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nBaselin 2: Validation Set Classification Report:\n", classification_report(y_val, y_val_pred))

# Neural Network Models (Can use all dummies)

In [54]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.automl.automl import AutoML

role = get_execution_role()
session = sagemaker.Session()

# Specify the S3 bucket name and file path
bucket_name = bucket
file_path = 'ADS508_project/cleandata/X_train_bert.csv'
input_data = s3://{bucket_name}/{file_path}'

autopilot_job = AutoML(
    role=role,
    target_attribute_name='sentimentoutcome',  # This is the column you're predicting
    output_path='s3://ADS508_project/output/autopilot/',
    max_candidates=15,
    sagemaker_session=session,
    problem_type='MulticlassClassification',  # Assuming sentimentoutcome is multiclass
    job_objective={'MetricName': 'Accuracy'}  # You can choose another metric if it fits better
)

autopilot_job.fit(inputs=input_data, wait=False, logs=True)

NameError: name 'ads508' is not defined

## MLP Model 1

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Use a pipeline to standardize features
mlp_pipeline = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, activation='relu', solver='adam', verbose=True, tol=0.001, alpha=0.0001)
)

# Fit the model
mlp_pipeline.fit(X_train_prepared, y_train)

# Predict and evaluate
y_val_pred = mlp_pipeline.predict(X_val_prepared)
print("Validation Set Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Set Classification Report:\n", classification_report(y_val, y_val_pred))


## MLP Model

In [None]:
from sklearn.neural_network import MLPClassifier

# Initialize the MLP Classifier

mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, activation='relu', solver='adam', verbose=True, tol=0.01)


# Fit the model on the training data
mlp.fit(X_train_prepared, y_train)

# Predict on the validation set
y_val_pred_mlp = mlp.predict(X_val_prepared)

# Evaluate the model
print("Validation Set Accuracy (MLP):", accuracy_score(y_val, y_val_pred_mlp))
print("\nValidation Set Classification Report (MLP):\n", classification_report(y_val, y_val_pred_mlp))


## Transformer 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Define device upfront
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assuming X_train_prepared, X_val_prepared, X_test_prepared and their respective y's are already defined

# Encode labels numerically
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# Custom Dataset
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "features": torch.tensor(self.features[idx], dtype=torch.float),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Dataset and DataLoader
train_dataset = CustomDataset(X_train_prepared, y_train_encoded)
val_dataset = CustomDataset(X_val_prepared, y_val_encoded)
test_dataset = CustomDataset(X_test_prepared, y_test_encoded)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# Model Definition
class Classifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = Classifier(input_size=X_train_prepared.shape[1], num_classes=len(set(y_train_encoded)))
model.to(device)

# Training
optimizer = Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

epochs = 3
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        features, labels = batch['features'].to(device), batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    

# Evaluation
model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    for batch in test_loader:
        features, labels = batch['features'].to(device), batch['labels'].to(device)
        outputs = model(features)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

print("Test Accuracy:", accuracy_score(y_true, y_pred))
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))



In [None]:
# import boto3
# import pandas as pd
# from io import StringIO

# # Create an S3 client
# s3 = boto3.client('s3')

# # Specify the name of the bucket
# bucket_name = '508group'

# # Ensure your DataFrame variables are defined here (X_train, y_train_resampled, X_test, y_test, X_val, y_val)

# # Define the DataFrames
# data_frames = {
#     'X_train': X_train,
#     'y_train_resampled': y_train_resampled,
#     'X_test': X_test,
#     'y_test': y_test,
#     'X_val': X_val,
#     'y_val': y_val,
# }

# # Upload each DataFrame as CSV to the "508group" bucket
# for key, df in data_frames.items():
#     try:
#         # Convert DataFrame to CSV string
#         csv_buffer = StringIO()
#         df.to_csv(csv_buffer, index=False)
        
#         # Upload CSV string to S3
#         s3.put_object(Body=csv_buffer.getvalue(), Bucket=bucket_name, Key=f'ADS508_project/cleandata/{key}.csv')
#         print(f"Successfully uploaded {key}")
#     except Exception as e:
#         print(f"Error uploading {key} to S3: {e}")

# # Define the S3 paths for the data now that they are uploaded
# s3_paths = {
#     'X_train': 's3://508group/ADS508_project/cleandata/X_train.csv',
#     'y_train_resampled': 's3://508group/ADS508_project/cleandata/y_train_resampled.csv',
#     'X_test': 's3://508group/ADS508_project/cleandata/X_test.csv',
#     'y_test':  's3://508group/ADS508_project/cleandata/y_test.csv',
#     'X_val': 's3://508group/ADS508_project/cleandata/X_val.csv',
#     'y_val': 's3://508group/ADS508_project/cleandata/y_val.csv',
# }
# output_path = 's3://508group/ADS508_project/output/'

# You can now use `s3_paths` dictionary to access your data in S3 for any further processing


In [None]:
# !pip install -U sagemaker

In [None]:
# pip install XGBoost

In [None]:
# import boto3
# import pandas as pd
# from io import StringIO

# # Create an S3 client
# s3 = boto3.client('s3')

# # Specify the name of the bucket
# bucket_name = '508group'

# # Define the DataFrames
# data_frames = {
#     'X_train': X_train,
#     'y_train_resampled': y_train_resampled,
#     'X_test': X_test,
#     'y_test': y_test,
#     'X_val': X_val,
#     'y_val': y_val,
# }

# # Upload each DataFrame as CSV to the "508group" bucket
# for key, df in data_frames.items():
#     try:
#         # Convert DataFrame to CSV string
#         csv_buffer = StringIO()
#         df.to_csv(csv_buffer, index=False)
        
#         # Upload CSV string to S3
#         s3.put_object(Body=csv_buffer.getvalue(), Bucket=bucket_name, Key=f'ADS508_project/cleandata/{key}.csv')
#     except Exception as e:
#         print(f"Error uploading {key} to S3: {e}")


In [None]:
# # Define the S3 paths for training data, model output, etc.
# X_train = 's3://508group/ADS508_project/cleandata/X_train.csv'
# y_train_resampled = 's3://508group/ADS508_project/cleandata/y_train_resampled.csv'
# X_test = 's3://508group/ADS508_project/cleandata/X_test.csv'
# y_test =  's3://508group/ADS508_project/cleandata/y_test.csv'
# X_val = 's3://508group/ADS508_project/cleandata/X_val.csv'
# y_val = 's3://508group/ADS508_project/cleandata/y_val.csv'
# output_path = 's3://508group/ADS508_project/output/'


In [None]:
# import sagemaker
# from sagemaker import get_execution_role
# from sagemaker.inputs import TrainingInput
# from sagemaker.estimator import Estimator

# # Step 1: Define S3 paths for training data, model output, etc.
# s3_input_train = 's3://508group/ADS508_project/cleandata/X_train.csv'
# s3_input_test = 's3://508group/ADS508_project/cleandata/X_test.csv'
# output_path = 's3://508group/ADS508_project/output/'

# # Step 2: Create a SageMaker session and specify the IAM role
# sagemaker_session = sagemaker.Session()
# role = get_execution_role()

# # Step 3: Define the SageMaker XGBoost estimator with the specific version and hyperparameters
# xgb_estimator = Estimator(image_uri=sagemaker.image_uris.retrieve("xgboost", sagemaker_session.boto_region_name, "1.7-1"),
#                           role=role,
#                           instance_count=1,
#                           instance_type='ml.m5.large',
#                           output_path=output_path,
#                           sagemaker_session=sagemaker_session,
#                           hyperparameters={'num_round': '10'})  # Specify the number of boosting rounds

# # Step 4: Train the XGBoost model
# xgb_estimator.fit({'train': s3_input_train, 'validation': s3_input_test})

In [None]:
# import boto3
# import os

# # Initialize a boto3 client
# s3 = boto3.client('s3')

# # Function to download a file from an S3 bucket
# def download_file(bucket_name, object_key, local_filename):
#     try:
#         s3.download_file(Bucket=bucket_name, Key=object_key, Filename=local_filename)
#         print(f"Downloaded {object_key} to {local_filename}")
#     except Exception as e:
#         print(f"Error downloading {object_key}: {e}")

# # Define your bucket name
# bucket_name = '508group'

# # List of artifacts to download
# artifacts = {
#     "feature_engineering_code": "ADS508_project/output/autopilot/automl-2024-03-30-19-48-01-733/sagemaker-automl-candidates/automl-2024-03-30-19-48-01-733-pr-1-5cdcc23566e2483f97658942125/generated_module/candidate_data_processors/dpp9.py",
#     # Add other artifacts here following the same structure
# }

# # Download each artifact
# for name, key in artifacts.items():
#     download_file(bucket_name, key, f"{name}.py")


In [None]:
# import boto3
# import os

# # Define key for the feature engineering code
# fe_code_key = 'ADS508_project/output/autopilot/automl-2024-03-30-19-48-01-733/sagemaker-automl-candidates/automl-2024-03-30-19-48-01-733-pr-1-5cdcc23566e2483f97658942125/generated_module/candidate_data_processors/dpp3.py'
# download_from_s3(bucket_name, fe_code_key, 'dpp3.py')

In [None]:
# import boto3
# import os

# s3 = boto3.client('s3')

# def download_from_s3(bucket_name, s3_key, local_path):
#     try:
#         s3.download_file(bucket_name, s3_key, local_path)
#         print(f"Downloaded {s3_key} to {local_path}")
#     except Exception as e:
#         print(f"Error downloading {s3_key}: {e}")

# bucket_name = '508group' 

In [None]:
# import boto3
# import pandas as pd
# from io import StringIO

# # Create an S3 client
# s3 = boto3.client('s3')

# # Specify the name of the bucket
# bucket_name = '508group'

# # Define the DataFrames
# data_frames = {
#     'X_train': X_train,
#     'y_train_resampled': y_train_resampled,
#     'X_test': X_test,
#     'y_test': y_test,
#     'X_val': X_val,
#     'y_val': y_val,
# }

# # Upload each DataFrame as CSV to the "508group" bucket
# for key, df in data_frames.items():
#     try:
#         # Convert DataFrame to CSV string
#         csv_buffer = StringIO()
#         df.to_csv(csv_buffer, index=False)
        
#         # Upload CSV string to S3
#         s3.put_object(Body=csv_buffer.getvalue(), Bucket=bucket_name, Key=f'ADS508_project/cleandata/{key}.csv')
#     except Exception as e:
#         print(f"Error uploading {key} to S3: {e}")



In [None]:
# import sagemaker
# from sagemaker.tensorflow import TensorFlow

# # Define your SageMaker session
# sagemaker_session = sagemaker.Session()

In [None]:
# # Define the S3 paths for training data, model output, etc.
# X_train = 's3://508group/ADS508_project/cleandata/X_train.csv'
# y_train_resampled = 's3://508group/ADS508_project/cleandata/y_train_resampled.csv'
# X_test = 's3://508group/ADS508_project/cleandata/X_test.csv'
# y_test =  's3://508group/ADS508_project/cleandata/y_test.csv'
# X_val = 's3://508group/ADS508_project/cleandata/X_val.csv'
# y_val = 's3://508group/ADS508_project/cleandata/y_val.csv'
# output_path = 's3://508group/ADS508_project/output/'


In [None]:

# # Load y_train_resampled.csv
# y_train_resampled_df = pd.read_csv('s3://508group/ADS508_project/cleandata/y_train_resampled.csv')

# # Prefix labels with '__label__'
# y_train_resampled_df['sentimentoutcome'] = '__label__' + y_train_resampled_df['sentimentoutcome'].astype(str)

# # Save the modified DataFrame back to y_train_resampled.csv
# y_train_resampled_df.to_csv('s3://508group/ADS508_project/cleandata/y_train_resampled.csv', index=False)


# # Load y_val.csv
# y_val_df = pd.read_csv('s3://508group/ADS508_project/cleandata/y_val.csv')

# # Prefix labels with '__label__'
# y_val_df['sentimentoutcome'] = '__label__' + y_val_df['sentimentoutcome'].astype(str)

# # Save the modified DataFrame back to y_val.csv
# y_val_df.to_csv('s3://508group/ADS508_project/cleandata/y_val.csv', index=False)

In [None]:

# import boto3
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from joblib import dump
# import os

# # Load data from S3
# s3 = boto3.client('s3')
# bucket_name = '508group'

# def load_data_from_s3(key):
#     response = s3.get_object(Bucket=bucket_name, Key=key)
#     data = pd.read_csv(response['Body'])
#     return data

# # Load training data
# X_train = load_data_from_s3('ADS508_project/cleandata/X_train.csv')
# y_train = load_data_from_s3('ADS508_project/cleandata/y_train_resampled.csv')

# # Train logistic regression model
# model = LogisticRegression()
# model.fit(X_train_final, y_train_final)

# # Evaluate model
# y_pred = model.predict(X_val)
# accuracy = accuracy_score(y_val, y_pred)
# classification_rep = classification_report(y_val, y_pred)

# # Save the trained model
# output_dir = 'model'
# os.makedirs(output_dir, exist_ok=True)
# model_path = os.path.join(output_dir, 'model.joblib')
# dump(model, model_path)

# # Save evaluation metrics
# metrics_path = os.path.join(output_dir, 'metrics.txt')
# with open(metrics_path, 'w') as f:
#     f.write(f'Accuracy: {accuracy}\n')
#     f.write('Classification Report:\n')
#     f.write(classification_rep)

# # Upload the trained model and metrics to S3
# s3.upload_file(model_path, bucket_name, 'ADS508_project/output/model.joblib')
# s3.upload_file(metrics_path, bucket_name, 'ADS508_project/output/metrics.txt')


## Run Data Bias Analysis

In [None]:
# Create folder for specific timestamp
import time

timestamp = int(time.time())

bias_data_s3_uri = sess.upload_data(bucket=bucket, key_prefix="ADS508_project/bias-detection-{}".format(timestamp), path=path)
bias_data_s3_uri

In [None]:
!aws s3 ls $bias_data_s3_uri

In [None]:
balanced_bias_data_s3_uri = sess.upload_data(
    bucket=bucket, key_prefix="ADS508_project/bias-detection-{}".format(timestamp), path=path_balance
)
balanced_bias_data_s3_uri

In [None]:
!aws s3 ls $balanced_bias_data_s3_uri

In [None]:
balanced_bias_data_jsonlines_s3_uri = sess.upload_data(
    bucket=bucket, key_prefix="ADS508_project/bias-detection-{}".format(timestamp), path=path_jsonlines
)
balanced_bias_data_jsonlines_s3_uri

In [None]:
!aws s3 ls $balanced_bias_data_jsonlines_s3_uri

In [None]:
#!pip install --force-reinstall -q smclarify

from smclarify.bias import report
from typing import Dict
from collections import defaultdict
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
!aws s3 cp $bias_data_s3_uri ./data-clarify/

In [None]:
!aws s3 cp $balanced_bias_data_s3_uri ./data-clarify/

### Calculate Bias Metrics for all Data
#### might not needed from ===>

facet_column = report.FacetColumn(name="candidatepoll")

label_column = report.LabelColumn(
    name="sentiment_category", 
    series=df_combined["sentiment_category"],
    positive_label_values=[5]
)

In [None]:
report.bias_report(
    df=df_combined, 
    facet_column=facet_column, 
    label_column=label_column, 
    stage_type=report.StageType.PRE_TRAINING, 
    metrics=["CI", "DPL", "KL", "JS", "LP", "TVD", "KS"]
)

### Calculate Bias Metrics for Balanced Data

In [None]:
from smclarify.bias import report

facet_column = report.FacetColumn(name="candidatepoll")

label_column = report.LabelColumn(
    name="sentiment_category", 
    series=df_balanced["sentiment_category"], 
    positive_label_values=[5]
)

In [None]:
report.bias_report(
    df=df_balanced, 
    facet_column=facet_column, 
    label_column=label_column, 
    stage_type=report.StageType.PRE_TRAINING, 
    metrics=["CI", "DPL", "KL", "JS", "LP", "TVD", "KS"]
)

#### <=== TO MIGHT NOT NEEDED

### Run Data Bias Analysis (Pre-training)

In [None]:
from sagemaker import clarify

clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role, 
    instance_count=1, 
    instance_type="ml.c5.xlarge", 
    sagemaker_session=sess
)

#### Pre-Training Bias

In [None]:
bias_report_output_path = "s3://{}/ADS508_project/clarify".format(bucket)

bias_data_config = clarify.DataConfig(
    s3_data_input_path=bias_data_s3_uri,
    s3_output_path=bias_report_output_path,
    label="sentiment_category",
    headers=df_combined.columns.to_list(),
    dataset_type="text/csv",
)


In [None]:
# Trump
bias_config = clarify.BiasConfig(
    label_values_or_threshold=[5],
    facet_name="candidatepoll",
    facet_values_or_threshold=["Trump"],
)

clarify_processor.run_pre_training_bias(
    data_config=bias_data_config, 
    data_bias_config=bias_config, 
    methods=["CI", "DPL", "KL", "JS", "LP", "TVD", "KS"],
    wait=False, 
    logs=False
)

run_pre_training_bias_processing_job_name = clarify_processor.latest_job.job_name
run_pre_training_bias_processing_job_name



In [None]:
# Biden
bias_config = clarify.BiasConfig(
    label_values_or_threshold=[5],
    facet_name="candidatepoll",
    facet_values_or_threshold=["Biden"],
)

clarify_processor.run_pre_training_bias(
    data_config=bias_data_config, 
    data_bias_config=bias_config, 
    methods=["CI", "DPL", "KL", "JS", "LP", "TVD", "KS"],
    wait=False, 
    logs=False
)

run_pre_training_bias_processing_job_name = clarify_processor.latest_job.job_name
run_pre_training_bias_processing_job_name


#### Download report

In [None]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/processing-jobs/{}">Processing Job</a></b>'.format(
            region, run_pre_training_bias_processing_job_name
        )
    )
)
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/ProcessingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'.format(
            region, run_pre_training_bias_processing_job_name
        )
    )
)

from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Processing Job Has Completed</b>'.format(
            bucket, run_pre_training_bias_processing_job_name, region
        )
    )
)

In [None]:
running_processor = sagemaker.processing.ProcessingJob.from_processing_name(
    processing_job_name=run_pre_training_bias_processing_job_name, sagemaker_session=sess
)

processing_job_description = running_processor.describe()

print(processing_job_description)

In [None]:
!aws s3 ls $bias_report_output_path/

In [None]:
!aws s3 cp --recursive $bias_report_output_path ./generated_bias_report/

In [None]:
from IPython.core.display import display, HTML

display(HTML('<b>Review <a target="blank" href="./generated_bias_report/report.html">Bias Report</a></b>'))

In [None]:
path = "./data-clarify/amazon_reviews_us_giftcards_software_videogames.csv"
df.to_csv(path, index=False, header=True)

In [None]:
_testing