In [None]:
import sagemaker
import boto3
import pandas as pd
from sagemaker import get_execution_role

# Name of the S3 bucket that holds the raw dataset and receives all artifacts
bucket = 'sentiment-analysis-storage'

# SageMaker session shared by the estimator further down
sagemaker_session = sagemaker.Session()

# IAM role that SageMaker assumes for training and hosting
role = get_execution_role()


In [None]:
# Fetch the raw Sentiment140 CSV from the project bucket into the working directory
s3 = boto3.client('s3')
s3.download_file(bucket, 'Sentiment140.csv', 'Sentiment140.csv')

# The file is read without a header row, so the column names are supplied here
sentiment140_columns = ["target", "id", "date", "flag", "user", "text"]
data = pd.read_csv(
    'Sentiment140.csv',
    encoding='ISO-8859-1',
    names=sentiment140_columns,
)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "whitegrid" style for the plots drawn in the following cells
sns.set(style="whitegrid")


In [None]:
# Bar chart with the number of tweets per value of the `target` column
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='target', data=data, palette='viridis', ax=ax)
ax.set_title("Sentiment Distribution in Sentiment140 Dataset")
ax.set_xlabel("Sentiment (0: Sad, 4: Happy)")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Character count of every tweet, stored as a new column
data['text_length'] = data['text'].map(len)

# Histogram of the lengths with a kernel density estimate overlaid
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data['text_length'], kde=True, color="blue", ax=ax)
ax.set_title("Distribution of Tweet Lengths")
ax.set_xlabel("Tweet Length (characters)")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
from wordcloud import WordCloud

# Concatenate all tweets of each class (target 4 = positive, target 0 = negative)
positive_text = ' '.join(data.loc[data['target'] == 4, 'text'])
negative_text = ' '.join(data.loc[data['target'] == 0, 'text'])

# One word cloud per class, drawn side by side in a single figure
plt.figure(figsize=(12, 5))

cloud_panels = [
    ("Positive Sentiment Word Cloud", positive_text),
    ("Negative Sentiment Word Cloud", negative_text),
]
for panel_index, (panel_title, corpus) in enumerate(cloud_panels, start=1):
    plt.subplot(1, 2, panel_index)
    wordcloud = WordCloud(
        width=400,
        height=300,
        max_font_size=80,
        max_words=100,
        background_color="white",
    ).generate(corpus)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(panel_title)
    plt.axis("off")

plt.show()


In [None]:
import re

def preprocess_text(text):
    """Normalize a raw tweet into lowercase, single-space-separated word tokens.

    Steps, in order: lowercase the text, drop URLs, drop @mentions, strip the
    '#' from hashtags (the hashtag word itself is kept), replace every
    non-word character with a space, and collapse whitespace runs so that
    tokens are separated by exactly one space.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned text. It contains only word characters (letters, digits,
        underscore) separated by single spaces, and is empty when nothing
        survives the cleaning.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove URLs; `http\S+` already covers https links
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove user mentions entirely and the '#' marker of hashtags
    text = re.sub(r'@\w+|#', '', text)
    # Replace punctuation and other non-word characters with spaces.
    # Digits and underscores are word characters, so they are kept.
    text = re.sub(r'\W', ' ', text)
    # The removals above leave runs of spaces behind; collapse them and trim
    return ' '.join(text.split())

# Apply preprocessing
data['processed_text'] = data['text'].apply(preprocess_text)

# Map labels to BlazingText format: target 0 -> __label__0, anything else -> __label__4
data['label'] = data['target'].apply(lambda x: '__label__0' if x == 0 else '__label__4')

# Save preprocessed data as one "<label> <text>" line per tweet.
# The lines are written directly instead of via `to_csv(sep=' ')`: the CSV writer
# quotes every field that contains the separator, which would wrap each
# multi-word tweet in double quotes and leak '"' characters into the tokens.
has_text = data['processed_text'].str.strip().ne('')  # skip tweets that are empty after cleaning
training_lines = data.loc[has_text, 'label'] + ' ' + data.loc[has_text, 'processed_text']
with open('processed_twitter140.txt', 'w', encoding='utf-8', newline='\n') as training_file:
    training_file.writelines(line + '\n' for line in training_lines)

s3.upload_file('processed_twitter140.txt', bucket, 'processed_twitter140.txt')


In [None]:
# Look up the BlazingText container image for the region of the current boto3 session
current_region = boto3.Session().region_name
blazingtext_image = sagemaker.image_uris.retrieve(
    framework='blazingtext',
    region=current_region,
    version='latest',
)

# Estimator describing the training job; model artifacts go under the bucket's output/ prefix
blazingtext = sagemaker.estimator.Estimator(
    image_uri=blazingtext_image,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=f's3://{bucket}/output',
    sagemaker_session=sagemaker_session,
)

# Set hyperparameters
hyperparameters = {
    'mode': 'supervised',
    'epochs': 10,
    'min_count': 2,
    'word_ngrams': 2,
}
blazingtext.set_hyperparameters(**hyperparameters)

# Start training on the preprocessed file uploaded to S3
train_data = f's3://{bucket}/processed_twitter140.txt'
training_channels = {'train': train_data}
blazingtext.fit(training_channels)


In [None]:
# Host the trained model on a single-instance real-time endpoint
predictor = blazingtext.deploy(
    instance_type='ml.t2.medium',
    initial_instance_count=1,
)


In [None]:
def predict_sentiment(text):
    """Classify the sentiment of one piece of text with the deployed endpoint.

    The text is cleaned with `preprocess_text` (the same function applied to
    the training data) and sent as a JSON document of the form
    ``{"instances": [<text>]}`` with an explicit ``application/json`` content
    type, which is the request format BlazingText endpoints accept for
    text classification.

    Args:
        text: Raw input text, e.g. a tweet.

    Returns:
        The endpoint's answer. A raw ``bytes``/``str`` body is decoded from
        JSON; a response the predictor has already deserialized is returned
        unchanged.
    """
    import json  # local import so this cell does not depend on another cell's imports

    preprocessed_text = preprocess_text(text)
    payload = json.dumps({"instances": [preprocessed_text]}).encode("utf-8")
    # The predictor was created without a serializer, so it would otherwise
    # label the request body as a generic byte stream; override the headers here.
    response = predictor.predict(
        payload,
        initial_args={"ContentType": "application/json", "Accept": "application/json"},
    )
    if isinstance(response, (bytes, str)):
        return json.loads(response)
    return response

# Example
print(predict_sentiment("I love using SageMaker!"))


In [None]:
# Delete the endpoint behind `predictor` once predictions are no longer needed
predictor.delete_endpoint()
