# Week 9 Sentiment Analysis Notebook

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display


In [None]:
# Read the sentiment dataset from 'sentiment.csv' (resolved relative to the
# working directory) and preview its leading rows.
df = pd.read_csv('sentiment.csv')
display(df.head(n=5))

In [None]:
# Normalise the label and text columns: trim surrounding whitespace, then
# lower-case, so later equality filters can compare against lower-case literals.
for column in ('Sentiment', 'Text'):
    df[column] = df[column].str.strip().str.lower()

# Preview the two normalised columns.
display(df.loc[:, ['Text', 'Sentiment']].head())

In [None]:
# Sentiment distribution: count rows per sentiment label, show the counts as a
# table, then draw them as a bar chart.
sent_counts = df['Sentiment'].value_counts()
display(sent_counts)
plt.figure(figsize=(12, 6))
# seaborn >= 0.13 deprecates passing `palette` without `hue` (slated for
# removal in 0.14), so hue is mapped to the same categories as x.
# dodge=False keeps one full-width bar per label on older seaborn releases,
# where assigning hue would otherwise split each x position into dodged bars.
ax = sns.barplot(
    x=sent_counts.index,
    y=sent_counts.values,
    hue=sent_counts.index,
    palette='pastel',
    dodge=False,
)
# hue duplicates the x axis, so a legend (if seaborn drew one) adds nothing.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.xticks(rotation=90)
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

In [None]:
# Average text length per sentiment.
# Adds a 'Text_length' column (character count of 'Text') to df, then averages
# it within each sentiment label, longest average first.
df['Text_length'] = df['Text'].str.len()
length_means = (
    df.groupby('Sentiment')['Text_length']
    .mean()
    .sort_values(ascending=False)
)
display(length_means)

# Bar chart of the averages, in the same order as the table above.
plt.figure(figsize=(12, 6))
sns.barplot(x=length_means.index, y=length_means.values)
plt.xticks(rotation=90)
plt.gca().set(
    title='Average Text Length by Sentiment',
    xlabel='Sentiment',
    ylabel='Average Length',
)
plt.show()

In [None]:
# Most-liked and most-retweeted posts.
# Ties are kept: every row whose count equals the column maximum is reported.
max_likes = df['Likes'].max()
max_likes_texts = df.loc[df['Likes'] == max_likes, 'Text']
print(f"Text(s) with Maximum Likes ({max_likes}):")
print(max_likes_texts.tolist())

# Same lookup for retweets, this time keeping the sentiment label alongside
# the text and rendering the result as a table.
max_retweets = df['Retweets'].max()
max_retweets_df = df.loc[df['Retweets'] == max_retweets, ['Text', 'Sentiment']]
print(f"\nText(s) with Maximum Retweets ({max_retweets}):")
display(max_retweets_df)

In [None]:
# Five countries contributing the most rows labelled 'positive'.
# NOTE(review): 'Country' is counted as-is; unlike 'Sentiment' and 'Text' it is
# not stripped or lower-cased in this notebook — confirm it has no padded or
# differently-cased variants that would split one country into several rows.
positive_tweets = df.loc[df['Sentiment'].eq('positive')]
top5_countries = positive_tweets['Country'].value_counts().head(n=5)
display(top5_countries)

# Bar chart of those five counts.
plt.figure(figsize=(10, 6))
sns.barplot(x=top5_countries.index, y=top5_countries.values)
plt.gca().set(
    title='Top 5 Countries by Positive Tweets',
    xlabel='Country',
    ylabel='Number of Positive Tweets',
)
plt.show()

In [None]:
# Rows whose 'Hashtags' value contains "travel" in any letter case.
# na=False makes rows with a missing 'Hashtags' value count as non-matches.
travel_tweets = df.loc[
    df['Hashtags'].str.contains(pat='Travel', case=False, na=False)
]
display(travel_tweets.loc[:, ['Text', 'Hashtags']])

In [None]:
# The five most-liked rows labelled 'negative' with Year equal to 2020.
neg_2020 = df.loc[df['Year'].eq(2020) & df['Sentiment'].eq('negative')]
top5_neg_2020 = neg_2020.nlargest(n=5, columns='Likes').loc[:, ['Text', 'Likes']]
display(top5_neg_2020)

In [None]:
# User and text of every row whose sentiment label is exactly 'happiness'.
happy_posts = df.loc[df['Sentiment'] == 'happiness', ['User', 'Text']]
display(happy_posts)

In [None]:
# Row counts per (Year, Country), split by sentiment.
# Only the labels 'positive', 'negative' and 'neutral' are kept; the sentiment
# level is pivoted into columns, with 0 for combinations that never occur.
sel = df[df['Sentiment'].isin(['positive', 'negative', 'neutral'])]
year_country_sent = (
    sel.groupby(['Year', 'Country', 'Sentiment'])
    .size()
    .unstack(level='Sentiment', fill_value=0)
)
display(year_country_sent)

# One stacked bar per (Year, Country) pair.
year_country_sent.plot.bar(stacked=True, figsize=(15, 7))
plt.gca().set(
    title='Year-Country Tweet Counts by Sentiment',
    xlabel='Year, Country',
    ylabel='Tweet Count',
)
plt.tight_layout()
plt.show()