In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# The column labels are supplied explicitly via `names`; header=None tells
# read_csv not to consume any row of the file as a header.
columns = [
    'tweet_id',
    'entity',
    'sentiment',
    'tweet_content',
]
data = pd.read_csv(
    'data.csv',
    header=None,
    names=columns,
)
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# Print a summary of the frame: column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Drop every row that contains at least one missing value.
# The result is rebound to `data` instead of using inplace=True: in-place
# mutation gives no performance benefit and prevents method chaining.
data = data.dropna()
# Re-check the per-column null counts; they should all be zero now.
data.isnull().sum()

In [None]:
# Number of distinct values in each column.
data.nunique()

In [None]:
# Number of rows that duplicate another row across all columns
# (with pandas' default keep='first', the first occurrence is not counted).
data.duplicated().sum()

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# The result is rebound to `data` instead of using inplace=True: in-place
# mutation gives no performance benefit and prevents method chaining.
data = data.drop_duplicates()
# Re-check the duplicate count; it should be zero now.
data.duplicated().sum()

In [None]:
# Dimensions as (rows, columns) at this point in the notebook, i.e. after the
# dropna and drop_duplicates cells above have run.
data.shape

In [None]:
# Sentiment distribution: one bar per sentiment label, drawn in a fixed order.
sentiment_order = ['Positive', 'Negative', 'Neutral', 'Irrelevant']

plt.figure(figsize=(12, 6))
sns.countplot(
    data=data,
    x='sentiment',
    hue='sentiment',  # same variable as x, so the palette colours the bars by label
    order=sentiment_order,
    palette='viridis',
)
plt.gca().set(
    xlabel='Sentiment',
    ylabel='Number of tweets',
    title='Number of tweets per sentiment',
)
plt.show()

In [None]:
# Tweets per entity, as horizontal bars ordered by tweet count.
# value_counts() sorts in descending order, so its index doubles as the bar order.
entity_count = data['entity'].value_counts()

plt.figure(figsize=(12, 8))
sns.countplot(
    data=data,
    y='entity',
    hue='entity',  # same variable as y, so the palette colours the bars by entity
    order=entity_count.index,
    palette='viridis',
)
plt.title('Number of tweets per entity')
plt.ylabel('Entity')
plt.xlabel('Number of tweets')
plt.show()

In [None]:
# Sentiment distribution per entity: for each entity, one bar per sentiment label.
# Entities are ordered by their total tweet count (value_counts() sorts descending).
entity_count = data['entity'].value_counts()

plt.figure(figsize=(12, 8))
sns.countplot(
    data=data,
    y='entity',
    hue='sentiment',  # splits each entity's bar into one bar per sentiment
    order=entity_count.index,
    palette='viridis',
)
plt.xlabel('Number of tweets')
plt.ylabel('Entity')
# The title previously repeated the one from the plain per-entity plot; it now
# describes what this figure actually shows.
plt.title('Sentiment distribution per entity')
plt.show()

In [None]:
# Message length distribution per sentiment.
# .str.len() is the vectorised form of applying len() to every tweet.
data['message_length'] = data['tweet_content'].str.len()

# Build one panel per sentiment label actually present in the data. A fixed
# 1x4 grid raises IndexError with more than four labels and leaves blank
# panels with fewer. With four labels the figure size is still (20, 6).
sentiments = data['sentiment'].unique()
fig, ax_grid = plt.subplots(
    1,
    len(sentiments),
    figsize=(5 * len(sentiments), 6),
    squeeze=False,  # always return a 2-D array, even for a single panel
)
ax = ax_grid[0]  # the single row of axes, indexable by panel position

for i, sentiment in enumerate(sentiments):
    sns.histplot(
        data=data[data['sentiment'] == sentiment],
        x='message_length',
        ax=ax[i],
        color='skyblue',
        kde=True,
        bins=30,
    )
    ax[i].set_title(sentiment)
    ax[i].set_xlabel('Message length')
    ax[i].set_ylabel('Number of tweets')

# Keep each panel's y-axis label from colliding with its neighbour.
fig.tight_layout()
plt.show()

In [None]:
# Entities with the most positive tweets, as horizontal bars ordered by count.
positive_tweets = data[data['sentiment'] == 'Positive']
# value_counts() sorts in descending order, so its index doubles as the bar order.
entity_count = positive_tweets['entity'].value_counts()

plt.figure(figsize=(12, 8))
sns.countplot(
    data=positive_tweets,
    y='entity',
    # Passing `palette` without `hue` is deprecated in seaborn 0.13. Assigning
    # the y variable to hue with legend=False is the documented replacement.
    hue='entity',
    order=entity_count.index,
    # Keep the colour gradient following the bar order rather than the order
    # in which entities first appear in the data.
    hue_order=entity_count.index,
    palette='viridis',
    legend=False,
)
plt.xlabel('Number of tweets')
plt.ylabel('Entity')
plt.title('Number of positive tweets per entity')
plt.show()

In [None]:
# Entities with the most negative tweets, as horizontal bars ordered by count.
negative_tweets = data[data['sentiment'] == 'Negative']
# value_counts() sorts in descending order, so its index doubles as the bar order.
entity_count = negative_tweets['entity'].value_counts()

plt.figure(figsize=(12, 8))
sns.countplot(
    data=negative_tweets,
    y='entity',
    # Passing `palette` without `hue` is deprecated in seaborn 0.13. Assigning
    # the y variable to hue with legend=False is the documented replacement.
    hue='entity',
    order=entity_count.index,
    # Keep the colour gradient following the bar order rather than the order
    # in which entities first appear in the data.
    hue_order=entity_count.index,
    palette='viridis',
    legend=False,
)
plt.xlabel('Number of tweets')
plt.ylabel('Entity')
plt.title('Number of negative tweets per entity')
plt.show()
