In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/Colab Notebooks/NLP2

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/NLP2


In [14]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('fifa_world_cup_2022_tweets.csv')
# Keep only necessary columns INCLUDING 'split'
df = df[['Tweet', 'Sentiment']]

# Get total number of tweets
total_tweets = len(df)
print(f"**Total number of tweets:** {total_tweets}")
print("\n")

# Count tweets by sentiment class
sentiment_counts = df['Sentiment'].value_counts()

print("**Sentiment Distribution:**")
print("-" * 30)
for sentiment, count in sentiment_counts.items():
    percentage = (count / total_tweets) * 100
    print(f"{sentiment}: {count} tweets ({percentage:.2f}%)")

**Total number of tweets:** 22524


**Sentiment Distribution:**
------------------------------
positive: 8489 tweets (37.69%)
neutral: 8251 tweets (36.63%)
negative: 5784 tweets (25.68%)


In [15]:
balanced_dfs = []

for sentiment in ['positive', 'neutral', 'negative']:
    # Get all tweets for this sentiment
    sentiment_df = df[df['Sentiment'] == sentiment]

    # Sample exactly 5784 tweets (without replacement)
    sampled_df = sentiment_df.sample(n=5784, random_state=42)
    balanced_dfs.append(sampled_df)

# Combine all balanced dataframes
balanced_df = pd.concat(balanced_dfs, ignore_index=True)

# Shuffle the balanced dataset
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify the new distribution
print("**New Balanced Distribution:**")
print(balanced_df['Sentiment'].value_counts())
print(f"\nTotal tweets in balanced dataset: {len(balanced_df)}")

**New Balanced Distribution:**
Sentiment
negative    5784
neutral     5784
positive    5784
Name: count, dtype: int64

Total tweets in balanced dataset: 17352


In [16]:
balanced_df

Unnamed: 0,Tweet,Sentiment
0,How in the name of Jesus was that not a good g...,negative
1,@JioCinema your app is lagging on smart tv. Pl...,negative
2,Hey ‚Å¶@benske31‚Å© we gotta watch some ‚Å¶@CanadaSo...,neutral
3,Watching Qatar play is like watching Sunday le...,neutral
4,Are you guys broadcasting in the kitchen as al...,neutral
...,...,...
17347,Next up #FIFAWorldCup #QatarWorldCup2022 #Worl...,neutral
17348,That's really weird #WorldCup2022,negative
17349,Picking Portugal üáµüáπ to be my favorites for the...,positive
17350,@MEXC_Global World cup \nMy Favourite team Eng...,positive


In [17]:
from sklearn.model_selection import train_test_split
import pandas as pd

# First, let's create a temporary split to separate train from val+test
X = balanced_df.drop('Sentiment', axis=1)
y = balanced_df['Sentiment']

# Split into train (70%) and temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Split temp into validation (15%) and test (15%)
# Since temp is 30%, we need to split it 50-50 to get 15% each of the total
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Combine X and y back together for each set
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Display the results
print("**Dataset Split Summary:**")
print("=" * 50)
print(f"Total samples: {len(balanced_df)}")
print(f"Train set: {len(train_df)} samples ({len(train_df)/len(balanced_df)*100:.1f}%)")
print(f"Validation set: {len(val_df)} samples ({len(val_df)/len(balanced_df)*100:.1f}%)")
print(f"Test set: {len(test_df)} samples ({len(test_df)/len(balanced_df)*100:.1f}%)")

print("\n**Train Set Class Distribution:**")
print("-" * 30)
train_counts = train_df['Sentiment'].value_counts()
for sentiment, count in train_counts.items():
    print(f"{sentiment}: {count} samples")

print("\n**Validation Set Class Distribution:**")
print("-" * 30)
val_counts = val_df['Sentiment'].value_counts()
for sentiment, count in val_counts.items():
    print(f"{sentiment}: {count} samples")

print("\n**Test Set Class Distribution:**")
print("-" * 30)
test_counts = test_df['Sentiment'].value_counts()
for sentiment, count in test_counts.items():
    print(f"{sentiment}: {count} samples")

**Dataset Split Summary:**
Total samples: 17352
Train set: 12146 samples (70.0%)
Validation set: 2603 samples (15.0%)
Test set: 2603 samples (15.0%)

**Train Set Class Distribution:**
------------------------------
negative: 4049 samples
neutral: 4049 samples
positive: 4048 samples

**Validation Set Class Distribution:**
------------------------------
neutral: 868 samples
positive: 868 samples
negative: 867 samples

**Test Set Class Distribution:**
------------------------------
positive: 868 samples
negative: 868 samples
neutral: 867 samples


In [18]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle
from tqdm import tqdm

# Combine the three datasets with labels
train_df['split'] = 'train'
val_df['split'] = 'val'
test_df['split'] = 'test'

# Combine all dataframes
combined_df = pd.concat([train_df, val_df, test_df], ignore_index=True)

print(f"**Combined dataset size:** {len(combined_df)}")
print(f"Train: {len(combined_df[combined_df['split'] == 'train'])}")
print(f"Val: {len(combined_df[combined_df['split'] == 'val'])}")
print(f"Test: {len(combined_df[combined_df['split'] == 'test'])}")

# Load the sentence transformer model
print("\n**Loading sentence-transformers/all-MiniLM-L6-v2 model...**")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings for each split
print("\n**Generating embeddings...**")
embeddings_dict = {}

for split in ['train', 'val', 'test']:
    split_df = combined_df[combined_df['split'] == split]
    tweets = split_df['Tweet'].tolist()

    print(f"\nProcessing {split} set ({len(tweets)} tweets)...")
    embeddings = model.encode(tweets, show_progress_bar=True, batch_size=32)

    embeddings_dict[split] = {
        'embeddings': embeddings,
        'labels': split_df['Sentiment'].tolist(),
        'tweets': tweets
    }

**Combined dataset size:** 17352
Train: 12146
Val: 2603
Test: 2603

**Loading sentence-transformers/all-MiniLM-L6-v2 model...**

**Generating embeddings...**

Processing train set (12146 tweets)...


Batches:   0%|          | 0/380 [00:00<?, ?it/s]


Processing val set (2603 tweets)...


Batches:   0%|          | 0/82 [00:00<?, ?it/s]


Processing test set (2603 tweets)...


Batches:   0%|          | 0/82 [00:00<?, ?it/s]

In [19]:
combined_df

Unnamed: 0,Tweet,Sentiment,split
0,THEY CALLED OFFSIDES ?!?!? \nNAH THIS IS RIGGE...,negative,train
1,I demand that only @shakira and @KNAAN be able...,negative,train
2,"ùêâùêÆùê¨ùê≠ ùê©ùê¢ùêßùêúùê°ùêûùêù ùê®ùêÆùê´ùê¨ùêûùê•ùêØùêûùê¨ ùê≠ùê® ùêúùê°ùêûùêúùê§ ùê¢ùêü ùê¢ùê≠ ùê¢ùê¨ ùê´ùêûùêöùê•,...",positive,train
3,#Valencia was a former premier player #WorldCu...,neutral,train
4,Corrupt alreadyüòÇ #WorldCup2022,negative,train
...,...,...,...
17347,Enner Valencia's name is chanted around the st...,neutral,test
17348,World Cup- We are excited to see the start of ...,positive,test
17349,I guess because dudes in Quatar get no bitches...,negative,test
17350,#WorldCup2022 #WorldcupQatar2022 Supporting #Q...,neutral,test


In [20]:
embeddings_dict

{'train': {'embeddings': array([[-0.05856002,  0.03054745, -0.004129  , ..., -0.01714644,
          -0.0281125 ,  0.0377028 ],
         [ 0.00328097, -0.04143158,  0.01772959, ..., -0.06149257,
           0.02362919, -0.05104531],
         [ 0.02373547,  0.04418331,  0.08109222, ..., -0.03454321,
          -0.0902932 , -0.00294139],
         ...,
         [ 0.00969438,  0.06337392,  0.07588311, ..., -0.04241866,
          -0.1101367 ,  0.00172768],
         [ 0.00117593,  0.03903282,  0.0147438 , ..., -0.06552655,
           0.01816062, -0.02369412],
         [-0.01741014,  0.05127728,  0.00107904, ..., -0.04754308,
           0.02919964,  0.02829987]], dtype=float32),
  'labels': ['negative',
   'negative',
   'positive',
   'neutral',
   'negative',
   'positive',
   'negative',
   'negative',
   'negative',
   'positive',
   'positive',
   'negative',
   'positive',
   'negative',
   'negative',
   'neutral',
   'neutral',
   'negative',
   'neutral',
   'negative',
   'negative',
 

In [21]:
# Save the embeddings and data
print("\n**Saving embeddings and data...**")
with open('tweet_embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings_dict, f)

# Save the combined dataframe (you can remove 'split' column here if you want)
combined_df_to_save = combined_df[['Tweet', 'Sentiment']]
combined_df_to_save.to_csv('combined_tweets.csv', index=False)

print("\nFiles saved:")
print("- tweet_embeddings.pkl (embeddings)")
print("- combined_tweets.csv (combined dataframe)")


**Saving embeddings and data...**

Files saved:
- tweet_embeddings.pkl (embeddings)
- combined_tweets.csv (combined dataframe)


In [22]:
# Load the saved embeddings
print("\n**Loading saved embeddings...**")
with open('tweet_embeddings.pkl', 'rb') as f:
    loaded_embeddings = pickle.load(f)

# Verify the loaded data
print("\n**Verification of loaded data:**")
for split in ['train', 'val', 'test']:
    emb_shape = loaded_embeddings[split]['embeddings'].shape
    num_labels = len(loaded_embeddings[split]['labels'])
    num_tweets = len(loaded_embeddings[split]['tweets'])

    print(f"\n{split.upper()} set:")
    print(f"  - Embeddings shape: {emb_shape}")
    print(f"  - Number of labels: {num_labels}")
    print(f"  - Number of tweets: {num_tweets}")
    print(f"  - Embedding dimension: {emb_shape[1]}")

# Example: Access specific embeddings
print("\n**Example usage of loaded embeddings:**")
train_embeddings = loaded_embeddings['train']['embeddings']
train_labels = loaded_embeddings['train']['labels']
print(f"First tweet embedding shape: {train_embeddings[0].shape}")
print(f"First tweet label: {train_labels[0]}")


**Loading saved embeddings...**

**Verification of loaded data:**

TRAIN set:
  - Embeddings shape: (12146, 384)
  - Number of labels: 12146
  - Number of tweets: 12146
  - Embedding dimension: 384

VAL set:
  - Embeddings shape: (2603, 384)
  - Number of labels: 2603
  - Number of tweets: 2603
  - Embedding dimension: 384

TEST set:
  - Embeddings shape: (2603, 384)
  - Number of labels: 2603
  - Number of tweets: 2603
  - Embedding dimension: 384

**Example usage of loaded embeddings:**
First tweet embedding shape: (384,)
First tweet label: negative
