# Data Exploration

### 1. Dataset Creation

In [None]:
import os
from datasets import load_dataset

# 1. Path definition
# Directory and file name of the sampled raw dataset written by this cell.
output_dir = "../data/raw/"
output_filename = "sentiment140_50k.csv"
output_filepath = os.path.join(output_dir, output_filename)

os.makedirs(output_dir, exist_ok=True)

# Cache directory handed to `load_dataset` for the Hugging Face download.
cache_directory = "../data/raw/hf_cache/"
os.makedirs(cache_directory, exist_ok=True)

try:
    # 2. Load the dataset
    print("Start loading 'Sentiment140' dataset...")
    dataset_name = "stanfordnlp/sentiment140"
    dataset = load_dataset(dataset_name, cache_dir=cache_directory)
    print("Dataset loading successful!")

    # 3. Data sampling
    # Shuffle the whole train split with a fixed seed (reproducible sample),
    # then keep the first 50,000 rows of the shuffled split.
    print("\nRandom sampling is being conducted, and 50,000 pieces of data are being selected...")
    sample_dataset = dataset['train'].shuffle(seed=42).select(range(50000))

    # 4. Generate the local original dataset
    # Persist the sample as a single CSV file.
    print(f"Save 50,000 pieces of data to: {os.path.abspath(output_filepath)}")
    sample_dataset.to_csv(output_filepath)

    print("\nTask completed！A CSV file containing 50,000 pieces of data has been created.")
    print(f"File name：'{output_filename}'")
    print(f"Storage location：{os.path.abspath(output_dir)}")

except Exception as e:
    print(f"Making mistakes when processing data: {e}")
    # Re-raise so the failure is not masked: the following cells read the CSV
    # produced here, and continuing after a failed download/sample would make
    # them fail later (or silently pick up a stale file) without a traceback.
    raise

Start loading 'Sentiment140' dataset...


Downloading data: 100%|██████████| 81.4M/81.4M [00:06<00:00, 12.8MB/s]
Generating train split: 100%|██████████| 1600000/1600000 [00:16<00:00, 94977.47 examples/s] 
Generating test split: 100%|██████████| 498/498 [00:00<00:00, 66881.09 examples/s]


Dataset loading successful!

Random sampling is being conducted, and 50,000 pieces of data are being selected...
Save 50,000 pieces of data to: /Users/yaoyue/Desktop/CS/MSc Project/Project/uncertainty-sentiment-analysis/data/raw/sentiment140_50k.csv


Creating CSV from Arrow format: 100%|██████████| 50/50 [00:00<00:00, 157.80ba/s]


Task completed！A CSV file containing 50,000 pieces of data has been created.
File name：'sentiment140_50k.csv'
Storage location：/Users/yaoyue/Desktop/CS/MSc Project/Project/uncertainty-sentiment-analysis/data/raw





### 2. Data Loading

In [None]:
import pandas as pd

# Location of the sampled CSV that is loaded into `df` below.
raw_data_path = "../data/raw/sentiment140_50k.csv"

print("Loading the locally downloaded dataset...")
df = pd.read_csv(raw_data_path)
print(f"Successfully loaded {len(df)} pieces of data.")

# Show the first rows, followed by the column/dtype summary.
print("Data Preview:")
preview = df.head()
print(preview)
df.info()

Loading the locally downloaded dataset...
Successfully loaded 50000 pieces of data.
Data Preview:
                                                text  \
0  why am i awake so early?  damn projects. super...   
1  watching church online because I'd be half an ...   
2                                         Hillsong!    
3  is at Stafford Train Station and just watched ...   
4           thanks everyone for the follow fridays!    

                           date         user  sentiment     query  
0  Sun Jun 07 07:43:33 PDT 2009  _stacey_rae          0  NO_QUERY  
1  Sun May 31 06:16:45 PDT 2009     Trollyjd          0  NO_QUERY  
2  Fri May 29 17:35:07 PDT 2009     ffaithyy          4  NO_QUERY  
3  Fri Jun 19 23:28:43 PDT 2009   VCasambros          0  NO_QUERY  
4  Fri Jun 05 17:59:44 PDT 2009   angela_woo          4  NO_QUERY  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  -----

### 3. Data Cleaning

In [9]:
import re

# Announce the start of the cleaning stage; the leading newline puts a blank
# line before the message in the cell output.
print("\nClean the text and re-map the labels...")

# Text cleaning function
def clean_text(text):
    """Normalize one piece of raw text to lowercase, letters-only words.

    Steps, in order: lowercase, strip URLs (``http...`` / ``www...``), strip
    ``@mentions``, remove ``#`` characters (the hashtag word itself is kept),
    delete every character that is not ``a-z`` or whitespace (digits and
    punctuation are removed, so ``"i'd"`` becomes ``"id"``), and collapse
    whitespace runs to single spaces.

    Args:
        text: The raw text. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string; ``''`` when the input is missing or nothing
        survives cleaning.
    """
    # Missing values must not go through str(): that would turn them into the
    # literal words "none" / "nan", which look like genuine text afterwards.
    # `text != text` is True only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove @username mentions
    text = text.replace('#', '')  # Remove '#', keep the hashtag word
    text = re.sub(r'[^a-z\s]', '', text)  # Remove everything except a-z and whitespace
    text = ' '.join(text.split())  # Collapse repeated whitespace and trim the ends
    return text


# NOTE: this cell replaces `df` with its cleaned version, so it is not
# idempotent. Running it a second time would push the already-remapped label 2
# (positive) through label_map again and turn it into 1 (neutral). Re-run the
# data-loading cell before re-running this one.

# Drop rows that contain null values *before* cleaning, so a missing text is
# never stringified into a real-looking token. `.copy()` gives an independent
# frame, which avoids SettingWithCopyWarning on the assignments below.
df = df.dropna().copy()

df['text'] = df['text'].apply(clean_text)

# Re-map the labels
label_map = {
    0: 0,  # Negative
    2: 1,  # Neutral
    4: 2   # Positive
}
df['sentiment'] = df['sentiment'].map(label_map)

# Labels that are not keys of label_map come back from .map() as NaN; they must
# be removed before the integer cast, which cannot represent NaN. Rows whose
# text became empty after cleaning are dropped in the same step.
df = df[df['sentiment'].notna() & (df['text'] != '')].copy()
df['sentiment'] = df['sentiment'].astype(int)

print("Data cleaning and label remapping have been completed.")
print("Updated data preview:")
print(df.head())
print("\nLabel distribution:")
print(df['sentiment'].value_counts())


Clean the text and re-map the labels...
Data cleaning and label remapping have been completed.
Updated data preview:
                                                text  \
0  why am i awake so early damn projects super ne...   
1  watching church online because id be half an h...   
2                                           hillsong   
3  is at stafford train station and just watched ...   
4             thanks everyone for the follow fridays   

                           date         user  sentiment     query  
0  Sun Jun 07 07:43:33 PDT 2009  _stacey_rae          0  NO_QUERY  
1  Sun May 31 06:16:45 PDT 2009     Trollyjd          0  NO_QUERY  
2  Fri May 29 17:35:07 PDT 2009     ffaithyy          2  NO_QUERY  
3  Fri Jun 19 23:28:43 PDT 2009   VCasambros          0  NO_QUERY  
4  Fri Jun 05 17:59:44 PDT 2009   angela_woo          2  NO_QUERY  

Label distribution:
sentiment
0    25043
2    24858
Name: count, dtype: int64


### 4. Split the Dataset into Training, Validation, and Test Sets

In [11]:
from sklearn.model_selection import train_test_split

print("\nDivide the dataset into training set, validation set and test set...")

# First cut: keep 80% of the rows for training and hold out the other 20%,
# stratified on the label so both parts keep the same label proportions.
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['sentiment']
)

# Second cut: split the held-out 20% in half, giving validation and test sets
# of about 10% of the data each (again stratified on the label).
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['sentiment']
)

print(f"Size of Training set: {len(train_df)}")
print(f"Size of Validation set: {len(val_df)}")
print(f"Size of Test set: {len(test_df)}")

# Write each split to its own CSV under the processed-data directory.
processed_dir = '../data/processed/'
os.makedirs(processed_dir, exist_ok=True)

for split_frame, split_file in (
    (train_df, 'train.csv'),
    (val_df, 'validation.csv'),
    (test_df, 'test.csv'),
):
    split_frame.to_csv(os.path.join(processed_dir, split_file), index=False)

print("All the processed datasets have been successfully saved!")


Divide the dataset into training set, validation set and test set...
Size of Training set: 39920
Size of Validation set: 4990
Size of Test set: 4991
All the processed datasets have been successfully saved!
