# Data Analysis


In [1]:
# general_loader.py
import json
import pandas as pd
import numpy as np

## Loading data from Google Drive

In [3]:
from google.colab import drive

# Mount Google Drive so the dataset file is reachable under /content/drive.
drive.mount('/content/drive')

dataset_path = '/content/drive/MyDrive/SemEval-2026/train/eng_laptop_train_alltasks.jsonl'

# The file is JSON Lines: one JSON object per line.
# Decode as UTF-8 explicitly (matching load_general_dataset) instead of relying
# on the platform default, and skip blank lines, which json.loads rejects.
with open(dataset_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(data)} sentence-level entries")
# Guard the preview so an empty file reports 0 entries instead of raising IndexError.
if data:
    print("Sample entry:\n", data[0])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded 4076 sentence-level entries
Sample entry:
 {'ID': 'laptop_quad_dev_1', 'Text': 'this unit is ` ` pretty ` ` and stylish , so my high school daughter was attracted to it for that reason .', 'Quadruplet': [{'Aspect': 'unit', 'Category': 'LAPTOP#DESIGN_FEATURES', 'Opinion': 'pretty', 'VA': '7.12#7.12'}, {'Aspect': 'unit', 'Category': 'LAPTOP#DESIGN_FEATURES', 'Opinion': 'stylish', 'VA': '7.12#7.12'}]}


In [4]:
def load_general_dataset(path):
    """
    Load a SemEval JSONL file and flatten it to one row per quadruplet.

    Every non-blank line of the file is parsed as a JSON object that must
    provide the keys "ID", "Text" and "Quadruplet". Each item of "Quadruplet"
    must provide "Aspect", "Category", "Opinion" and a "VA" string holding two
    numbers separated by "#" (e.g. "7.12#7.12").

    Args:
        path: Location of the JSONL file; it is read as UTF-8.

    Returns:
        DataFrame with columns:
        ID, Text, Aspect, Category, Opinion, V, A
        where V and A are the first and second number of "VA" as floats.
        The raw "VA" string itself is not kept. The columns are present even
        when the file contains no quadruplets.

    Raises:
        json.JSONDecodeError: a non-blank line is not valid JSON.
        KeyError: an entry or quadruplet lacks one of the expected keys.
        ValueError: a "VA" value is not exactly two "#"-separated numbers.
    """
    columns = ["ID", "Text", "Aspect", "Category", "Opinion", "V", "A"]

    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines are not valid JSON; skip them rather than crash.
            if not line.strip():
                continue
            row = json.loads(line)
            for quad in row["Quadruplet"]:
                # "VA" packs both scores into one string: "<first>#<second>".
                V, A = map(float, quad["VA"].split("#"))
                records.append({
                    "ID": row["ID"],
                    "Text": row["Text"],
                    "Aspect": quad["Aspect"],
                    "Category": quad["Category"],
                    "Opinion": quad["Opinion"],
                    "V": V,
                    "A": A
                })

    # Passing the column list keeps the schema stable when `records` is empty,
    # so downstream column lookups do not raise KeyError.
    return pd.DataFrame(records, columns=columns)

In [5]:
# Flatten the JSONL file into one row per quadruplet, then display (rows, columns).
df = load_general_dataset(path=dataset_path)
df.shape

(5773, 7)

In [7]:
# Compare the number of raw entries with the number of flattened rows.
for label, collection in (
    ("Sentence-level entries:", data),
    ("Quadruplet-level entries:", df),
):
    print(label, len(collection))


Sentence-level entries: 4076
Quadruplet-level entries: 5773


In [8]:
# Report how many columns the flattened frame has and what they are called.
column_names = df.columns.tolist()
print("Number of columns:", len(column_names))
print("Columns:", column_names)


Number of columns: 7
Columns: ['ID', 'Text', 'Aspect', 'Category', 'Opinion', 'V', 'A']


In [17]:
# Count the distinct values in each label column.
for heading, column in (
    ("Unique Categories:", "Category"),
    ("Unique Aspects:", "Aspect"),
    ("Unique Opinions:", "Opinion"),
):
    print(heading, df[column].nunique())


Unique Categories: 121
Unique Aspects: 956
Unique Opinions: 1156


## Analysis of valence and arousal scores

In [24]:
# Summary statistics (count, mean, std, quartiles, min/max) for each score column.
for heading, column in (("Valence stats:\n", "V"), ("Arousal stats:\n", "A")):
    print(heading, df[column].describe())



Valence stats:
 count    5773.000000
mean        5.936842
std         1.763164
min         1.000000
25%         4.380000
50%         6.620000
75%         7.380000
max         8.830000
Name: V, dtype: float64
Arousal stats:
 count    5773.00000
mean        6.66797
std         1.03192
min         3.83000
25%         5.83000
50%         6.88000
75%         7.50000
max         8.83000
Name: A, dtype: float64


In [25]:
# Number of quadruplet rows that share each sentence ID.
quads_per_sentence = df.groupby("ID").size()

mean_quads = quads_per_sentence.mean()
max_quads = quads_per_sentence.max()
print("Mean quadruplets per sentence:", mean_quads)
print("Max quadruplets in a sentence:", max_quads)


Mean quadruplets per sentence: 1.4163395485770363
Max quadruplets in a sentence: 10


## Data Integrity check

In [28]:
# Aspects and opinions that do not appear in the sentence are treated as implicit
# and stored as the literal string "NULL".
# These are not annotation errors; they correspond to implicit aspects/opinions.

aspect_is_null = df["Aspect"].eq("NULL")
opinion_is_null = df["Opinion"].eq("NULL")
null_aspects = aspect_is_null.sum()
null_opinions = opinion_is_null.sum()

print("Implicit Aspects (string 'NULL'):", null_aspects)
print("Implicit Opinions (string 'NULL'):", null_opinions)

# Share of all quadruplet rows, as a percentage with two decimals.
print(f"{null_aspects/len(df)*100:.2f}% aspects are implicit")
print(f"{null_opinions/len(df)*100:.2f}% opinions are implicit")



Implicit Aspects (string 'NULL'): 1254
Implicit Opinions (string 'NULL'): 1583
21.72% aspects are implicit
27.42% opinions are implicit


In [30]:
# Split rows by valence relative to 5: above, below, or exactly equal.
valence = df["V"]
positive = valence.gt(5).sum()
negative = valence.lt(5).sum()
neutral = valence.eq(5).sum()

print("Positive samples:", positive)
print("Negative samples:", negative)
print("Neutral samples:", neutral)


Positive samples: 3788
Negative samples: 1799
Neutral samples: 186
