In [2]:
pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [23]:
from gensim.models import Word2Vec
import numpy as np
import re
import pandas as pd
from sklearn.model_selection import train_test_split

In [24]:
# ----------------------
# Dataset
# ----------------------
TEXT_COL = "Review"
RATING_COL = "Rating"   # 1–10

# Rating thresholds for the binary label. Ratings strictly between the two
# thresholds form a "neutral" gap that is excluded from the experiment.
POSITIVE_MIN_RATING = 7
NEGATIVE_MAX_RATING = 4

dataset = pd.read_csv("sentiment_analysis_game_review.csv")

# Work on a copy so the raw frame stays untouched when this cell is re-run.
df = dataset.copy()
df = df.dropna(subset=[TEXT_COL, RATING_COL])

# Ratings that cannot be parsed as numbers are coerced to NaN and dropped.
df[RATING_COL] = pd.to_numeric(df[RATING_COL], errors="coerce")
df = df.dropna(subset=[RATING_COL])

# positive if >= POSITIVE_MIN_RATING, negative if <= NEGATIVE_MAX_RATING, else neutral
df["sentiment_str"] = "neutral"
df.loc[df[RATING_COL] >= POSITIVE_MIN_RATING, "sentiment_str"] = "positive"
df.loc[df[RATING_COL] <= NEGATIVE_MAX_RATING, "sentiment_str"] = "negative"

# keep only positive/negative
df = df[df["sentiment_str"] != "neutral"]

# Split (stratify keeps label balance similar)
X = df[TEXT_COL].astype(str).tolist()          # raw text
y = df["sentiment_str"].tolist()              # "positive"/"negative"

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# (text, label) tuples for the cells that consume paired data.
train_sentences = list(zip(X_train, y_train))
test_sentences  = list(zip(X_test, y_test))



# sentences = [
#     ("The movie was amazing and full of heart", 1),
#     ("A boring plot with terrible acting", 0),
#     ("I loved the characters but hated the ending", 0),
#     ("The film was not good at all", 0),
#     ("Surprisingly fun and well written", 1),
#     ("I expected more it was disappointing", 0),
#     ("Absolutely fantastic experience", 1),
#     ("The story was dull and predictable", 0)
# ]

In [75]:
# ----------------------
# Preprocessing
# ----------------------
def tokenize(text):
    """Lower-case ``text``, delete every character that is not a-z or
    whitespace, and return the remaining whitespace-separated tokens.

    Removed characters are deleted rather than replaced by a space, so
    "good,bad" becomes the single token "goodbad".
    """
    letters_only = re.sub(r"[^a-z\s]", "", text.lower())
    return letters_only.split()

In [98]:
# Tokenize only the review text of each (text, label) pair; labels are not used here.
train_corpus = list(map(tokenize, (review for review, _label in train_sentences)))
test_corpus = list(map(tokenize, (review for review, _label in test_sentences)))

In [93]:
# ----------------------
# Train Word2Vec
# ----------------------
# Learn word embeddings from the training reviews only, so no test text leaks
# into the vectors. gensim documents `sg` as 0 (CBOW) or 1 (skip-gram); use the
# documented skip-gram value explicitly.
model = Word2Vec(
    sentences=train_corpus,
    vector_size=40,
    window=4,
    min_count=1,   # keep every token, including words that occur only once
    sg=1,
)

In [94]:
# ----------------------
# Sentence vector
# ----------------------
def sentence_vector(tokens):
    """Average the embeddings of the tokens that ``model.wv`` contains.

    Tokens missing from ``model.wv`` are skipped. When no token is found
    (including an empty token list) a zero vector of length
    ``model.vector_size`` is returned.
    """
    embeddings = []
    for token in tokens:
        if token in model.wv:
            embeddings.append(model.wv[token])

    if embeddings:
        return np.mean(embeddings, axis=0)
    return np.zeros(model.vector_size)

In [95]:
def cosine(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    If either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case instead of dividing by zero, which would produce
    NaN and a NumPy RuntimeWarning.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

def predict(sentence):
    """Classify ``sentence`` as "positive" or "negative".

    The sentence is tokenized and embedded, then compared by cosine
    similarity with the two class prototypes ``pos_vec`` and ``neg_vec``.
    "positive" is returned only when the positive similarity is strictly
    greater; ties fall to "negative".
    """
    vec = sentence_vector(tokenize(sentence))
    pos_score = cosine(vec, pos_vec)
    neg_score = cosine(vec, neg_vec)
    if pos_score > neg_score:
        return "positive"
    return "negative"

In [96]:
# ----------------------
# Simple sentiment prototypes
# ----------------------
# One averaged embedding per training review, stacked into a 2-D array.
X_train_vec = np.array(list(map(sentence_vector, train_corpus)))

# Map y_train strings -> 0/1
y_train_bin = np.array([int(lbl == "positive") for lbl in y_train])

# Class prototypes: the mean training vector of each sentiment class.
positive_rows = X_train_vec[y_train_bin == 1]
negative_rows = X_train_vec[y_train_bin == 0]
pos_vec = positive_rows.mean(axis=0)
neg_vec = negative_rows.mean(axis=0)

# Accuracy on test: share of held-out reviews whose prediction matches the label.
preds = list(map(predict, X_test))
n_correct = sum(1 for p, t in zip(preds, y_test) if p == t)
acc = n_correct / len(y_test)
print("Accuracy:", acc)

Accuracy: 0.6530172413793104


  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
