<a href="https://colab.research.google.com/github/amrutha2413/Sentiment-Analysis/blob/main/Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
#Installs gensim (topic modeling / word embeddings) together with pandas and scikit-learn
!pip install gensim pandas scikit-learn
#imports the Word2Vec model, which learns vector representation of words
from gensim.models import Word2Vec
#import NumPy which is used for numerical operations and vectors
import numpy as np
import pandas as pd
#imports regular expressions for text cleaning
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score




In [7]:
# ----------------------
# Dataset --> the training dataset
# Loaded from a CSV with the columns app_id, app_name, review_text,
# review_score and review_votes. review_text is used as the model input and
# review_score is the raw sentiment label (1 = positive, -1 = negative).
# ----------------------
df = pd.read_csv("/content/dataset.csv")
# Print the column names as a quick sanity check of the loaded schema.
# (Bare `df.head()` / `df.columns` expressions in the middle of a cell are
# never displayed, so they were dropped.)
print(df.columns)



Index(['app_id', 'app_name', 'review_text', 'review_score', 'review_votes'], dtype='object')


In [8]:
# Model input (raw review text) and raw sentiment scores.
texts = df["review_text"]
labels_raw = df["review_score"]

# Inspect the raw label distribution before touching it.
print("Unique values in df['review_score'] before mapping:", labels_raw.unique())
print("Value counts in df['review_score'] before mapping:\n", labels_raw.value_counts(dropna=False))

# Map review_score to binary sentiment: 1 for positive (score 1), 0 for everything else
# (score -1, and any NaN, because NaN == 1 evaluates to False).
# The comparison is vectorized, which avoids calling a Python lambda once per row.
labels = (labels_raw == 1).astype(int)
print("\nLabels after mapping:\n", labels.value_counts(dropna=False))

# The cast is a no-op after the mapping above; it is kept so the diagnostic print still
# reflects an explicit integer dtype.
labels = labels.astype(int)
print("\nLabels after astype(int):\n", labels.value_counts(dropna=False))

# Pair each review text with its binary label: [(text, label), ...].
sentences = list(zip(texts, labels))

Unique values in df['review_score'] before mapping: [ 1 -1]
Value counts in df['review_score'] before mapping:
 review_score
 1    5260420
-1    1156686
Name: count, dtype: int64

Labels after mapping:
 review_score
1    5260420
0    1156686
Name: count, dtype: int64

Labels after astype(int):
 review_score
1    5260420
0    1156686
Name: count, dtype: int64


In [9]:
#Defines a function to clean and split text into words
def tokenize(text):
    """Clean ``text`` and split it into lowercase word tokens.

    Parameters
    ----------
    text : object
        The raw review. ``None`` and float NaN are treated as missing and
        yield no tokens; any other non-string value is converted with ``str``.

    Returns
    -------
    list of str
        Whitespace-separated tokens containing only the letters a-z.
    """
    # Missing values carry no words. Passing them through str() would produce
    # the bogus tokens "none" / "nan" instead of an empty token list.
    if text is None or (isinstance(text, float) and text != text):
        return []
    # Any other non-string (e.g. a number) is converted to its string form.
    if not isinstance(text, str):
        text = str(text)
    # Converts all the characters to lowercase.
    text = text.lower()
    # Removes everything that is not a lowercase letter or whitespace.
    text = re.sub(r"[^a-z\s]", "", text)
    # Splits the sentence into a list of words.
    return text.split()


In [10]:
# Train / test split: separate the (text, label) pairs into parallel
# containers, then hold out 20 % of the reviews for testing.
X_texts = [pair[0] for pair in sentences]
y = np.array([pair[1] for pair in sentences])

# random_state is fixed so the same split is produced on every run.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    X_texts, y, test_size=0.2, random_state=42
)


In [None]:
def _tokenize_reviews(reviews):
    """Tokenize every review in ``reviews``, treating NaN entries as empty text."""
    tokenized = []
    for review in reviews:
        cleaned = '' if pd.isna(review) else review
        tokenized.append(tokenize(cleaned))
    return tokenized


# Lists of token lists, one entry per training / testing review.
train_corpus = _tokenize_reviews(X_train_texts)
test_corpus = _tokenize_reviews(X_test_texts)

In [None]:
# ----------------------
# Train Word2Vec
# ----------------------
# Embeddings are learned from the training reviews only, so no test text
# influences the vectors.
model = Word2Vec(
    sentences=train_corpus,  # tokenized training reviews to learn from
    vector_size=50,          # every word becomes a 50-dimensional vector
    window=4,                # context reaches up to 4 words on either side
    min_count=2,             # words seen fewer than 2 times are ignored
    sg=1,                    # 1 = Skip-gram, 0 would select CBOW
)

In [None]:
# ----------------------
# Sentence vector
# ----------------------
def sentence_vector(tokens):
    """Turn a list of word tokens into a single sentence vector.

    Each token found in the trained Word2Vec vocabulary contributes its
    embedding; the sentence vector is the element-wise mean of those
    embeddings. When no token is in the vocabulary, a zero vector of length
    ``model.vector_size`` is returned as a placeholder.
    """
    keyed_vectors = model.wv

    # Collect the embeddings of the in-vocabulary tokens only.
    known = []
    for word in tokens:
        if word in keyed_vectors:
            known.append(keyed_vectors[word])

    # No recognized word at all: fall back to the all-zero placeholder.
    if len(known) == 0:
        return np.zeros(model.vector_size)

    # Average across words (axis 0) to get one vector per sentence.
    return np.mean(known, axis=0)

In [None]:
# One sentence vector per tokenized review, stacked into 2-D feature arrays
# for the training and testing sets respectively.
X_train = np.array(list(map(sentence_vector, train_corpus)))
X_test = np.array(list(map(sentence_vector, test_corpus)))

In [None]:
# ----------------------
# Simple sentiment prototypes
# ----------------------
# Each prototype is the mean sentence vector of one class in the training set:
# pos_vec from reviews labeled 1, neg_vec from reviews labeled 0.
pos_vec = X_train[y_train == 1].mean(axis=0)
neg_vec = X_train[y_train == 0].mean(axis=0)

In [None]:
def cosine(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    The similarity is the dot product divided by the product of the L2 norms.
    If either vector has zero norm the similarity is undefined; 0.0 is
    returned instead of dividing by zero (which would yield NaN and a
    RuntimeWarning). Zero vectors do occur here: ``sentence_vector`` returns
    one for sentences without any in-vocabulary word.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

def predict_vector(v):
    """Classify sentence vector ``v`` by its closest sentiment prototype.

    Returns 1 (positive) when ``v`` is strictly more cosine-similar to
    ``pos_vec`` than to ``neg_vec``; in every other case, including ties,
    returns 0 (negative).
    """
    similarity_to_positive = cosine(v, pos_vec)
    similarity_to_negative = cosine(v, neg_vec)
    if similarity_to_positive > similarity_to_negative:
        return 1
    return 0


def predict(sentence):
    """Predict the sentiment of a raw ``sentence`` as "positive" or "negative".

    The sentence is tokenized, averaged into a sentence vector, and then
    classified with ``predict_vector``.
    """
    tokens = tokenize(sentence)
    label = predict_vector(sentence_vector(tokens))
    if label == 1:
        return "positive"
    return "negative"

In [None]:
# Accuracy evaluation on the held-out test set
# Predict a label for every test sentence vector ...
y_pred = np.array(list(map(predict_vector, X_test)))
# ... and report the fraction of predictions that match the true labels.
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy:", accuracy)
