In [4]:
import os
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

# Fetch the NLTK 'punkt' resource into the local nltk_data directory.
# NOTE(review): presumably needed by word_tokenize (imported above) — confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/yogesh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Directory the per-variant preprocessed CSV files are read from.
# NOTE(review): both paths are absolute and machine-specific; consider making
# them configurable so the notebook can run elsewhere.
preprocessed_dir = "/home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/processed"
# Directory every embedding CSV is written to; created below if missing.
embeddings_dir = "/home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings"
os.makedirs(embeddings_dir, exist_ok=True)

# Base names (without ".csv") of the preprocessed datasets to embed. Each name
# is used both to locate the input file and as the prefix of the output files.
dataset_names = [
    "preprocessed_lemmatization",
    "preprocessed_no_stopwords",
    "preprocessed_stemming",
    "preprocessed_stemming_no_stopwords",
    "preprocessed_no_stopwords_no_lemmatization",
]

def compute_tfidf(texts, dataset_name):
    """Fit a TF-IDF model on ``texts`` and save the dense matrix as CSV.

    The vectorizer is limited to 5000 features. Output goes to
    ``<embeddings_dir>/<dataset_name>_tfidf.csv`` with one column per
    feature name and no index column.

    Args:
        texts: Iterable of document strings to vectorize.
        dataset_name: Prefix used to build the output file name.

    Returns:
        None. The result is written to disk and the path is printed.
    """
    tfidf = TfidfVectorizer(max_features=5000)
    # The fitted matrix is converted to a dense array before building the
    # DataFrame, so memory use grows with n_documents * n_features.
    dense_matrix = tfidf.fit_transform(texts).toarray()
    feature_names = tfidf.get_feature_names_out()
    output_file = os.path.join(embeddings_dir, f"{dataset_name}_tfidf.csv")
    pd.DataFrame(dense_matrix, columns=feature_names).to_csv(output_file, index=False)
    print(f"TF-IDF embeddings saved: {output_file}")

# Already-loaded SentenceTransformer instances keyed by model name, so that
# calling compute_bert_embeddings once per dataset does not rebuild the model
# on every call.
_SENTENCE_MODEL_CACHE = {}

def compute_bert_embeddings(texts, dataset_name, model_name="all-MiniLM-L6-v2"):
    """Encode ``texts`` with a SentenceTransformer model and save them as CSV.

    The model identified by ``model_name`` is instantiated on first use and
    reused on later calls with the same name. Output goes to
    ``<embeddings_dir>/<dataset_name>_bert.csv`` with one row per input text
    and no index column.

    Args:
        texts: List of strings to encode.
        dataset_name: Prefix used to build the output file name.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        None. The result is written to disk and the path is printed.
    """
    model = _SENTENCE_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SENTENCE_MODEL_CACHE[model_name] = model
    embeddings = model.encode(texts, show_progress_bar=True)
    df_bert = pd.DataFrame(embeddings)
    output_file = os.path.join(embeddings_dir, f"{dataset_name}_bert.csv")
    df_bert.to_csv(output_file, index=False)
    print(f"BERT embeddings saved: {output_file}")

def load_glove_embeddings(glove_file_path):
    """Parse a whitespace-separated embeddings text file into a lookup table.

    Each non-blank line is split on whitespace: the first field is the word
    and the remaining fields are its vector components.

    Args:
        glove_file_path: Path to the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each word to a ``float32`` numpy array of its components.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a component cannot be converted to ``float32``.
    """
    embeddings_index = {}
    with open(glove_file_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank or whitespace-only line (e.g. a trailing newline at
                # end of file): there is no word to index, so skip it rather
                # than fail on values[0].
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

# Parsed embedding tables keyed by file path, so that calling
# compute_glove_embeddings once per dataset does not re-read and re-parse the
# whole embeddings file each time.
_GLOVE_CACHE = {}

def compute_glove_embeddings(texts, dataset_name, glove_file_path="/home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/grove_data/glove.6B.100d.txt", embedding_dim=100):
    """Build mean-of-word-vector embeddings for ``texts`` and save them as CSV.

    Each text is lower-cased and tokenized with ``word_tokenize``; the vectors
    of the tokens found in the embeddings table are averaged. A text with no
    known token gets an all-zero vector of length ``embedding_dim``. Output
    goes to ``<embeddings_dir>/<dataset_name>_glove.csv`` with one row per
    input text and no index column.

    Args:
        texts: Iterable of strings to embed.
        dataset_name: Prefix used to build the output file name.
        glove_file_path: Path to the embeddings text file; it is parsed on
            first use and the parsed table is reused on later calls with the
            same path.
        embedding_dim: Length of the zero vector used for texts without any
            known token. Should match the vector length in the file.

    Returns:
        None. The result is written to disk and the path is printed.
    """
    glove_embeddings = _GLOVE_CACHE.get(glove_file_path)
    if glove_embeddings is None:
        glove_embeddings = load_glove_embeddings(glove_file_path)
        _GLOVE_CACHE[glove_file_path] = glove_embeddings
    sentence_embeddings = []
    for text in texts:
        tokens = word_tokenize(text.lower())
        # Out-of-vocabulary tokens are ignored rather than zero-filled.
        valid_embeddings = [glove_embeddings[word] for word in tokens if word in glove_embeddings]
        if valid_embeddings:
            sent_vec = np.mean(valid_embeddings, axis=0)
        else:
            sent_vec = np.zeros(embedding_dim)
        sentence_embeddings.append(sent_vec)
    df_glove = pd.DataFrame(sentence_embeddings)
    output_file = os.path.join(embeddings_dir, f"{dataset_name}_glove.csv")
    df_glove.to_csv(output_file, index=False)
    print(f"GloVe embeddings saved: {output_file}")

# For every preprocessed dataset variant, load its CSV and write all three
# embedding types (TF-IDF, sentence-transformer, GloVe) to embeddings_dir.
for dataset_name in dataset_names:
    file_path = os.path.join(preprocessed_dir, f"{dataset_name}.csv")
    df = pd.read_csv(file_path)
    # Missing values become empty strings so every entry is a str before it
    # reaches the vectorizer/tokenizer.
    texts = df["processed_text"].fillna("").astype(str).tolist()
    
    compute_tfidf(texts, dataset_name)
    compute_bert_embeddings(texts, dataset_name)
    compute_glove_embeddings(texts, dataset_name)

print("All embeddings computed and saved in folder:", embeddings_dir)

TF-IDF embeddings saved: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings/preprocessed_lemmatization_tfidf.csv


Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

BERT embeddings saved: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings/preprocessed_lemmatization_bert.csv
GloVe embeddings saved: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings/preprocessed_lemmatization_glove.csv
TF-IDF embeddings saved: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings/preprocessed_no_stopwords_tfidf.csv


Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

BERT embeddings saved: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings/preprocessed_no_stopwords_bert.csv
GloVe embeddings saved: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings/preprocessed_no_stopwords_glove.csv
TF-IDF embeddings saved: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings/preprocessed_stemming_tfidf.csv


Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

BERT embeddings saved: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings/preprocessed_stemming_bert.csv
GloVe embeddings saved: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings/preprocessed_stemming_glove.csv
TF-IDF embeddings saved: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings/preprocessed_stemming_no_stopwords_tfidf.csv


Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

BERT embeddings saved: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings/preprocessed_stemming_no_stopwords_bert.csv
GloVe embeddings saved: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings/preprocessed_stemming_no_stopwords_glove.csv
TF-IDF embeddings saved: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings/preprocessed_no_stopwords_no_lemmatization_tfidf.csv


Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

BERT embeddings saved: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings/preprocessed_no_stopwords_no_lemmatization_bert.csv
GloVe embeddings saved: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings/preprocessed_no_stopwords_no_lemmatization_glove.csv
All embeddings computed and saved in folder: /home/yogesh/mlops/Mlop Projects/Fake Review Detection/data/embeddings
