In [43]:
import pandas as pd

# Load the dataset
file_path = 'fakeReviewData.csv'
raw_data = pd.read_csv(file_path) #reading the csv file

# Displaying the first few rows of the raw data
print(raw_data.head())

             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  
0  Love this!  Well made, sturdy, and very comfor...  
1  love it, a great upgrade from the original.  I...  
2  This pillow saved my back. I love the look and...  
3  Missing information on how to use it, but it i...  
4  Very nice set. Good quality. We have had the s...  


In [44]:
# DATA CLEANING

In [45]:
# Handling missing values by removing rows with any NaN values
clean_data = raw_data.dropna()

# Removing the duplicate rows
clean_data = clean_data.drop_duplicates()

# Displaying the first few rows of the cleaned data
print(clean_data.head())

             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  
0  Love this!  Well made, sturdy, and very comfor...  
1  love it, a great upgrade from the original.  I...  
2  This pillow saved my back. I love the look and...  
3  Missing information on how to use it, but it i...  
4  Very nice set. Good quality. We have had the s...  


In [47]:
 # NORMALIZING TEXT

In [49]:
import re

def normalize_text(text):

    # Converting text to lowercase
    text = text.lower()

    # Removing punctuation, special characters, and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    return text

# Applying the normalization function to the text column
clean_data['text_'] = clean_data['text_'].apply(normalize_text)

# Displaying the first few rows of the Normalized data
print(clean_data.head())


             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  
0  love this  well made sturdy and very comfortab...  
1  love it a great upgrade from the original  ive...  
2  this pillow saved my back i love the look and ...  
3  missing information on how to use it but it is...  
4  very nice set good quality we have had the set...  


In [50]:
# TOKENIZATION

In [52]:
import nltk
from nltk.tokenize import word_tokenize

# Downloading the NLTK tokenizer models
nltk.download('punkt')
nltk.download('punkt_tab')

# Tokenizing the 'text_' column using NLTK's word_tokenize function
clean_data['tokens'] = clean_data['text_'].apply(word_tokenize)

# Displaying the first few rows of the Tokenized data
print(clean_data.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  \
0  love this  well made sturdy and very comfortab...   
1  love it a great upgrade from the original  ive...   
2  this pillow saved my back i love the look and ...   
3  missing information on how to use it but it is...   
4  very nice set good quality we have had the set...   

                                              tokens  
0  [love, this, well, made, sturdy, and, very, co...  
1  [love, it, a, great, upgrade, from, the, origi...  
2  [this, pillow, saved, my, back, i, love, the, ...  
3  [missing, information, on, how, to, use, it, b...  
4  [very, nice, set, good, quality, we, have, had...  


In [54]:
# REMOVING THE STOP WORDS

In [55]:
from nltk.corpus import stopwords

# Downloading stopwords list and tokenizer models
nltk.download('stopwords')

# list of English stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):

    filtered_tokens = []

    for word in tokens:

        # Check if the word is not in the set of stopwords
        if word not in stop_words:

            # Add the word to the filtered list if it's not a stopword
            filtered_tokens.append(word)

    return filtered_tokens

clean_data['tokens'] = clean_data['tokens'].apply(remove_stopwords)

# Displaying the first few rows after the stop words have been removed
print(clean_data.head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  \
0  love this  well made sturdy and very comfortab...   
1  love it a great upgrade from the original  ive...   
2  this pillow saved my back i love the look and ...   
3  missing information on how to use it but it is...   
4  very nice set good quality we have had the set...   

                                              tokens  
0  [love, well, made, sturdy, comfortable, love, ...  
1  [love, great, upgrade, original, ive, mine, co...  
2    [pillow, saved, back, love, look, feel, pillow]  
3  [missing, information, use, great, product, pr...  
4       [nice, set, good, quality, set, two, months]  


In [57]:
# LEMATIZATION

In [58]:
from nltk.stem import WordNetLemmatizer

# Downloading the WordNet lemmatizer models
nltk.download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def apply_lemmatization(tokens):

    # Initialize an empty list to store lemmatized tokens
    lemmatized_tokens = []

    # Iterate over each token in the list
    for word in tokens:

        # Lemmatize the word and add it to the lemmatized_tokens list
        lemmatized_tokens.append(lemmatizer.lemmatize(word,pos='n'))

    return lemmatized_tokens


# Applying lemmatization to the 'tokens' column
clean_data['tokens'] = clean_data['tokens'].apply(apply_lemmatization)

# Displaying the first few rows of the lemmatized data
print(clean_data.head())



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  \
0  love this  well made sturdy and very comfortab...   
1  love it a great upgrade from the original  ive...   
2  this pillow saved my back i love the look and ...   
3  missing information on how to use it but it is...   
4  very nice set good quality we have had the set...   

                                              tokens  
0  [love, well, made, sturdy, comfortable, love, ...  
1  [love, great, upgrade, original, ive, mine, co...  
2    [pillow, saved, back, love, look, feel, pillow]  
3  [missing, information, use, great, product, pr...  
4        [nice, set, good, quality, set, two, month]  


In [60]:
# Word2Vec TRANSFORMATION

In [59]:
from gensim.models import Word2Vec

# Step 1: Prepare Corpus
# Extract the 'tokens' column as a list of tokenized sentences
corpus = clean_data['tokens'].tolist()

# Step 2: Train Word2Vec Model
# Initialize and train the Word2Vec model
model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1)

# Step 3: Create Sentence Vectors
# Function to compute the average word vector for a list of tokens

def get_sentence_vector(tokens, model):

  # Initialize an empty list to store word vectors
  vectors = []

  # Loop through each word in tokens
  for word in tokens:
     # Check if the word exists in the Word2Vec model's vocabulary
      if word in model.wv:
          # Retrieve the word vector and append it to the list
          vectors.append(model.wv[word])


  if vectors:
      return sum(vectors) / len(vectors)
  else:
      return [0] * model.vector_size  # Return zero vector for empty tokens



# Applying the function to compute sentence vectors
clean_data['sentence_vector'] = clean_data['tokens'].apply(lambda x: get_sentence_vector(x, model))

# Step 4: Save Processed Data
clean_data.to_csv("final_fake_review_data.csv", index=False)

# Downloading the final processed file

from google.colab import files
files.download("final_fake_review_data.csv")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [61]:
# DONE

In [63]:
# Displaying the first few rows of the data
data = pd.read_csv('final_fake_review_data.csv');
print(data.head())

             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  \
0  love this  well made sturdy and very comfortab...   
1  love it a great upgrade from the original  ive...   
2  this pillow saved my back i love the look and ...   
3  missing information on how to use it but it is...   
4  very nice set good quality we have had the set...   

                                              tokens  \
0  ['love', 'well', 'made', 'sturdy', 'comfortabl...   
1  ['love', 'great', 'upgrade', 'original', 'ive'...   
2  ['pillow', 'saved', 'back', 'love', 'look', 'f...   
3  ['missing', 'information', 'use', 'great', 'pr...   
4  ['nice', 'set', 'good', 'quality', 'set', 'two...   

                                     sentence_vector  
0  [ 1.13825691e+00 -1.23274910e+00 

In [65]:
model.wv.most_similar('bad')

[('thats', 0.7340174913406372),
 ('guess', 0.7258275747299194),
 ('terrible', 0.7166757583618164),
 ('horrible', 0.7091423273086548),
 ('sad', 0.7072671055793762),
 ('hate', 0.7050395607948303),
 ('expecting', 0.703579843044281),
 ('saying', 0.6955850720405579),
 ('reformed', 0.6738972663879395),
 ('stupid', 0.6734278202056885)]