In [16]:
!pip install nltk
!pip install fuzzywuzzy



Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [17]:
import pandas as pd
import nltk
from fuzzywuzzy import fuzz

nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
df = pd.read_csv('train_data.csv')


In [20]:
import pandas as pd
from nltk.corpus import wordnet

def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name())
    return set(synonyms)

def get_similar_spellings(word):
    spellings = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            spelling_ratio = fuzz.ratio(word, lemma.name())
            if spelling_ratio > 60:
                spellings.add(lemma.name())
    return spellings

In [21]:
warranty_synonyms = get_synonyms("warranty")
guarantee_synonyms = get_synonyms("guarantee")

warranty_similar_spellings = get_similar_spellings("warranty")
guarantee_similar_spellings = get_similar_spellings("guarantee")


all_terms = list(
    warranty_synonyms.union(warranty_similar_spellings) |
    guarantee_synonyms.union(guarantee_similar_spellings) |
    {'warranty', 'guarantee'}
)

print(all_terms)

['insure', 'guaranty', 'ensure', 'warrantee', 'undertake', 'warrant', 'guarantee', 'vouch', 'secure', 'warranty', 'assure']


In [53]:
filtered_rows = df[df['reviewText'].str.contains('|'.join(all_terms), case=False, na=False)]
filtered_rows.to_csv('filtered_data.csv', index=False)



In [55]:
mean_overall_by_asin = filtered_rows.groupby('asin')['overall'].mean()
print(mean_overall_by_asin)


asin
0972683275    5.0
1616825375    5.0
9806010728    4.0
B000001OM4    4.0
B000001OM5    5.0
             ... 
B00009R89L    4.5
B00009R8T5    2.0
B00009R8XD    5.0
B00009R9BF    5.0
B00009RDIF    5.0
Name: overall, Length: 153, dtype: float64


In [56]:
mean_overall_by_asin.to_csv('mean_overall_ratings.csv', header=True)


In [27]:
df[166:168].reviewText

166    Yes, it makes a lot of "mechanical" noise when...
167    This adapter works well with my 1994 Corvette....
Name: reviewText, dtype: object

In [28]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
Col

In [29]:
from transformers import AutoModel, AutoTokenizer


In [30]:
model = AutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [32]:
tokens_warranty = tokenizer.encode('warranty', return_tensors='pt')
tokens_guarantee = tokenizer.encode('guarantee', return_tensors='pt')

In [33]:
embeddings_warranty = model(tokens_warranty)[0].mean(dim=1)
embeddings_guarantee = model(tokens_guarantee)[0].mean(dim=1)

In [35]:
import torch
similarity_score = torch.nn.functional.cosine_similarity(embeddings_warranty, embeddings_guarantee)
print("Similarity between 'warranty' and 'guarantee':", similarity_score.item())

Similarity between 'warranty' and 'guarantee': 0.745226263999939


In [38]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Load pre-trained BERT model for masked language modeling
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Replace 'warranty' and 'guarantee' with [MASK] token to find synonyms
text = "The warranty on this product is excellent, and it gives me a guarantee."
text_masked = text.replace("warranty", "[MASK]").replace("guarantee", "[MASK]")

# Tokenize the text
tokens = tokenizer(text_masked, return_tensors="pt")

# Generate predictions for the masked tokens
with torch.no_grad():
    outputs = model(**tokens)

# Get the predicted token IDs
predicted_token_ids = torch.argmax(outputs.logits, dim=-1)

# Decode the predicted token IDs to words
predicted_words = tokenizer.batch_decode(predicted_token_ids)

# Filter out [CLS], [SEP], and [MASK] tokens
synonyms = [word for word in predicted_words[0].split() if word not in ['[CLS]', '[SEP]', '[MASK]']]

print("Synonyms for 'warranty' and 'guarantee':", synonyms)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Synonyms for 'warranty' and 'guarantee': ['.', 'quality', 'quality', 'on', 'this', 'product', 'is', 'excellent,', 'and', 'it', 'gives', 'me', 'a', 'headache..']


In [40]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline
import torch
import pandas as pd

# Load pre-trained BERT model for masked language modeling
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Load the masked language modeling pipeline
mlm_pipeline = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# Assuming your DataFrame is named df
# Replace 'your_dataset.csv' with your actual CSV file

# Function to get synonyms for a given word using the masked language modeling pipeline
def get_synonyms_with_pipeline(word):
    # Replace the word with [MASK] token and predict synonyms using the pipeline
    synonyms = mlm_pipeline(f"This product has a [MASK] that covers any issues with {word}.")
    return [result['token_str'] for result in synonyms]

# Function to get synonyms for a given word using the pre-trained BERT model
def get_synonyms_with_bert(word):
    # Replace the word with [MASK] token and predict synonyms using the pre-trained BERT model
    text = f"The {word} on this product is excellent, and it gives me a {word}."
    text_masked = text.replace(word, "[MASK]")

    # Tokenize the text
    tokens = tokenizer(text_masked, return_tensors="pt")

    # Generate predictions for the masked tokens
    with torch.no_grad():
        outputs = model(**tokens)

    # Get the predicted token IDs
    predicted_token_ids = torch.argmax(outputs.logits, dim=-1)

    # Decode the predicted token IDs to words
    predicted_words = tokenizer.batch_decode(predicted_token_ids)

    # Filter out [CLS], [SEP], and [MASK] tokens
    synonyms = [word for word in predicted_words[0].split() if word not in ['[CLS]', '[SEP]', '[MASK]']]

    return synonyms

# Get synonyms for 'warranty' using the masked language modeling pipeline
warranty_synonyms_pipeline = get_synonyms_with_pipeline('warranty')
print("Synonyms for 'warranty' using pipeline:", warranty_synonyms_pipeline)

# Get synonyms for 'guarantee' using the masked language modeling pipeline
guarantee_synonyms_pipeline = get_synonyms_with_pipeline('guarantee')
print("Synonyms for 'guarantee' using pipeline:", guarantee_synonyms_pipeline)

# Get synonyms for 'warranty' using the pre-trained BERT model
warranty_synonyms_bert = get_synonyms_with_bert('warranty')
print("Synonyms for 'warranty' using BERT:", warranty_synonyms_bert)

# Get synonyms for 'guarantee' using the pre-trained BERT model
guarantee_synonyms_bert = get_synonyms_with_bert('guarantee')
print("Synonyms for 'guarantee' using BERT:", guarantee_synonyms_bert)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Synonyms for 'warranty' using pipeline: ['label', 'package', 'specification', 'feature', 'license']
Synonyms for 'guarantee' using pipeline: ['label', 'package', 'specification', 'policy', 'scope']
Synonyms for 'warranty' using BERT: ['.', 'quality', 'quality', 'on', 'this', 'product', 'is', 'excellent,', 'and', 'it', 'gives', 'me', 'a', 'headache..']
Synonyms for 'guarantee' using BERT: ['.', 'quality', 'quality', 'on', 'this', 'product', 'is', 'excellent,', 'and', 'it', 'gives', 'me', 'a', 'headache..']


In [45]:
import nltk
nltk.download('stopwords')
import nltk
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [48]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Sample sentences for training
sentences = [
    "The product has a great warranty.",
    "I am satisfied with the guarantee provided.",
    "The warranty covers any defects in the product.",
    "The guarantee ensures customer satisfaction.",
    "I trust the warranty on this product."
]

# Tokenize and preprocess the sentences
stop_words = set(stopwords.words('english'))
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

# Train Word2Vec model
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Find words similar to 'warranty'
similar_words = model.wv.most_similar('guarantee', topn=5)
print("Words similar to 'warranty':", similar_words)



Words similar to 'warranty': [('great', 0.16694265604019165), ('i', 0.13885025680065155), ('warranty', 0.13151131570339203), ('covers', 0.09763337671756744), ('with', 0.07172605395317078)]


In [49]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch

# Load pre-trained BERT model for masked language modeling
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Replace 'warranty' with [MASK] token and predict synonyms
text = "The [MASK] on this product is excellent, and it gives me a guarantee."
text_masked = text.replace("warranty", "[MASK]")

# Tokenize the text
tokens = tokenizer(text_masked, return_tensors="pt")

# Generate predictions for the masked tokens
with torch.no_grad():
    outputs = model(**tokens)

# Get the predicted token IDs
predicted_token_ids = torch.argmax(outputs.logits, dim=-1)

# Decode the predicted token IDs to words
predicted_words = tokenizer.batch_decode(predicted_token_ids)

# Filter out [CLS], [SEP], and [MASK] tokens
similar_words = [word for word in predicted_words[0].split() if word not in ['[CLS]', '[SEP]', '[MASK]']]

print("Words similar to 'warranty':", similar_words)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Words similar to 'warranty': ['.', 'quality', 'quality', 'on', 'this', 'product', 'is', 'excellent,', 'and', 'it', 'gives', 'me', 'a', 'guarantee..']


In [52]:
from gensim.models import FastText
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Sample sentences for training
sentences = [
    "The product has a great warranty.",
    "I am satisfied with the guarantee provided.",
    "The warranty covers any defects in the product.",
    "The guarantee ensures customer satisfaction.",
    "I trust the warranty on this product."
]

# Tokenize and preprocess the sentences
stop_words = set(stopwords.words('english'))
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

# Train FastText model
model = FastText(sentences=tokenized_sentences, vector_size=50, window=10, min_count=1, workers=4)

# Find words similar to 'warranty'
similar_words = model.wv.most_similar('warranty', topn=5)
print("Words similar to 'warranty':", similar_words)


Words similar to 'warranty': [('guarantee', 0.1831463873386383), ('has', 0.12012948840856552), ('the', 0.1056489646434784), ('provided', 0.1030721515417099), ('a', 0.09100858122110367)]
