In [1]:
# Imports
import os
import re
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from bert.tokenization import FullTokenizer
from tensorflow.keras import backend as K
import numpy as np
from tqdm import tqdm
import email

from sentence_transformers import SentenceTransformer, util


In [9]:
# Load the car-review table (vehicle title, review title, review text, rating).
df = pd.read_csv("../data/final_car_reviews.csv")
print("Loaded {} cars with {} columns".format(df.shape[0], df.shape[1]))
# Print one review as a sanity check on the text that gets embedded later.
print(df.loc[0, "Review"])


Loaded 167294 cars with 4 columns
beetle convertible 45 year andhave overall happy car compact convertibledo expect big trunk tall people back seat


In [11]:
# Work on an independent copy: a plain `testDf = df` only creates a second
# name for the same object, so every column added to `testDf` further down
# would silently show up on the originally loaded `df` as well.
testDf = df.copy()

testDf

Unnamed: 0,Vehicle_Title,Review_Title,Review,Rating
0,2007 Volkswagen New Beetle Convertible 2.5 2dr...,"New Beetle- Holds up well & Fun to Drive, but ...",beetle convertible 45 year andhave overall hap...,4.5
1,2007 Volkswagen New Beetle Convertible 2.5 PZE...,Quality Review,bought car new 2007 generally satisfied mechan...,4.5
2,2007 Volkswagen New Beetle Convertible Triple ...,Adore it,adore new beetle even though male get complime...,4.5
3,2007 Volkswagen New Beetle Convertible 2.5 2dr...,Nice Ragtop,wife chose car replace sebring convertible wan...,4.5
4,2007 Volkswagen New Beetle Convertible 2.5 2dr...,"Luv, luv, luv my dream car",4 carpool 1 way 30 min backseat ok normal size...,5.0
...,...,...,...,...
167289,2008 Chevrolet Equinox SUV LT 4dr SUV AWD (3.4...,Handy & comfortable,took new equinox road trip week got pleasant r...,5.0
167290,2008 Chevrolet Equinox SUV Sport 4dr SUV AWD (...,Pretty Happy Guy,got 2008 equinox sport liking fun drive map po...,4.0
167291,2008 Chevrolet Equinox SUV LTZ 4dr SUV (3.4L 6...,"Great car, all the options, smooth ride",bought ltz black metallic lot research escape ...,4.5
167292,2008 Chevrolet Equinox SUV LT 4dr SUV AWD (3.4...,Great Car,love car first suv never thought would ever wa...,5.0


In [14]:
# Sentence-embedding model used for both the reviews and the search query.
MODEL_NAME = 'sentence-transformers/paraphrase-distilroberta-base-v1'
model = SentenceTransformer(MODEL_NAME)

def preprocess_text(text: str) -> str:
    """Normalise free text before it is embedded.

    Every run of characters other than ASCII letters and digits is collapsed
    into a single space, and surrounding whitespace is removed.

    Args:
        text: Raw text, e.g. a user query.

    Returns:
        The cleaned text, without leading or trailing spaces.
    """
    # Strip *after* substituting: punctuation at either end of the input is
    # turned into a space by the regex, and stripping first would leave that
    # space behind (e.g. "car!" -> "car ").
    return re.sub(r'[^A-Za-z0-9]+', ' ', text).strip()

def embedding_fn(sentences):
    """Encode text with the notebook-wide sentence-transformer ``model``.

    Args:
        sentences: Text to encode, handed to ``model.encode`` unchanged
            (``recommend`` passes a single pre-processed query string).

    Returns:
        Whatever ``model.encode`` returns for the given input.
    """
    embeddings = model.encode(sentences)
    return embeddings

def recommend(query, top_n):
    """Rank the reviews in ``testDf`` by cosine similarity to a free-text query.

    Args:
        query: Free-text description of what the user is looking for.
        top_n: Number of best-matching reviews to return.

    Returns:
        A float ``pd.Series`` named ``Embeddings`` with the ``top_n`` highest
        cosine-similarity scores in descending order, indexed by the labels of
        the matching ``testDf`` rows.
    """
    query_embedd = embedding_fn(preprocess_text(query))
    review_embeddings = testDf.Embeddings

    # Nothing to rank: return an empty float Series, as before.
    if review_embeddings.empty:
        return review_embeddings.astype(float).nlargest(top_n)

    # Score every review with one matrix operation instead of one
    # pytorch_cos_sim call (plus tensor conversion) per row.
    review_matrix = np.stack(review_embeddings.tolist())
    similarity = util.pytorch_cos_sim(query_embedd, review_matrix)
    scores = similarity.detach().cpu().numpy().flatten()

    # Keep the original Series name and dtype so existing callers see the
    # same structure as with the row-wise implementation.
    ranked = pd.Series(scores, index=review_embeddings.index, name=review_embeddings.name)
    return ranked.astype(float).nlargest(top_n)

In [12]:
# NOTE(review): astype(str) presumably turns missing reviews into the literal
# text "nan", which then gets embedded like any other review - confirm that
# this is intended.
testDf['Review'] = testDf['Review'].astype(str)

# Encode all reviews in one batched call. Calling model.encode once per row
# through `apply` forfeits the model's internal batching, which is far slower
# on the ~167k reviews loaded above.
review_embeddings = model.encode(testDf['Review'].tolist(), show_progress_bar=True)

# Store one embedding vector per row, the layout `recommend` reads.
testDf['Embeddings'] = list(review_embeddings)

testDf


Unnamed: 0,Vehicle_Title,Review_Title,Review,Rating,Embeddings
0,2007 Volkswagen New Beetle Convertible 2.5 2dr...,"New Beetle- Holds up well & Fun to Drive, but ...",beetle convertible 45 year andhave overall hap...,4.5,"[0.22639257, 0.36218312, 0.08677771, -0.002072..."
1,2007 Volkswagen New Beetle Convertible 2.5 PZE...,Quality Review,bought car new 2007 generally satisfied mechan...,4.5,"[0.12528875, 0.50444865, 0.29752502, -0.359071..."
2,2007 Volkswagen New Beetle Convertible Triple ...,Adore it,adore new beetle even though male get complime...,4.5,"[0.29634258, 0.6638004, 0.27571627, -0.1195450..."
3,2007 Volkswagen New Beetle Convertible 2.5 2dr...,Nice Ragtop,wife chose car replace sebring convertible wan...,4.5,"[0.15129209, 0.447633, 0.2971824, -0.6222487, ..."
4,2007 Volkswagen New Beetle Convertible 2.5 2dr...,"Luv, luv, luv my dream car",4 carpool 1 way 30 min backseat ok normal size...,5.0,"[0.19023652, 0.674556, 0.4787647, 0.0641884, 0..."
...,...,...,...,...,...
167289,2008 Chevrolet Equinox SUV LT 4dr SUV AWD (3.4...,Handy & comfortable,took new equinox road trip week got pleasant r...,5.0,"[0.06779692, 0.35908717, 0.309544, -0.08886097..."
167290,2008 Chevrolet Equinox SUV Sport 4dr SUV AWD (...,Pretty Happy Guy,got 2008 equinox sport liking fun drive map po...,4.0,"[0.01645045, 0.22581078, 0.20471416, -0.303288..."
167291,2008 Chevrolet Equinox SUV LTZ 4dr SUV (3.4L 6...,"Great car, all the options, smooth ride",bought ltz black metallic lot research escape ...,4.5,"[0.104743056, 0.08839858, 0.5288735, 0.1699691..."
167292,2008 Chevrolet Equinox SUV LT 4dr SUV AWD (3.4...,Great Car,love car first suv never thought would ever wa...,5.0,"[0.33955178, 0.1906727, -0.014535184, -0.26957..."


In [24]:
# Free-text description of the car the user is looking for.
search_test = ['I want a small car with a small fuel consumption and a nice body paint']

# recommend() takes a single string, so the list entries are joined first.
query = '\n'.join(search_test)

results = recommend(query, 10)
print(results)

57954     0.665226
58542     0.652322
159578    0.649902
84161     0.641032
105516    0.640417
52145     0.630904
79645     0.623170
103784    0.622454
77093     0.621571
84045     0.621493
Name: Embeddings, dtype: float64


In [26]:
# Show the matched reviews. `results` maps a row label to its similarity
# score, so each label is looked up once in the reviews frame.
for key, val in results.items():
    row = df.loc[key]
    print('================================================\n')
    print(f'Score: {val:.4f}')
    print(f'Car Title: {row["Vehicle_Title"]}')
    print(f'Review Title: {row["Review_Title"]}')
    # This line prints the review body, not its title.
    print(f'Review: {row["Review"]}')
    


Score: 0.6652
Car Title: 2014 Hyundai Accent Hatchback GS 4dr Hatchback (1.6L 4cyl 6A)
Review Title: Comfortable Basic Car
Review Title: small cheap car look nice feel comfortable get good gas mileage small improvement order would highly recommend

Score: 0.6523
Car Title: 2013 Hyundai Accent Sedan GLS 4dr Sedan (1.6L 4cyl 6A)
Review Title: Great small car
Review Title: great small car look good get good gas mileage would highly recommends someone looking nice power mileage

Score: 0.6499
Car Title: 2015 Chevrolet Cruze Diesel Diesel 4dr Sedan (2.0L 4cyl Turbodiesel 6A)
Review Title: MyCruze
Review Title: best small car good gas milage

Score: 0.6410
Car Title: 2010 Kia Forte Koup Koup EX 2dr Coupe (2.0L 4cyl 5M)
Review Title: The Red Baron
Review Title: great small car

Score: 0.6404
Car Title: 2001 Ford Focus Sedan ZTS 4dr Sedan (2.0L 4cyl 5M)
Review Title: My ZTS
Review Title: think great little car fun 2 drive many convenience small car could use little power overall great car pri

In [27]:
# Persist the reviews frame, including the Embeddings column, as CSV.
# NOTE(review): the Embeddings cells hold arrays, which CSV can only store as
# text - confirm that consumers of this file do not need numeric vectors.
roberta_csv_path = "../data/roberta_reviewsData.csv"
testDf.to_csv(roberta_csv_path)

In [28]:
import pickle

In [29]:
def save_bert_tokens_to_pickle(dataframe, column_name, file_path):
    """Pickle one column of ``dataframe`` to ``file_path``.

    The function is best-effort: problems are reported on stdout instead of
    being raised, and nothing is returned.

    Args:
        dataframe: Object supporting ``dataframe[column_name]`` lookups,
            typically a pandas DataFrame.
        column_name: Label of the column to serialise.
        file_path: Destination path of the pickle file; an existing file is
            overwritten.
    """
    # Look the column up on its own so that only a genuinely missing column
    # is reported as "not found"; a KeyError raised while pickling must not
    # be mistaken for one.
    try:
        bert_tokens_column = dataframe[column_name]
    except KeyError:
        print(f"Error: Column '{column_name}' not found in the DataFrame.")
        return
    except Exception as e:
        print(f"An error occurred: {e}")
        return

    # Save the column containing BERT tokens to a pickle file
    try:
        with open(file_path, 'wb') as file:
            pickle.dump(bert_tokens_column, file)
    except Exception as e:
        print(f"An error occurred: {e}")
        return

    print(f"Column '{column_name}' containing BERT tokens saved to {file_path} successfully.")



In [30]:
# Persist the Embeddings column so it can be reloaded without re-encoding.
save_bert_tokens_to_pickle(
    dataframe=testDf,
    column_name='Embeddings',
    file_path='../data/bert_tokens.pkl',
)

Column 'Embeddings' containing BERT tokens saved to ../data/bert_tokens.pkl successfully.


In [31]:
# Round-trip check: reload the pickled column and display it.
# Only unpickle files this notebook produced - pickle can run arbitrary code.
tokens_pickle_path = "../data/bert_tokens.pkl"
unpickled_tokens = pd.read_pickle(tokens_pickle_path)
unpickled_tokens

0         [0.22639257, 0.36218312, 0.08677771, -0.002072...
1         [0.12528875, 0.50444865, 0.29752502, -0.359071...
2         [0.29634258, 0.6638004, 0.27571627, -0.1195450...
3         [0.15129209, 0.447633, 0.2971824, -0.6222487, ...
4         [0.19023652, 0.674556, 0.4787647, 0.0641884, 0...
                                ...                        
167289    [0.06779692, 0.35908717, 0.309544, -0.08886097...
167290    [0.01645045, 0.22581078, 0.20471416, -0.303288...
167291    [0.104743056, 0.08839858, 0.5288735, 0.1699691...
167292    [0.33955178, 0.1906727, -0.014535184, -0.26957...
167293    [-0.08480321, 0.55719674, 0.121915124, -0.2326...
Name: Embeddings, Length: 167294, dtype: object