### Import libraries and data preprocessing

In [1]:
import re
import unicodedata
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk 
# Fetch the NLTK stop-word corpus that stopwords.words() reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ccsar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [23]:
# Stop-word sets keyed by language. Filled lazily so the NLTK corpus is read
# once instead of on every preprocess_text call (the function runs once per
# document when used as a vectorizer preprocessor).
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word list for `language` as a cached frozenset."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Clean a raw text string for bag-of-words style vectorisation.

    Steps, in order: strip accents / non-ASCII characters, lowercase,
    remove HTML tags, remove punctuation, remove digits, split on
    whitespace, drop English stop words, and re-join with single spaces.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The remaining lowercase tokens joined by single spaces
        (an empty string when nothing is left).
    """
    # Remove accents: NFKD splits characters into base + combining marks and
    # the ASCII round-trip drops everything that is not plain ASCII.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

    # Lowercase AFTER normalisation: NFKD can expand compatibility characters
    # into uppercase ASCII (e.g. the trademark sign becomes "TM"), which would
    # survive if lowercasing happened first.
    text = text.lower()

    # Remove HTML tags (non-greedy; '.' does not cross line breaks)
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation. Characters are deleted, not replaced by a space,
    # so "don't" becomes "dont"; underscores are kept because \w matches them.
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize on whitespace
    tokens = text.split()

    # Remove English stopwords (tokens are already lowercase at this point)
    stop_words = _get_stop_words('english')
    tokens = [token for token in tokens if token not in stop_words]

    # Join the tokens back into a single string
    return ' '.join(tokens)

# Clean a sample sentence to check the preprocessing
preprocess_text('What can I say, I love this place')

'say love place'

In [7]:
## Raw GitHub URL of the sampled reviews parquet file; loaded directly into a DataFrame
url = 'https://raw.githubusercontent.com/ccsarmientot/natural_language_processing/master/datasets/reviews_sample.parquet'
df_reviews = pd.read_parquet(url)
# Display the last rows as a quick look at the loaded data
df_reviews.tail()

Unnamed: 0,name,text
1091919,D-Lites,Super impressed by this local shop. The gentle...
271820,Mariner Car Wash,Waited 30 minutes while my car sat to be detai...
607165,Zapata's Mexican Bar & Grill,"What can I say, I love this place. Started o..."
172729,Surf and Turf,"That would be a rip off, lobster sandwich w ba..."
738809,Sushi Ninja Tampa,Pretty Good... I ordered the South Tampa Sushi...


### Modelling using TFIDF

In [14]:
# Import libraries
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Input series: the 'text' column of the reviews DataFrame
X = df_reviews['text']

# Split the data into training and test sets (20% held out, fixed seed)
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [21]:
# Build the pipeline: TF-IDF features -> PCA reduction -> KMeans clustering.
# random_state is fixed on the stochastic steps (same seed as the train/test
# split) so repeated runs produce the same components and cluster labels.
kmeans_model = Pipeline(steps=[
    # preprocess_text is used as the vectorizer's preprocessing step.
    ("tfidf", TfidfVectorizer(max_features=3000, preprocessor=preprocess_text)),
    # NOTE(review): the TF-IDF output is a sparse matrix; older scikit-learn
    # releases reject sparse input to PCA (TruncatedSVD is the usual
    # alternative) - confirm against the installed version.
    ("pca", PCA(n_components=100, random_state=42)),
    # The step is named "logit" but holds a KMeans clusterer; the name is
    # kept unchanged so existing step lookups keep working.
    ("logit", KMeans(n_clusters=2, random_state=42))
])

In [22]:
# Fit every pipeline step on the training reviews.
kmeans_model.fit(X_train)

In [24]:
# Assign each test review to one of the fitted clusters.
y_pred = kmeans_model.predict(X_test)

In [25]:
# Pair each test review with its predicted cluster label for inspection.
# NOTE(review): the saved output shows already-cleaned text, while X_test as
# defined in this notebook holds the raw 'text' column - presumably the output
# comes from an earlier kernel state; confirm with a fresh Restart & Run All.
df_acc = pd.DataFrame({"review": X_test, "cluster": y_pred})
df_acc

Unnamed: 0,review,cluster
322615,great place hike dog visit dog park lots trail...,1
332063,another almost dead ipad battery made appointm...,0
532483,really enjoyed poke bowl thought ingredients f...,1
1002386,sooo good kickin chicken burrito massive delic...,1
867833,sure heard reviews every single nail salon tha...,0
...,...,...
670583,review reference sales associate purchased new...,0
1138671,wife pulled hotel decided go first restaurant ...,1
771196,love place ive salt rock grille times place mu...,1
395443,tyler internet sales manager shah finance mana...,0


### Using embeddings

In [16]:
from sentence_transformers import SentenceTransformer

# Load the pretrained multilingual sentence-embedding model.
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Encode the reviews with the sentence-transformer model.
# Pass plain lists rather than the pandas Series: encode() is documented for
# str / list of str and indexes its input by position, which on a Series with
# a shuffled integer index (as produced by train_test_split) is a label
# lookup and can raise KeyError or pick the wrong row.
X_train_embed = embedding_model.encode(X_train.tolist(), batch_size=512)
X_test_embed = embedding_model.encode(X_test.tolist(), batch_size=512)

In [None]:
# Build the pipeline for the embedding features: PCA reduction -> KMeans.
# random_state is fixed on both steps (same seed as the train/test split) so
# repeated runs produce the same components and cluster labels.
# Note: this rebinds kmeans_model, replacing the TF-IDF pipeline object.
kmeans_model = Pipeline(steps=[
    ("pca", PCA(n_components=100, random_state=42)),
    # The step is named "logit" but holds a KMeans clusterer; the name is
    # kept unchanged so existing step lookups keep working.
    ("logit", KMeans(n_clusters=2, random_state=42))
])

# Fit on the precomputed training embeddings.
kmeans_model.fit(X_train_embed)

In [None]:
# Assign each test embedding to one of the fitted clusters.
y_pred_emb = kmeans_model.predict(X_test_embed)
# Pair each test review with its embedding-based cluster label
# (this rebinds df_acc, replacing the TF-IDF result frame).
df_acc = pd.DataFrame({"review": X_test, "cluster": y_pred_emb})
df_acc