In [7]:
from pyspark.ml.feature import CountVectorizer, Tokenizer, StopWordsRemover
from pyspark.ml.clustering import LDA
from pyspark.sql import SparkSession
import pyLDAvis

from pyspark.sql.functions import col
from gensim import corpora, models
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from tqdm import tqdm
from itertools import product

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("LDATopicModeling")
    .getOrCreate()
)

23/11/19 15:20:33 WARN Utils: Your hostname, Liangchengs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.249.113.255 instead (on interface en0)
23/11/19 15:20:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/19 15:20:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the cleaned cellphone reviews from JSON, then keep only the two columns
# the rest of the notebook works with: the cleaned review and its rating.
json_file_path = "cellphone_clean.json"
cellphone_df_cleaned = spark.read.json(json_file_path)
reviews_df = cellphone_df_cleaned.select(col("cleaned_review"), col("overall"))

## Reviews by Sentiment

In [4]:
# Make the cell safe to re-run: a previous execution leaves a `features` column
# on `reviews_df`, and transforming again would try to add the same output
# column a second time. `drop` is a no-op when the column is absent, so the
# first run on a fresh kernel is unaffected.
reviews_df = reviews_df.drop("features")

# Separate reviews based on ratings: above 3 is positive, exactly 3 is neutral,
# below 3 is negative.
positive = reviews_df.filter(reviews_df["overall"] > 3)
neutral = reviews_df.filter(reviews_df["overall"] == 3)
negative = reviews_df.filter(reviews_df["overall"] < 3)

# Convert to term frequency vectors (vocabulary capped at 1000 terms)
cv = CountVectorizer(inputCol="cleaned_review", outputCol="features", vocabSize=1000)
cv_model = cv.fit(reviews_df)
reviews_df = cv_model.transform(reviews_df)

# Each sentiment bucket is fitted separately, so each gets its own vocabulary
positive_reviews = cv.fit(positive).transform(positive)
neutral_reviews = cv.fit(neutral).transform(neutral)
negative_reviews = cv.fit(negative).transform(negative)


23/11/19 15:20:39 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [5]:
# Bring the `cleaned_review` value of every row back to the driver, one Python
# list per sentiment bucket
positive_tokenized_reviews = [
    row.cleaned_review for row in positive_reviews.select("cleaned_review").collect()
]
neutral_tokenized_reviews = [
    row.cleaned_review for row in neutral_reviews.select("cleaned_review").collect()
]
negative_tokenized_reviews = [
    row.cleaned_review for row in negative_reviews.select("cleaned_review").collect()
]

# One gensim dictionary per sentiment, built from that sentiment's reviews only
positive_dictionary = corpora.Dictionary(positive_tokenized_reviews)
neutral_dictionary = corpora.Dictionary(neutral_tokenized_reviews)
negative_dictionary = corpora.Dictionary(negative_tokenized_reviews)

# Bag-of-words corpus per sentiment, encoded with the matching dictionary
positive_corpus = list(map(positive_dictionary.doc2bow, positive_tokenized_reviews))
neutral_corpus = list(map(neutral_dictionary.doc2bow, neutral_tokenized_reviews))
negative_corpus = list(map(negative_dictionary.doc2bow, negative_tokenized_reviews))


Positive Topics:


## Hyperparameter tuning

In [None]:
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
import matplotlib.pyplot as plt
from itertools import product

def tune_lda_hyperparameters(corpus, dictionary, texts, num_topics_list, alpha_values, eta_values, passes_values,
                             random_state=None):
    """
    Grid-search LDA hyperparameters and keep the combination with the highest c_v coherence.

    One LdaModel is trained for every combination of the candidate values, and
    each model is scored with a c_v CoherenceModel. Coherence is the only
    selection criterion; perplexity is not computed.

    Parameters:
    - corpus: Gensim corpus
    - dictionary: Gensim dictionary
    - texts: List of tokenized texts
    - num_topics_list: Iterable of candidate values for num_topics
    - alpha_values: Iterable of candidate values for alpha (numbers or 'symmetric' / 'asymmetric')
    - eta_values: Iterable of candidate values for eta
    - passes_values: Iterable of candidate values for passes
    - random_state: Optional seed forwarded to every LdaModel so that runs are
      repeatable. The default (None) leaves training unseeded, as before. The
      seed is not included in best_params; pass it again when refitting.

    Returns:
    - best_params: Dictionary with keys 'num_topics', 'alpha', 'eta', 'passes'
      for the best-scoring combination (empty if the grid has no combinations)
    - best_coherence_score: Coherence of that combination (-inf if the grid has
      no combinations)
    """

    # Materialise the grid once so the progress bar total comes from the grid
    # itself and the candidate arguments may be any iterable, not only lists.
    grid = list(product(num_topics_list, alpha_values, eta_values, passes_values))

    best_coherence_score = -float('inf')
    best_params = {}

    for num_topics, alpha, eta, passes in tqdm(grid, total=len(grid), desc="Tuning LDA Hyperparameters"):
        # Train LDA model
        lda_model = LdaModel(corpus, num_topics=num_topics, alpha=alpha, eta=eta, passes=passes,
                             id2word=dictionary, random_state=random_state)

        # Calculate coherence
        coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence = coherence_model.get_coherence()

        # Strict '>' keeps the first combination seen when scores tie
        if coherence > best_coherence_score:
            best_coherence_score = coherence
            best_params = {
                'num_topics': num_topics,
                'alpha': alpha,
                'eta': eta,
                'passes': passes
            }

    return best_params, best_coherence_score

In [None]:
# Candidate values for the grid search: 3 * 5 * 3 * 3 = 135 combinations
num_topics_list = [3, 4, 5]
alpha_values = ["symmetric", "asymmetric", 0.1, 0.5, 1.0]
eta_values = [0.1, 0.5, 1.0]
passes_values = [5, 10, 15]

### Positive reviews

In [21]:
# Grid-search the LDA settings on the positive-review corpus
best_params_positive, best_coherence_score = tune_lda_hyperparameters(
    positive_corpus,
    positive_dictionary,
    positive_tokenized_reviews,
    num_topics_list,
    alpha_values,
    eta_values,
    passes_values,
)

# Report the winning combination and its coherence
print("Optimal hyperparameters for positive sentiment:")
print(best_params_positive)

print(f"Best coherence for positive sentiment: {best_coherence_score}")

Optimal hyperparameters for positive sentiment:
{'num_topics': 5, 'alpha': 0.1, 'eta': 1.0, 'passes': 5}
Best coherence for positive sentiment: 0.51055295340953


In [15]:
# Refit an LDA model on the positive reviews with the tuned hyperparameters
lda_model_positive = LdaModel(
    corpus=positive_corpus,
    id2word=positive_dictionary,
    **best_params_positive,
)

# Prepare the pyLDAvis data and render it inline
vis_data_positive = gensimvis.prepare(
    lda_model_positive, positive_corpus, positive_dictionary
)
pyLDAvis.display(vis_data_positive)

In [16]:
# Write the positive-sentiment visualization to an HTML file
pyLDAvis.save_html(vis_data_positive, "positive.html")

### Neutral reviews

In [8]:
# Grid-search the LDA settings on the neutral-review corpus
best_params_neutral, best_coherence_neutral = tune_lda_hyperparameters(
    neutral_corpus,
    neutral_dictionary,
    neutral_tokenized_reviews,
    num_topics_list,
    alpha_values,
    eta_values,
    passes_values,
)

# Report the winning combination and its coherence
print("Optimal hyperparameters for neutral sentiment:")
print(best_params_neutral)

print(f"Best coherence for neutral sentiment: {best_coherence_neutral}")


Tuning LDA Hyperparameters: 100%|█████████████| 135/135 [08:32<00:00,  3.80s/it]

Optimal hyperparameters for neutral sentiment:
{'num_topics': 5, 'alpha': 'asymmetric', 'eta': 1.0, 'passes': 15}
Best coherence for neutral sentiment: 0.4549683883570156





In [9]:
# Refit an LDA model on the neutral reviews with the tuned hyperparameters
lda_model_neutral = LdaModel(
    corpus=neutral_corpus,
    id2word=neutral_dictionary,
    **best_params_neutral,
)

# Prepare the pyLDAvis data and render it inline
vis_data_neutral = gensimvis.prepare(
    lda_model_neutral, neutral_corpus, neutral_dictionary
)
pyLDAvis.display(vis_data_neutral)

In [None]:
# Write the neutral-sentiment visualization to an HTML file
pyLDAvis.save_html(vis_data_neutral, "neutral.html")

### Negative reviews

In [11]:
# Grid-search the LDA settings on the negative-review corpus
best_params_negative, best_coherence_negative = tune_lda_hyperparameters(
    negative_corpus,
    negative_dictionary,
    negative_tokenized_reviews,
    num_topics_list,
    alpha_values,
    eta_values,
    passes_values,
)

# Report the winning combination and its coherence. The name printed here must
# be the one assigned above; the misspelling `best_params_negtive` is undefined.
print("Optimal hyperparameters for negative sentiment:")
print(best_params_negative)

print(f"Best coherence for negative sentiment: {best_coherence_negative}")


Tuning LDA Hyperparameters: 100%|█████████████| 135/135 [16:11<00:00,  7.19s/it]

Optimal hyperparameters for negative sentiment:
{'num_topics': 5, 'alpha': 'symmetric', 'eta': 1.0, 'passes': 15}
Best coherence for negative sentiment: 0.44223401246763094





In [19]:
# Refit an LDA model on the negative reviews with the tuned hyperparameters.
# `best_params_negative` is the name produced by the tuning cell.
lda_model_negative = LdaModel(negative_corpus, id2word=negative_dictionary, **best_params_negative)

# Visualize the topics using pyLDAvis. The result is stored as
# `vis_data_negative`, the name the HTML export cell reads.
vis_data_negative = gensimvis.prepare(lda_model_negative, negative_corpus, negative_dictionary)
pyLDAvis.display(vis_data_negative)

In [None]:
# Write the negative-sentiment visualization to an HTML file
pyLDAvis.save_html(vis_data_negative, "negative.html")

In [None]:
# Stop the Spark session created at the top of the notebook. Cells that use
# `spark` need the session to be created again before they can be re-run.
spark.stop()