# Install Packages

In [1]:
! python3 -m pip install openai selenium pandas textblob plotly

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://ariel_cohen_codar%40mckinsey.com:****@mckinsey.jfrog.io/artifactory/api/pypi/python/simple
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [1]:
import os
import re
import time

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from openai import OpenAI
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from textblob import TextBlob
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager



# Scraping data

In [3]:
# Your Glassdoor credentials are read from the environment so that no secret
# is stored in the notebook. Export GLASSDOOR_USERNAME and GLASSDOOR_PASSWORD
# before starting the kernel; a missing variable raises KeyError here, before
# any browser is launched.
# NOTE(review): the password that used to be hardcoded in this cell should be
# treated as compromised and rotated.
username = os.environ["GLASSDOOR_USERNAME"]
password = os.environ["GLASSDOOR_PASSWORD"]

# Set up Chrome options
options = Options()
# `Options.headless` was deprecated and later removed in Selenium 4; passing
# the command-line switch is the supported way to request headless Chrome.
options.add_argument("--headless=new")  # Adjust as per your requirement

# Set up the Chrome WebDriver with the correct executable path
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# Function to check and handle login
def check_and_login(driver, username, password, overlay_timeout=60, field_timeout=10):
    """Log in through the "hardsell" form when the current page shows it.

    Does nothing when no element with id ``hardsellUserEmail`` is present.

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.
        username: Value typed into the e-mail field.
        password: Value typed into the password field.
        overlay_timeout: Seconds to wait for the consent overlay text
            (``onetrust-policy-text``) to become visible.
        field_timeout: Seconds to wait for the password field to become
            visible after the e-mail step is submitted.

    Raises:
        TimeoutException: If the password field does not become visible
            within ``field_timeout`` seconds.
        NoSuchElementException: If one of the form controls is missing.
    """
    # Check if the email form is present
    if not driver.find_elements(By.ID, "hardsellUserEmail"):
        return

    # Wait for the overlay; only a timeout means "there is no overlay".
    # Any other WebDriver failure is a real error and is allowed to propagate
    # instead of being reported as a missing overlay.
    try:
        WebDriverWait(driver, overlay_timeout).until(
            EC.visibility_of_element_located((By.ID, "onetrust-policy-text"))
        )
        # TODO: add code to dismiss the overlay here
        # Example: driver.find_element(By.CSS_SELECTOR, "dismiss_button_selector").click()
    except TimeoutException:
        print("No overlay to dismiss")

    # Fill in the email and submit
    driver.find_element(By.ID, "hardsellUserEmail").send_keys(username)
    driver.find_element(By.CSS_SELECTOR, "[data-test='email-form-button']").click()

    # Wait for the password field explicitly instead of sleeping a fixed
    # two seconds, which fails whenever the page is slower than that.
    password_field = WebDriverWait(driver, field_timeout).until(
        EC.visibility_of_element_located((By.ID, "hardsellUserPassword"))
    )

    # Fill in the password and submit
    password_field.send_keys(password)
    driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
    time.sleep(2)  # Wait for the login to complete

def get_sub_ratings(driver, review):
    """Collect the per-category sub-ratings shown in a review's tooltip.

    A synthetic ``mouseover`` event is dispatched on the review's caret icon,
    then every sub-rating row found under ``review`` is read. A row's score is
    ``5 - p / 100`` where ``p`` is the ``--outline-percentage`` value in the
    inline style of the row's last star.

    Args:
        driver: Selenium WebDriver used to run the scroll and hover scripts.
        review: WebElement of a single review.

    Returns:
        dict mapping the category text to its score. Failures are printed
        rather than raised, so the dict holds whatever was collected before
        the failure (possibly nothing).
    """
    hover_script = "var evt = new MouseEvent('mouseover', {'view': window, 'bubbles': true, 'cancelable': true}); arguments[0].dispatchEvent(evt);"
    scores = {}

    try:
        # Bring the caret into view and fake a hover so the tooltip is rendered
        caret = review.find_element(By.CSS_SELECTOR, "svg.review-details__review-details-module__caret")
        driver.execute_script("arguments[0].scrollIntoView(true);", caret)
        driver.execute_script(hover_script, caret)

        # Give the tooltip time to appear
        time.sleep(2)  # Adjust time as necessary

        for rating_row in review.find_elements(By.CSS_SELECTOR, ".review-details__review-details-module__subRating"):
            try:
                label = rating_row.find_element(By.CSS_SELECTOR, ".review-details__review-details-module__subRatingText").text
                star_nodes = rating_row.find_elements(By.CSS_SELECTOR, ".rating-star__rating-star-module__RatingStarContainer")
                if not star_nodes:
                    continue

                # Only the last star's inline style is inspected
                style_attr = star_nodes[-1].get_attribute('style')
                found = re.search(r'--outline-percentage: (\d+)%', style_attr)
                if found is None:
                    continue

                scores[label] = 5 - (int(found.group(1)) / 100)
            except NoSuchElementException:
                # Skip this row and carry on with the next one
                print("Subrating element not found for category in this review.")

    except NoSuchElementException as e:
        print("Caret element or tooltip not found for this review:", e)
    except Exception as e:
        print("An error occurred while extracting subratings:", e)

    return scores








# List to hold all review data
all_reviews_data = []

# CSS selector of each scraped text field, keyed by the output column name
review_field_selectors = {
    "Title": ".review-details__review-details-module__title",
    "Rating": ".review-details__review-details-module__overallRating",
    "Date": ".review-details__review-details-module__reviewDate",
    "Pros": "[data-test='pros']",
    "Cons": "[data-test='cons']",
}

try:
    # Maximizing is loop-invariant, so it is done once instead of per page
    driver.maximize_window()

    # Iterate through each page from P1 to P10
    for page_num in range(1, 11):
        # Construct the URL for each page
        url = f"https://www.glassdoor.com/Reviews/Bringg-Reviews-E1460556_P{page_num}.htm?filter.iso3Language=eng"

        # Navigate to the URL and wait for the page to load
        driver.get(url)
        time.sleep(2)

        # Check and perform login if necessary
        check_and_login(driver, username, password)

        # Find the review elements on the page
        reviews = driver.find_elements(By.CSS_SELECTOR, ".review-details__review-details-module__topReview")

        # Extract data from each review on the current page
        for review in reviews:
            subratings = get_sub_ratings(driver, review)
            print(f"Extracting review subratings: {subratings}")
            review_dict = {
                column: review.find_element(By.CSS_SELECTOR, selector).text
                for column, selector in review_field_selectors.items()
            }
            # Sub-rating categories become additional columns of the row
            review_dict.update(subratings)
            all_reviews_data.append(review_dict)
finally:
    # Close the WebDriver even when a page fails, so that no browser
    # process is left running after an exception.
    driver.quit()

# Create a DataFrame from the list of dictionaries
reviews_df = pd.DataFrame(all_reviews_data)


Subrating element not found for category in this review.
Extracting review subratings: {'Work/Life Balance': 5.0, 'Diversity & Inclusion': 5.0, 'Career Opportunities': 4.0, 'Compensation and Benefits': 5.0, 'Senior Management': 5.0}
Subrating element not found for category in this review.
Extracting review subratings: {'Work/Life Balance': 3.0, 'Diversity & Inclusion': 1.0, 'Career Opportunities': 2.0, 'Compensation and Benefits': 2.0, 'Senior Management': 1.0}
Subrating element not found for category in this review.
Extracting review subratings: {'Work/Life Balance': 3.0, 'Diversity & Inclusion': 1.0, 'Career Opportunities': 1.0, 'Compensation and Benefits': 1.0, 'Senior Management': 1.0}
Subrating element not found for category in this review.
Extracting review subratings: {'Work/Life Balance': 4.0, 'Diversity & Inclusion': 4.0, 'Career Opportunities': 4.0, 'Compensation and Benefits': 5.0, 'Senior Management': 4.0}
Caret element or tooltip not found for this review: Message: no such

# Overview of the data scraped and Save File

In [2]:
# Load the previously processed reviews workbook and preview its first rows.
# NOTE(review): this rebinds `reviews_df`, and the workbook is named after
# Incredibuild while the scraping cell targets the Bringg page — confirm that
# this is the intended input.
processed_reviews_path = "../../../data/Incredibuild/Employees Reviews/reviews_Incredibuild_processed.xlsx"
reviews_df = pd.read_excel(processed_reviews_path)
reviews_df.head()

Unnamed: 0,Title,Rating,Date,Pros,Cons,Work/Life Balance,Diversity & Inclusion,Career Opportunities,Compensation and Benefits,Senior Management,...,overall_sentiment_score,overall_sentiment_category,Career_Opportunity,Management,Compensation,Company_Culture,Work_Life_Balance,Product,Layoffs or Attrition,Communication
0,"Fun, fast growing, a little chaotic",5,2022-05-10,"- New CRO is great, seems to have a big effect...",- Structure is still coming together... be pre...,5.0,5.0,5.0,5.0,5.0,...,0.186295,Positive,No,Yes,No,Yes,No,No,No,Yes
1,"It's Not Just a Job, It's an Adventure!",4,2023-01-05,I am enjoying becoming part of this amazingly ...,"Incredibuild is a global company, with HQ in I...",5.0,4.0,4.0,5.0,5.0,...,0.131534,Positive,Yes,Yes,No,Yes,Yes,No,No,Yes
2,Incredible being an Incredibuilder,5,2022-05-10,Solid product. Amazing logos that use it. Grea...,None that I can think of right now,5.0,5.0,5.0,5.0,5.0,...,0.227273,Positive,No,Yes,No,No,No,Yes,No,No
3,"Diversity, Balance, and Opportunities",5,2023-08-24,Diversity: One of the standout features of Inc...,"Workload: At times, the dynamic nature of the ...",5.0,5.0,5.0,4.0,5.0,...,-0.093764,Negative,Yes,Yes,No,Yes,Yes,No,No,No
4,Insights,5,2023-09-28,- Great Work Environment - Career Growth Oppor...,Can't think on special cons,5.0,5.0,5.0,5.0,5.0,...,0.217424,Positive,Yes,No,No,Yes,Yes,No,No,No


# Exploratory Data Analysis

## General EDA on reviews score and data quality

In [3]:
# Convert 'Rating' from string to numeric
reviews_df['Rating'] = pd.to_numeric(reviews_df['Rating'])

# Convert 'Date' to datetime; string dates are expected in the
# '%b %d, %Y' layout (e.g. "May 10, 2022")
reviews_df['Date'] = pd.to_datetime(reviews_df['Date'], format='%b %d, %Y')

# Basic statistics
print("Basic Statistics:")
print(reviews_df.describe())

# Data quality check
print("\nData Quality Check:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
reviews_df.info()

# Number of Reviews and Date Range
print("\nNumber of Reviews:", reviews_df.shape[0])
print("Date Range:", reviews_df['Date'].min(), "to", reviews_df['Date'].max())

# Average Rating
print("\nAverage Rating:", reviews_df['Rating'].mean())


Basic Statistics:
          Rating                           Date  Work/Life Balance  \
count  43.000000                             43          37.000000   
mean    4.348837  2022-04-10 02:47:26.511628032           4.540541   
min     1.000000            2019-12-11 00:00:00           3.000000   
25%     4.000000            2021-04-22 00:00:00           4.000000   
50%     5.000000            2022-07-06 00:00:00           5.000000   
75%     5.000000            2022-11-15 00:00:00           5.000000   
max     5.000000            2023-09-28 00:00:00           5.000000   
std     0.973059                            NaN           0.690997   

       Diversity & Inclusion  Career Opportunities  Compensation and Benefits  \
count              35.000000             36.000000                  36.000000   
mean                4.485714              4.000000                   4.222222   
min                 2.000000              1.000000                   2.000000   
25%                 4.00000

In [4]:
# Histogram of the overall 'Rating' values, with a small gap between bars
fig = px.histogram(
    reviews_df,
    x='Rating',
    nbins=30,
    title='Distribution of Ratings',
).update_layout(bargap=0.1)
fig.show()

In [5]:
# Make sure 'Date' is a datetime column and derive the monthly period
# ('Month_Year') that the aggregation below groups on
reviews_df['Date'] = pd.to_datetime(reviews_df['Date'])
reviews_df['Month_Year'] = reviews_df['Date'].dt.to_period('M')

# Reviews per month, then their running total
monthly_review_counts = reviews_df.groupby('Month_Year').size()
cumulative_reviews = monthly_review_counts.cumsum()

# Line chart of the running total (periods are shown as strings on the x axis)
cumulative_trace = go.Scatter(
    x=cumulative_reviews.index.astype(str),
    y=cumulative_reviews.values,
    mode='lines+markers',
    name='Cumulative Reviews',
)
fig = go.Figure(data=cumulative_trace)
fig.update_layout(
    title='Cumulative Number of Reviews Over Time',
    xaxis_title='Month and Year',
    yaxis_title='Cumulative Number of Reviews',
)
fig.show()

In [7]:
# Review length = number of characters in 'Pros' plus number in 'Cons'
pros_chars = reviews_df['Pros'].str.len()
cons_chars = reviews_df['Cons'].str.len()
reviews_df['Review_Length'] = pros_chars + cons_chars

# Histogram of the combined lengths
fig = px.histogram(
    reviews_df,
    x='Review_Length',
    nbins=30,
    title='Distribution of Review Lengths',
).update_layout(bargap=0.1)
fig.show()

# Sentiment analysis

## Using Textblob

In [9]:
# Function to calculate sentiment
def calculate_sentiment(text):
    """Return the TextBlob polarity of ``text``, or 0 when it is missing.

    Args:
        text: Review text. ``None``/NaN/``pd.NA`` count as missing. Any other
            non-string value (for example a number that came out of an Excel
            cell) is converted with ``str`` first, because TextBlob only
            accepts strings.

    Returns:
        The polarity reported by TextBlob, or 0 for a missing value.
    """
    if pd.isna(text):
        return 0
    return TextBlob(str(text)).sentiment.polarity

# Score 'Pros' and 'Cons' separately; each result is stored in the matching
# '<column>_Sentiment' column
for source_column in ('Pros', 'Cons'):
    reviews_df[f'{source_column}_Sentiment'] = reviews_df[source_column].apply(calculate_sentiment)

In [10]:
# Overall sentiment score: row-wise mean of the Pros and Cons polarities
part_columns = ['Pros_Sentiment', 'Cons_Sentiment']
reviews_df['Overall_Sentiment'] = reviews_df[part_columns].mean(axis=1)

# Summary statistics of the three sentiment columns
sentiment_summary = reviews_df[part_columns + ['Overall_Sentiment']].describe()
print(sentiment_summary)

       Pros_Sentiment  Cons_Sentiment  Overall_Sentiment
count       92.000000       92.000000          92.000000
mean         0.418952        0.100030           0.259491
std          0.268998        0.290969           0.202301
min         -0.645833       -0.700000          -0.312500
25%          0.254687        0.000000           0.148672
50%          0.401429        0.020556           0.253541
75%          0.640938        0.200000           0.387500
max          0.866667        1.000000           0.933333


In [11]:
# Histogram of the overall sentiment scores
fig = px.histogram(
    reviews_df,
    x='Overall_Sentiment',
    nbins=30,
    title='Overall Sentiment Score Distribution',
).update_layout(bargap=0.1)
fig.show()

In [12]:
# Mean overall sentiment for each distinct rating value, drawn as bars
sentiment_by_rating = reviews_df.groupby('Rating')['Overall_Sentiment'].mean()
rating_bars = go.Bar(x=sentiment_by_rating.index, y=sentiment_by_rating.values)
fig = go.Figure(data=[rating_bars])
fig.update_layout(
    title='Average Overall Sentiment Score by Rating',
    xaxis_title='Rating',
    yaxis_title='Average Sentiment Score',
)
fig.show()

In [13]:
# Categorizing reviews as positive, negative, or neutral.
# pd.cut bins are open on the left by default, so without include_lowest a
# score of exactly -1 would fall outside every bin and become NaN;
# include_lowest=True makes the first bin cover -1 as 'Negative'.
reviews_df['Sentiment_Category'] = pd.cut(
    reviews_df['Overall_Sentiment'],
    bins=[-1, -0.01, 0.01, 1],
    labels=['Negative', 'Neutral', 'Positive'],
    include_lowest=True,
)

In [14]:
# Visualization: Average Review Length by Sentiment Category.
# 'Sentiment_Category' is categorical; observed=False is passed explicitly so
# that every category stays on the axis (even when empty) regardless of the
# pandas version's default for `observed`.
average_lengths = reviews_df.groupby('Sentiment_Category', observed=False)['Review_Length'].mean()
fig = go.Figure(data=[go.Bar(x=average_lengths.index, y=average_lengths.values)])
fig.update_layout(
    title='Average Review Length by Sentiment Category',
    xaxis_title='Sentiment Category',
    yaxis_title='Average Review Length',
)
fig.show()





In [15]:
# Visualization: mean overall sentiment per calendar month.
# 'Date' must already be a datetime column for the `.dt` accessor to work.
reviews_df['Month_Year'] = reviews_df['Date'].dt.to_period('M')
monthly_sentiment = reviews_df.groupby('Month_Year')['Overall_Sentiment'].mean()

sentiment_trace = go.Scatter(
    x=monthly_sentiment.index.astype(str),
    y=monthly_sentiment.values,
)
fig = go.Figure(data=sentiment_trace)
fig.update_layout(
    title='Average Overall Sentiment Over Time',
    xaxis_title='Month and Year',
    yaxis_title='Average Sentiment Score',
)
fig.show()

## Using Open AI

In [16]:
# The API key is read from the OPENAI_API_KEY environment variable so that no
# secret is stored in the notebook; a missing variable raises KeyError.
# NOTE(review): the key that used to be hardcoded in this cell should be
# treated as leaked and revoked.
client = OpenAI(
  api_key=os.environ["OPENAI_API_KEY"],
)

In [17]:
# Function to get model response
def get_model_response(messages, model='gpt-4', temperature=0.5, max_tokens=500):
    """Send ``messages`` to the chat-completions endpoint and return the reply text.

    Uses the module-level ``client`` and always requests a single completion.

    Args:
        messages: List of ``{'role': ..., 'content': ...}`` dicts.
        model: Model name forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion length limit forwarded to the API.

    Returns:
        The content of the first choice whose message role is ``'assistant'``,
        or an empty string when there is no such choice.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        n=1
    )
    for candidate in completion.choices:
        if candidate.message.role == 'assistant':
            return candidate.message.content
    return ''

In [18]:
# Reset both sentiment columns to empty strings; they are filled below with
# the model's free-text answers (this overwrites any earlier values stored
# under the same column names)
for sentiment_column in ('Pros_Sentiment', 'Cons_Sentiment'):
    reviews_df[sentiment_column] = ''

# Analyze sentiment for each review: 'Pros' first, then 'Cons'
for index, row in tqdm(reviews_df.iterrows(), total=reviews_df.shape[0]):
    for source_column in ('Pros', 'Cons'):
        statement = row[source_column]

        # Only non-missing text is sent to the model
        if pd.notna(statement):
            sentiment_messages = [
                {'role': 'system', 'content': 'You are a helpful assistant that analyzes text sentiment.'},
                {'role': 'user', 'content': f'Please analyze the sentiment of this statement: "{statement}"'}
            ]
            reviews_df.at[index, f'{source_column}_Sentiment'] = get_model_response(sentiment_messages)

        # Wait between requests (the pause happens even when the text was skipped)
        time.sleep(2)

  0%|          | 0/92 [00:00<?, ?it/s]

100%|██████████| 92/92 [13:14<00:00,  8.63s/it]


In [19]:
# Start from an empty 'Overall_Sentiment' column; it is filled with the
# model's free-text answer for each review
reviews_df['Overall_Sentiment'] = ''

# The system message is the same for every request
sentiment_system_message = {'role': 'system', 'content': 'You are a helpful assistant that analyzes text sentiment.'}

# Analyze overall sentiment for each review
for index, row in tqdm(reviews_df.iterrows(), total=reviews_df.shape[0]):
    # Pros and Cons joined by a single space form the text to analyse
    review_text = f"{row['Pros']} {row['Cons']}"
    user_prompt = f'Please analyze the sentiment of this review: "{review_text}"'

    sentiment_messages = [
        sentiment_system_message,
        {'role': 'user', 'content': user_prompt},
    ]

    # Store the response in the DataFrame
    reviews_df.at[index, 'Overall_Sentiment'] = get_model_response(sentiment_messages)

    # Wait between requests
    time.sleep(2)

100%|██████████| 92/92 [07:55<00:00,  5.17s/it]


In [20]:
# Function to calculate sentiment score
# NOTE(review): a function with this same name is already defined in the
# "Using Textblob" section; this definition rebinds it. Consider keeping a
# single definition.
def calculate_sentiment(text):
    """Return the TextBlob polarity of ``text``, or 0 when it is missing.

    Args:
        text: Text to score. ``None``/NaN/``pd.NA`` count as missing. Any
            other non-string value is converted with ``str`` first, because
            TextBlob only accepts strings.

    Returns:
        The polarity reported by TextBlob, or 0 for a missing value.
    """
    if pd.isna(text):
        return 0
    return TextBlob(str(text)).sentiment.polarity

# Compute overall sentiment score.
# NOTE(review): 'Overall_Sentiment' holds the model's free-text explanation at
# this point, so the polarity is measured on that explanation rather than on
# the review itself — confirm this is intended.
reviews_df['overall_sentiment_score'] = reviews_df['Overall_Sentiment'].apply(calculate_sentiment)

# Categorizing reviews as positive, negative, or neutral.
# pd.cut bins are open on the left by default, so without include_lowest a
# score of exactly -1 would fall outside every bin and become NaN;
# include_lowest=True makes the first bin cover -1 as 'Negative'.
reviews_df['overall_sentiment_category'] = pd.cut(
    reviews_df['overall_sentiment_score'],
    bins=[-1, -0.01, 0.01, 1],
    labels=['Negative', 'Neutral', 'Positive'],
    include_lowest=True,
)

## Check the output dataframe

In [21]:
reviews_df.head()

Unnamed: 0,Title,Rating,Date,Pros,Cons,Work/Life Balance,Diversity & Inclusion,Career Opportunities,Compensation and Benefits,Senior Management,Month_Year,Review_Length,Pros_Sentiment,Cons_Sentiment,Overall_Sentiment,Sentiment_Category,overall_sentiment_score,overall_sentiment_category
0,Great culture,5.0,2023-04-24,"Lots of trips, incentives and fun environment",Company not doing well. Layoffs over night wit...,5.0,5.0,4.0,5.0,5.0,2023-04,102,The sentiment of this statement is positive. T...,The sentiment of this statement is negative. I...,The sentiment of this review is mixed. The rev...,Positive,0.056818,Positive
1,Poor Leadership and Bad Prioritization of Prod...,2.0,2023-09-01,Primarily remote work for US based employees T...,- All talk and no action CEO - Israeli team ha...,3.0,1.0,2.0,2.0,1.0,2023-09,413,The sentiment of this statement is positive. T...,The sentiment of this statement is strongly ne...,The sentiment of this review is predominantly ...,Positive,0.097197,Positive
2,Avoid At All Costs,1.0,2022-12-10,"Honestly, none. The pay was good at the start ...",HR team is a complete and utter joke there. Do...,3.0,1.0,1.0,1.0,1.0,2022-12,997,The sentiment of this statement is primarily n...,The sentiment of this statement is highly nega...,The sentiment of this review is strongly negat...,Positive,-0.06381,Negative
3,Overhired in 2022,4.0,2022-12-06,They have done a good job on culture and pay v...,They are trying to grow to fast as they hired ...,4.0,4.0,4.0,5.0,4.0,2022-12,159,The sentiment of this statement is positive. T...,The sentiment of this statement is negative. I...,The sentiment of this review is mixed. The fir...,Positive,0.197159,Positive
4,Good place to work,5.0,2022-11-21,A Good place to work,None that I really can think of,,,,,,2022-11,51,"The sentiment of the statement ""A Good place t...","The sentiment of the statement ""None that I re...",The sentiment of this review is positive. The ...,Positive,0.463636,Positive


## Visualization

In [22]:
# Bar-style histogram counting the reviews in each sentiment category
fig = px.histogram(
    reviews_df,
    x='overall_sentiment_category',
    title='Distribution of Sentiment Categories',
).update_layout(bargap=0.2)
fig.show()


In [23]:
# Convert 'Date' to datetime (if not already done)
reviews_df['Date'] = pd.to_datetime(reviews_df['Date'], format='%Y-%m-%d')

# Aggregate sentiment by month: one row per month, one column per category.
# The category column is categorical; observed=False is passed explicitly so
# that every category gets a column (zero-filled when empty) regardless of
# the pandas version's default for `observed`.
reviews_df['Month_Year'] = reviews_df['Date'].dt.to_period('M')
monthly_sentiment = (
    reviews_df
    .groupby(['Month_Year', 'overall_sentiment_category'], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Visualization: Sentiment Evolution Over Time, one line per category
month_labels = monthly_sentiment.index.astype(str)
fig = go.Figure()
for sentiment_category in monthly_sentiment.columns:
    fig.add_trace(go.Scatter(
        x=month_labels,
        y=monthly_sentiment[sentiment_category],
        mode='lines',
        name=sentiment_category,
    ))

fig.update_layout(
    title='Sentiment Evolution Over Time',
    xaxis_title='Month and Year',
    yaxis_title='Number of Reviews',
    legend_title='Sentiment Category',
)
fig.show()





# Topics modeling

## Using OpenAI

The first cell concatenates all the reviews and asks the model for insights on all the dimensions/topics at once.

In [24]:
# Concatenate the pros and cons of every review into one text.
# Missing values are replaced by an empty string and other values are cast to
# str, because str.join raises TypeError as soon as one element is NaN.
# NOTE(review): nothing here limits the prompt size; all reviews are sent, so
# a large dataset may exceed the model's context window — consider sampling.
pros_text = reviews_df['Pros'].fillna('').astype(str)
cons_text = reviews_df['Cons'].fillna('').astype(str)
concatenated_reviews = ' '.join(pros_text + ' ' + cons_text)

# Prepare the prompt for analysis
analysis_topics = ["Career Opportunity", "Management", "Compensation", "Company Culture", "Work Life Balance"]
prompt_text = f"Analyze these employee reviews and provide detailed insights on the following topics: {', '.join(analysis_topics)}: {concatenated_reviews}"

# Create messages for the ChatGPT model
analysis_messages = [
    {'role': 'system', 'content': 'You are a helpful assistant that analyzes text sentiment and content.'},
    {'role': 'user', 'content': prompt_text}
]

# Get detailed analysis based on the reviews
detailed_analysis_all = get_model_response(analysis_messages)

The second cell splits the request so that each topic is analyzed in isolation.

In [25]:
# Topics to analyse, one model request per topic
analysis_topics = ["Career Opportunity", "Management", "Compensation", "Company Culture", "Work Life Balance"]

# The system message is identical for every topic
topic_system_message = {'role': 'system', 'content': 'You are a helpful assistant that analyzes text sentiment and content.'}

# Maps each topic to the model's answer
detail_analysis = {}

for topic in analysis_topics:
    topic_prompt = f'Analyze these employee reviews and provide detailed insights on the following topic: {topic}: {concatenated_reviews}'
    message_ = [
        topic_system_message,
        {'role': 'user', 'content': topic_prompt},
    ]
    detail_analysis[topic] = get_model_response(message_)

    # Pause three minutes after every request, including the last one
    time.sleep(180)

The final topic-analysis cell matches each review to the topics it mentions, to back up the insights generated in the cell above.

In [28]:
# One output column per category; each starts empty and receives the model's
# answer for every review
analysis_categories = [
    'Career_Opportunity',
    'Management',
    'Compensation',
    'Company_Culture',
    'Work_Life_Balance',
    'Product',
    'Layoffs or Attrition',
    'Communication',
]

for category in analysis_categories:
    reviews_df[category] = ''

# The system message is identical for every request
tagging_system_message = {'role': 'system', 'content': 'You are a helpful assistant that analyzes text sentiment.'}

# Ask the model, for each review and each category, whether the review
# mentions that category (one request per review/category pair)
for index, row in tqdm(reviews_df.iterrows(), total=reviews_df.shape[0]):
    review_text = f"{row['Pros']} {row['Cons']}"

    for category in analysis_categories:
        question = f'Please check if this review talks about "{category.lower()}" in the company: "{review_text}". Answer with "Yes" or "No"'
        messages = [
            tagging_system_message,
            {'role': 'user', 'content': question},
        ]
        response = get_model_response(messages)
        reviews_df.at[index, category] = response

        # Wait between requests to avoid hitting API rate limits
        time.sleep(2)

# Save the dataset with categorized comments
reviews_df.to_excel("../../../data/reviews_Bringg_processed.xlsx", index=False)


  0%|          | 0/92 [00:00<?, ?it/s]

100%|██████████| 92/92 [10:21:47<00:00, 405.51s/it]    


## Distribution of themes score over time

The first visualization is the cumulative average score over time for each theme

In [29]:
# Sub-rating columns plotted over time in the cells below
score_columns = [
    "Work/Life Balance",
    "Diversity & Inclusion",
    "Career Opportunities",
    "Compensation and Benefits",
    "Senior Management",
]


In [30]:
# One figure per score column: expanding mean of the monthly means
for column in score_columns:
    # Monthly mean of the column (NaN values are skipped by mean())
    monthly_scores = reviews_df.groupby('Month_Year')[column].mean()

    # Expanding mean over the months seen so far, then months without any
    # review are filled with the last known value
    cumulative_avg = monthly_scores.expanding().mean()
    cumulative_avg_filled = cumulative_avg.resample('M').ffill()

    cumulative_trace = go.Scatter(
        x=cumulative_avg_filled.index.astype(str),
        y=cumulative_avg_filled.values,
        mode='lines',
        name='Cumulative Average',
    )
    fig = go.Figure(data=cumulative_trace)
    fig.update_layout(
        title=f'Cumulative Average of {column} Over Time',
        xaxis_title='Month and Year',
        yaxis_title=f'Cumulative Average {column} Score',
    )
    fig.show()

The following code then isolates the average score given in each month, regardless of past scores.

In [31]:
# One figure per score column: the monthly mean on its own, without any
# accumulation over earlier months
for column in score_columns:
    # Monthly mean of the column (NaN values are skipped by mean())
    monthly_scores = reviews_df.groupby('Month_Year')[column].mean()

    # Insert the missing months, then forward-fill the gaps
    monthly_scores_filled = monthly_scores.resample('M').asfreq().ffill()

    # Split the filled series into months that had a group in the original
    # aggregation and months that were only added by the resampling
    in_original_index = monthly_scores_filled.index.isin(monthly_scores.index)
    original_data = monthly_scores_filled[in_original_index]
    filled_data = monthly_scores_filled[~in_original_index]

    # Only the original months are drawn; the dashed 'Filled Data' trace built
    # from `filled_data` is currently disabled
    fig = go.Figure(data=[
        go.Scatter(
            x=original_data.index.astype(str),
            y=original_data.values,
            mode='lines',
            name='Original Data',
        )
    ])
    fig.update_layout(
        title=f'Average {column} Score Over Time (with Filled Data)',
        xaxis_title='Month and Year',
        yaxis_title=f'Average {column} Score',
    )
    fig.show()


CONCLUSION:

