In [1]:
import re
import random
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from collections import Counter
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from wordcloud import WordCloud
from scipy.stats import ttest_ind

import sys
sys.path.append('../utils')
import sentiment_utils

In [None]:
# Download the NLTK 'vader_lexicon' and 'stopwords' resources.
# The return value of the last call is what the cell displays.
nltk.download('vader_lexicon')
nltk.download('stopwords')

In [4]:
# Main Letterboxd films table.
films = pd.read_csv('../data/clean/letterboxd_clean_films.csv')

# Backup tables: one CSV per "exploded" column. They are read in the order
# listed here and then bound to the module-level names used by later cells.
backup_csv_paths = {
    'genres': '../data/local/raw/letterboxd/films_exploded_genres.csv',
    'languages': '../data/local/raw/letterboxd/films_exploded_languages.csv',
    'countries': '../data/local/raw/letterboxd/films_exploded_countries.csv',
    'events': '../data/local/raw/letterboxd/films_exploded_events.csv',
    'themes': '../data/local/raw/letterboxd/films_exploded_themes.csv',
}
backup_tables = {
    table_key: pd.read_csv(csv_path)
    for table_key, csv_path in backup_csv_paths.items()
}

films_exploded_genres = backup_tables['genres']
films_exploded_languages = backup_tables['languages']
films_exploded_countries = backup_tables['countries']
films_exploded_events = backup_tables['events']
films_exploded_themes = backup_tables['themes']

In [None]:
# Preview the first two rows of the films table.
films.head(2)

In [6]:
# Keep only films that have a value in every one of these descriptive columns.
required_columns = ['genres', 'language', 'countries', 'director']
films = films.dropna(subset=required_columns)

# Stricter variant: films with no missing value in any column at all.
cleaned_films = films.dropna()

In [8]:
# Number of rows per theme in the exploded themes table.
theme_column = films_exploded_themes['themes']
theme_counts = theme_column.value_counts().reset_index()

# Per-theme mean rating and number of non-null ratings.
rating_aggregations = {
    'average_rating': ('letterboxd_rating', 'mean'),
    'count': ('letterboxd_rating', 'count'),
}
films_by_theme = films_exploded_themes.groupby('themes')
theme_ratings = films_by_theme.agg(**rating_aggregations).reset_index()

Preprocess Text

In [9]:
# # Function to preprocess the text
# def preprocess_text(text):
#     text = text.lower()
#     text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabet characters
#     tokens = word_tokenize(text)  # Tokenize text
#     stop_words = set(stopwords.words('english'))  # Remove stopwords
#     tokens = [word for word in tokens if word not in stop_words]
#     return tokens

# # Function to analyze the most common words and plot a pie chart
# def analyze_most_common_words(df, text_column, top_n=50, save_path=None):
#     processed_column = f"processed_{text_column}"
    
#     # Preprocess the text column
#     df[processed_column] = df[text_column].apply(preprocess_text)  # Assumes preprocess_text is defined
    
#     # Flatten all tokens into a single list
#     all_tokens = [word for tokens in df[processed_column] for word in tokens]
#     total_word_count = len(all_tokens)
    
#     # Count the most common words
#     word_freq = Counter(all_tokens)
#     most_common_words = word_freq.most_common(top_n)
    
#     # Print results
#     print(f"Top {top_n} most common words in '{text_column}':\n")
#     print(f"{'Word':<15}{'Count':<10}{'Percentage (%)':<10}")
#     print("-" * 35)
#     for word, count in most_common_words:
#         percentage = (count / total_word_count) * 100
#         print(f"{word:<15}{count:<10}{percentage:.2f}")
    
#     # Prepare data for Plotly
#     words, counts = zip(*most_common_words)
#     words_capitalized = [word.capitalize() for word in words]  # Capitalize the first letter of each word
#     percentages = [round((count / total_word_count) * 100, 2) for count in counts]
    
#     fig = go.Figure(data=[go.Pie(
#         labels=words_capitalized,
#         values=counts,
#         hoverinfo='label+percent',
#         textinfo='label+percent',
#         textposition='outside',
#         pull=[0.1] * len(words),
#         marker=dict(colors=[
#             '#F4D6A0',
#             '#A8CBB7',
#             '#D4B9A3',
#             '#A6C6D9',
#             '#B3A0A1',
#             '#6E7B7A',
#             '#99A7A4',
#             '#C4D8C1',
#             '#3E4A49',
#             '#B8C6D0',
#         ])
#     )])

#     fig.update_layout(
#         title=f'Top Recurring Words in Events',
#         showlegend=True, 
#         plot_bgcolor='#f7f7f7', 
#         paper_bgcolor='#f7f7f7',
#         width=800
#     )
    
#     if save_path:
#         fig.write_image(save_path)
#         print(f"Plot saved to {save_path}")
        
#     fig.show()
    
#     return most_common_words

# analyze_most_common_words(cleaned_films, text_column='events', top_n=10)

#### Scatter plot to show the correlation between themes and average ratings

In [11]:
# theme_ratings = films_exploded_themes.groupby('themes')['letterboxd_rating'].mean().reset_index()

# fig = px.scatter(theme_ratings, 
#                  x='themes',   
#                  y='letterboxd_rating',  
#                  title="Average Rating Correlation for Each Theme",
#                  labels={'letterboxd_rating': 'Average Rating', 'themes': 'Film Themes'},
#                  hover_data=['letterboxd_rating', 'themes'],  
#                  color='themes',  
#                  color_discrete_sequence=px.colors.qualitative.Set3)  


# fig.update_layout(
#     xaxis_title="Themes",
#     yaxis_title="Average Rating",
#     showlegend=True, 
#     plot_bgcolor='white',
#     height=800,  
#     margin=dict(l=100, r=100, t=50, b=150),  
#     xaxis_tickangle=70  
# )

# fig.show()

#### Word Cloud of Most Frequent Themes

In [12]:
# theme_counts_dict = dict(zip(theme_counts['themes'], theme_counts['count']))

# # Custom color palette
# custom_colors = [
#     '#F4D6A0', '#A8CBB7', '#D4B9A3', '#A6C6D9', '#B3A0A1',
#     '#6E7B7A', '#99A7A4', '#C4D8C1', '#3E4A49', '#B8C6D0',
#     '#FFC857', '#FFE156', '#71A9F7', '#FF7F51', '#B5838D',
#     '#6B705C', '#DDBEA9', '#A5A58D', '#CCD5AE', '#E63946'
# ]

# # Define a custom color function based on the custom colors list
# def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
#     # Create a mapping of words to the custom color list
#     color_idx = list(theme_counts_dict.keys()).index(word) % len(custom_colors)
#     return custom_colors[color_idx]

# # Generate the word cloud
# wordcloud = WordCloud(
#     width=800, 
#     height=400, 
#     background_color='white', 
#     color_func=color_func  # Apply custom color function
# ).generate_from_frequencies(theme_counts_dict)

# # Display the word cloud
# plt.figure(figsize=(10, 6))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')  # Turn off axis
# plt.title('Word Cloud of Most Frequent Themes')
# plt.show()

In [13]:
# # Filter the top 20 themes by count
# top_20_themes = theme_ratings.nlargest(20, 'count')

# # Define the custom color palette
# custom_colors = [
#     '#F4D6A0', '#A8CBB7', '#D4B9A3', '#A6C6D9', '#B3A0A1',
#     '#6E7B7A', '#99A7A4', '#C4D8C1', '#3E4A49', '#B8C6D0',
#     '#FFC857', '#FFE156', '#71A9F7', '#FF7F51', '#B5838D',
#     '#6B705C', '#DDBEA9', '#A5A58D', '#CCD5AE', '#E63946'
# ]

# # Create the scatter plot for the top 20 themes
# fig = px.scatter(top_20_themes,
#                  x='average_rating', 
#                  y='themes',  
#                  size='count',  # Size of the bubbles based on theme frequency
#                  color='themes',  
#                  title="Top 20 Themes: Frequency vs. Average Rating",
#                  labels={'average_rating': 'Average Rating', 'themes': 'Film Themes'},
#                  color_discrete_sequence=custom_colors)

# fig.update_layout(
#     xaxis_title="Average Rating",
#     yaxis_title="Themes",
#     showlegend=False,  # Hide legend to avoid clutter
#     plot_bgcolor='white',
#     height=800,
#     width=1000,
#     xaxis_tickangle=45
# )

# fig.show()


In [14]:
# # Filter the top 20 themes by count
# top_20_themes = theme_ratings.nlargest(20, 'count')

# # Define a more refined, muted yet vivid color palette
# refined_colors = [
#     '#E27D60', '#85C1E9', '#6C5B7B', '#FFCF56', '#A9DFBF',
#     '#F5B041', '#5DADE2', '#AF7AC5', '#F1948A', '#7FB3D5',
#     '#58D68D', '#EB984E', '#85C1E9', '#48C9B0', '#D4AC0D',
#     '#85929E', '#B2BABB', '#D68910', '#E74C3C', '#7DCEA0'
# ]

# # Create the scatter plot for the top 20 themes
# fig = px.scatter(top_20_themes,
#                  x='average_rating', 
#                  y='themes',  
#                  size='count',  # Size of the bubbles based on theme frequency
#                  color='themes',  
#                  title="Top 20 Themes: Frequency vs. Average Rating",
#                  labels={'average_rating': 'Average Rating', 'themes': 'Film Themes'},
#                  color_discrete_sequence=refined_colors)

# fig.update_layout(
#     xaxis_title="Average Rating",
#     yaxis_title="Themes",
#     showlegend=True,  # Show the legend
#     plot_bgcolor='white',
#     height=800,
#     width=1000,
#     xaxis_tickangle=45,
#     xaxis=dict(tickformat='.2f')  # Show full rating values
# )

# fig.show()


In [None]:
# Colour sequence handed to plotly for the theme boxes.
custom_colors = [
    '#F4D6A0', '#A8CBB7', '#D4B9A3', '#A6C6D9', '#B3A0A1',
    '#6E7B7A', '#99A7A4', '#C4D8C1', '#3E4A49', '#B8C6D0',
    '#FFC857', '#FFE156', '#71A9F7', '#FF7F51', '#B5838D',
    '#6B705C', '#DDBEA9', '#A5A58D', '#CCD5AE', '#E63946'
]

# The 15 themes with the largest 'count' in theme_ratings, and every
# exploded-theme row that belongs to one of them.
top_15_themes_list = theme_ratings.nlargest(15, 'count')['themes']
in_top_15 = films_exploded_themes['themes'].isin(top_15_themes_list)
top_15_films = films_exploded_themes[in_top_15]

# One box per theme showing the spread of Letterboxd ratings.
box_options = dict(
    x='themes',
    y='letterboxd_rating',
    color='themes',
    title='Rating Distribution for Most Recurring Themes',
    labels={'letterboxd_rating': 'Letterboxd Rating', 'themes': 'Themes'},
    color_discrete_sequence=custom_colors,
)
fig = px.box(top_15_films, **box_options)

# The x-axis title is blanked and the legend hidden; the tick labels
# already name each theme.
layout_options = dict(
    xaxis_title='',
    yaxis_title='Rating',
    plot_bgcolor='#f7f7f7',
    paper_bgcolor='#f7f7f7',
    height=700,
    width=1300,
    xaxis_tickangle=30,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()

# image_path = '../visuals/rating_dist_top_15_themes.png'
# fig.write_image(image_path)

### Step by Step

Group Data

In [None]:
# Mean Letterboxd rating for each value of 'has_warnings'.
ratings_by_warning_flag = films.groupby('has_warnings')['letterboxd_rating']
avg_rating_by_warnings = ratings_by_warning_flag.mean().reset_index()
avg_rating_by_warnings

Calculate the Correlation

In [None]:
# Correlation coefficient between the trigger-warning flag and the rating,
# computed with Series.corr using its default method.
warning_flag = films['has_warnings']
film_rating = films['letterboxd_rating']
correlation = warning_flag.corr(film_rating)
print(f"Correlation between trigger warning and sentiment (rating): {correlation}")

#### Violin Plot

A violin plot can show the distribution and density of ratings for movies with and without trigger warnings.

In [None]:
# Colour assigned to each value of the warning flag.
warning_colors = {False: '#F4D6A0', True: '#A8CBB7'}

violin_options = dict(
    x='has_warnings',
    y='letterboxd_rating',
    title='Sentiment Distribution (Ratings) for Movies with and without Trigger Warnings',
    labels={'letterboxd_rating': 'Letterboxd Rating', 'has_warnings': 'Has Trigger Warning'},
    color='has_warnings',
    color_discrete_map=warning_colors,
    box=True,       # draw a mini box plot inside each violin
    points='all',   # overlay every individual data point
)
fig = px.violin(films, **violin_options)

layout_options = dict(
    xaxis_title='Has Trigger Warning (False = No, True = Yes)',
    yaxis_title='Letterboxd Rating (Sentiment)',
    plot_bgcolor='white',
    height=600,
    width=800,
)
fig.update_layout(**layout_options)

fig.show()


Histogram

In [None]:
# Colour assigned to each value of the warning flag.
warning_colors = {False: '#F4D6A0', True: '#A8CBB7'}

histogram_options = dict(
    x='letterboxd_rating',
    color='has_warnings',
    title='Rating Distribution for Movies with and without Trigger Warnings',
    labels={'letterboxd_rating': 'Letterboxd Rating', 'has_warnings': 'Has Trigger Warning'},
    barmode='overlay',  # draw the two histograms on top of each other
    nbins=20,           # number of bins requested from plotly
    color_discrete_map=warning_colors,
)
fig = px.histogram(films, **histogram_options)

layout_options = dict(
    xaxis_title='Letterboxd Rating',
    yaxis_title='Frequency',
    plot_bgcolor='white',
    height=600,
    width=800,
)
fig.update_layout(**layout_options)

fig.show()


Line Plot: Average Rating per Release Year, With vs. Without Trigger Warnings

In [None]:
# Mean rating for every (release_year, has_warnings) pair.
rating_by_year_and_flag = films.groupby(['release_year', 'has_warnings'])['letterboxd_rating']
avg_ratings_over_time = rating_by_year_and_flag.mean().reset_index()

# One line per value of the warning flag.
warning_colors = {False: '#F4D6A0', True: '#A8CBB7'}
line_options = dict(
    x='release_year',
    y='letterboxd_rating',
    color='has_warnings',
    title='Average Rating Trend Over Time (With vs Without Trigger Warnings)',
    labels={'release_year': 'Release Year', 'letterboxd_rating': 'Average Rating', 'has_warnings': 'Has Trigger Warning'},
    color_discrete_map=warning_colors,
)
fig = px.line(avg_ratings_over_time, **line_options)

layout_options = dict(
    xaxis_title='Release Year',
    yaxis_title='Average Letterboxd Rating',
    plot_bgcolor='white',
    height=600,
    width=800,
)
fig.update_layout(**layout_options)

fig.show()


In [None]:
# Columns removed from the TMDB table right after loading.
unused_tmdb_columns = [
    'tmdb_id',
    'imdb_id',
    'doesthedog_id',
    'original_title',
    'director',
    'tmdb_votes',
    'imdb_votes',
]

# Read the cleaned TMDB films table and drop those columns in one step.
tmdb = pd.read_csv('../data/clean/tmdb_clean_films.csv').drop(columns=unused_tmdb_columns)

tmdb.head()

In [None]:
# Sentiment polarity of each film overview, as computed by TextBlob.
# na_action='ignore' keeps a missing overview as NaN. The earlier
# `TextBlob(str(x))` on every row turned a missing value into the literal
# text "nan" and scored that instead (presumably a neutral 0.0 - TODO confirm),
# which fed spurious scores into the plot and the correlation below.
# str() is still applied to the remaining values so any non-string entry is
# scored as before.
tmdb['sentiment_score'] = tmdb['overview'].map(
    lambda overview: TextBlob(str(overview)).sentiment.polarity,
    na_action='ignore',
)

# Distribution of sentiment scores for films with and without warnings.
fig = px.box(tmdb,
             x='has_warnings',
             y='sentiment_score',
             color='has_warnings',
             title='Sentiment Scores for Films With and Without Warnings',
             labels={'has_warnings': 'Has Trigger Warnings', 'sentiment_score': 'Sentiment Score'},
             color_discrete_map={False: '#F4D6A0', True: '#A8CBB7'})

fig.update_layout(
    xaxis_title='Has Trigger Warnings',
    yaxis_title='Sentiment Score',
    plot_bgcolor='#f7f7f7',
    paper_bgcolor='#f7f7f7',
    height=700,
    width=900,
    showlegend=False
)
fig.show()

# Correlation matrix of the warning flag and the sentiment score.
# DataFrame.corr excludes missing values, so films without an overview
# do not contribute.
correlation = tmdb[['has_warnings', 'sentiment_score']].corr()
print('Correlation between has_warnings and sentiment_score:')
print(correlation)

In [27]:
# # Histogram for profit by warnings
# fig_hist = px.histogram(tmdb,
#                         x='profit',
#                         color='has_warnings',
#                         title='Profit Distribution by Warnings',
#                         labels={'has_warnings': 'Has Warnings', 'profit': 'Profit'},
#                         barmode='overlay',
#                         color_discrete_map={True: '#A8CBB7', False: '#F4D6A0'})

# fig_hist.update_layout(
#     xaxis_title='Profit',
#     yaxis_title='Count',
#     plot_bgcolor='#f7f7f7',
#     paper_bgcolor='#f7f7f7',
#     height=600,
#     width=900
# )

# fig_hist.show()


In [28]:
# # Purple-ish color palette (shades of purple)
# custom_colors = [
#     '#E56B6F', '#EAAC8B', '#355070'
# ]

# random.shuffle(custom_colors)

# fig = make_subplots(rows=2, cols=2, 
#                     subplot_titles=['Popularity vs Warnings', 
#                                     'Popularity vs Profit', 
#                                     'Warnings vs Profit'])

# fig.add_trace(go.Scatter(x=tmdb['popularity'], y=tmdb['has_warnings'], 
#                          mode='markers', name="Popularity vs Warnings", marker=dict(color=custom_colors[0])), row=1, col=1)
# fig.add_trace(go.Scatter(x=tmdb['popularity'], y=tmdb['profit'], 
#                          mode='markers', name="Popularity vs Profit", marker=dict(color=custom_colors[1])), row=1, col=2)
# fig.add_trace(go.Scatter(x=tmdb['has_warnings'], y=tmdb['profit'], 
#                          mode='markers', name="Warnings vs Profit", marker=dict(color=custom_colors[2])), row=2, col=1)

# fig.update_layout(
#     title='Scatter Plot Matrix',
#     height=800,
#     width=800,
#     showlegend=False,
#     plot_bgcolor='#f7f7f7',
#     paper_bgcolor='#f7f7f7'
# )
# fig.show()

# # # save the plot as image
# # fig.write_image('../visuals/scatter_plot_matrix.png')


Correlation Calculation (Popularity and Profit vs Warnings)

In [None]:
# Correlation of the warning flag with popularity and with profit
# (Series.corr with its default method).
warning_flag = tmdb['has_warnings']
popularity_corr = tmdb['popularity'].corr(warning_flag)
profit_corr = tmdb['profit'].corr(warning_flag)

print(f'Correlation between Has Warnings and Popularity: {popularity_corr}')
print(f'Correlation between Has Warnings and Profit: {profit_corr}')

### T-Test for deeper analysis

In [None]:
# Split the TMDB films by the warning flag (group key 1 = has warnings,
# 0 = no warnings).
has_warnings_group = tmdb.groupby('has_warnings')
films_with_warnings = has_warnings_group.get_group(1)
films_without_warnings = has_warnings_group.get_group(0)

popularity_with_warnings = films_with_warnings['popularity']
popularity_without_warnings = films_without_warnings['popularity']

profit_with_warnings = films_with_warnings['profit']
profit_without_warnings = films_without_warnings['profit']

# Welch's t-tests (equal_var=False: the two groups are not assumed to share
# a variance). nan_policy='omit' skips missing values; with the default
# 'propagate', a single NaN in either sample makes both the statistic and
# the p-value NaN. When the columns have no missing values this is a no-op.
popularity_ttest = ttest_ind(
    popularity_with_warnings,
    popularity_without_warnings,
    equal_var=False,
    nan_policy='omit',
)
profit_ttest = ttest_ind(
    profit_with_warnings,
    profit_without_warnings,
    equal_var=False,
    nan_policy='omit',
)

# Print the results
print('T-test for Popularity (Has Warnings vs No Warnings):')
print(f't-statistic: {popularity_ttest.statistic}')
print(f'p-value: {popularity_ttest.pvalue}')
print()

print('T-test for Profit (Has Warnings vs No Warnings):')
print(f't-statistic: {profit_ttest.statistic}')
print(f'p-value: {profit_ttest.pvalue}')

In [None]:
# tmdb['has_warnings'] = tmdb['has_warnings'].astype(bool)

# # Create boxplot for Popularity by 'has_warnings' (with/without warnings)
# box_popularity = go.Box(
#     x=tmdb['has_warnings'].map({True: 'Has Warnings', False: 'No Warnings'}),
#     y=tmdb['popularity'],
#     name='Popularity',
#     boxmean='sd',  # Show the mean with standard deviation
#     marker=dict(color='#7D4B9A')
# )

# # Create boxplot for Profit by 'has_warnings' (with/without warnings)
# box_profit = go.Box(
#     x=tmdb['has_warnings'].map({True: 'Has Warnings', False: 'No Warnings'}),
#     y=tmdb['profit'],
#     name='Profit',
#     boxmean='sd',  # Show the mean with standard deviation
#     marker=dict(color='#6A8EAE')
# )

# # Create the layout
# layout = go.Layout(
#     title='Distribution of Popularity and Profit by Warnings',
#     xaxis=dict(title='Has Warnings'),
#     yaxis=dict(title='Value'),
#     plot_bgcolor='#f7f7f7',
#     paper_bgcolor='#f7f7f7',
# )

# fig = go.Figure(data=[box_popularity, box_profit], layout=layout)
# fig.show()

# # Create violin plots for better distribution visualization
# violin_popularity = go.Violin(
#     x=tmdb['has_warnings'].map({True: 'Has Warnings', False: 'No Warnings'}),
#     y=tmdb['popularity'],
#     box_visible=True,
#     line_color='#7D4B9A',
#     name='Popularity'
# )

# violin_profit = go.Violin(
#     x=tmdb['has_warnings'].map({True: 'Has Warnings', False: 'No Warnings'}),
#     y=tmdb['profit'],
#     box_visible=True,
#     line_color='#6A8EAE',
#     name='Profit'
# )

# # Plot the violin plots
# fig = go.Figure(data=[violin_popularity, violin_profit], layout=layout)
# fig.show()

# # Scatter plot to check for outliers and distribution
# scatter_popularity = go.Scatter(
#     x=tmdb['popularity'],
#     y=tmdb['profit'],
#     mode='markers',
#     marker=dict(color=tmdb['has_warnings'].map({True: '#7D4B9A', False: '#6A8EAE'}), opacity=0.6)
# )

# # Plot scatter
# fig = go.Figure(data=[scatter_popularity], layout=layout)
# fig.update_layout(
#     title="Scatter Plot of Popularity vs Profit by Warnings"
# )
# fig.show()


In [None]:
# # Create a new column to represent movies with or without warnings
# tmdb['has_warnings'] = tmdb['has_warnings'].astype(bool)

# # Create histograms for Popularity by 'has_warnings' (with/without warnings)
# hist_popularity = go.Histogram(
#     x=tmdb[tmdb['has_warnings'] == True]['popularity'],
#     opacity=0.75,
#     name='Has Warnings',
#     marker=dict(color='#7D4B9A'),
#     bingroup=1,
#     histnorm='probability'
# )

# hist_popularity_no_warnings = go.Histogram(
#     x=tmdb[tmdb['has_warnings'] == False]['popularity'],
#     opacity=0.75,
#     name='No Warnings',
#     marker=dict(color='#6A8EAE'),
#     bingroup=1,
#     histnorm='probability'
# )

# # Create histograms for Profit by 'has_warnings' (with/without warnings)
# hist_profit = go.Histogram(
#     x=tmdb[tmdb['has_warnings'] == True]['profit'],
#     opacity=0.75,
#     name='Has Warnings',
#     marker=dict(color='#7D4B9A'),
#     bingroup=1,
#     histnorm='probability'
# )

# hist_profit_no_warnings = go.Histogram(
#     x=tmdb[tmdb['has_warnings'] == False]['profit'],
#     opacity=0.75,
#     name='No Warnings',
#     marker=dict(color='#6A8EAE'),
#     bingroup=1,
#     histnorm='probability'
# )

# # Create layout for histograms
# layout = go.Layout(
#     title='Histogram of Popularity and Profit by Warnings',
#     barmode='overlay',
#     xaxis=dict(title='Value'),
#     yaxis=dict(title='Probability'),
#     plot_bgcolor='#f7f7f7',
#     paper_bgcolor='#f7f7f7',
# )

# # Plot histograms
# fig = go.Figure(data=[hist_popularity, hist_popularity_no_warnings, hist_profit, hist_profit_no_warnings], layout=layout)
# fig.show()


### Popularity and Profit Over Time by Warnings

Palette

'#6E7B7A', '#99A7A4', '#C4D8C1', '#3E4A49', '#B8C6D0',
'#FFC857', '#FFE156', '#71A9F7', '#FF7F51', '#B5838D',
'#6B705C', '#DDBEA9', '#A5A58D', '#CCD5AE', '#E63946'

In [None]:
def _warning_trend(metric, axis_label, title):
    """Build the yearly-mean table and line figure for one TMDB metric.

    Parameters
    ----------
    metric : str
        Column of `tmdb` to average per (release_year, has_warnings) pair.
    axis_label : str
        Display label plotly uses for `metric`.
    title : str
        Figure title.

    Returns
    -------
    tuple
        (yearly_means, figure): the aggregated DataFrame and the plotly
        figure drawn from it, with the shared layout already applied.
    """
    yearly_means = (
        tmdb.groupby(['release_year', 'has_warnings'])[metric]
        .mean()
        .reset_index()
    )
    figure = px.line(
        yearly_means,
        x='release_year',
        y=metric,
        color='has_warnings',
        title=title,
        labels={metric: axis_label, 'release_year': 'Release Year'},
        color_discrete_map={True: '#CCD5AE', False: '#DDBEA9'},
    )
    # Same background, legend title and width for both figures.
    figure.update_layout(
        plot_bgcolor='#f7f7f7',
        paper_bgcolor='#f7f7f7',
        legend_title='Content Warning',
        width=900,
    )
    return yearly_means, figure


line_data_popularity, line_popularity = _warning_trend(
    'popularity', 'Popularity', 'Popularity Over Time by Warnings'
)
line_data_profit, line_profit = _warning_trend(
    'profit', 'Profit', 'Profit Over Time by Warnings'
)

# line_popularity.write_image('../visuals/line_popularity_by_warnings.svg')
# line_profit.write_image('../visuals/line_profit_by_warnings.svg')

line_popularity.show()
line_profit.show()

#### Calculate Correlation between Warnings and Popularity / Profit

In [None]:
# 2x2 correlation matrices pairing the warning flag with each metric.
warnings_and_popularity = tmdb[['has_warnings', 'popularity']]
warnings_and_profit = tmdb[['has_warnings', 'profit']]

correlation_popularity = warnings_and_popularity.corr()
correlation_profit = warnings_and_profit.corr()

print('Correlation between Has Warnings and Popularity:')
print(correlation_popularity)

print('Correlation between Has Warnings and Profit:')
print(correlation_profit)

#### Word Cloud of Events

In [None]:
# # Split the events into individual events and count their occurrences
# event_list = cleaned_films['events'].str.split(', ').explode().tolist()
# event_counts = Counter(event_list)

# # Create a new DataFrame from the event counts
# event_df = pd.DataFrame(event_counts.items(), columns=['Event', 'Count'])

# palette = [
#     '#6E7B7A', '#99A7A4', '#C4D8C1', '#3E4A49', '#B8C6D0',
#     '#FFC857', '#FFE156', '#71A9F7', '#FF7F51', '#B5838D',
#     '#6B705C', '#DDBEA9', '#A5A58D', '#CCD5AE', '#E63946',
#     '#A8DADC', '#457B9D', '#1D3557', '#F4A261', '#E76F51'
# ]

# # Define a color function using the palette
# def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
#     return palette[(hash(word) % len(palette))]

# # Generate a word cloud with better word spacing
# wordcloud = WordCloud(
#     width=800,
#     height=400,
#     background_color='#f7f7f7',  # Set the background color
#     color_func=color_func,
#     max_font_size=100,  # Increase max font size for more differentiation
#     collocations=False,  # Avoid combining collocations
#     prefer_horizontal=0.1,  # Allows more vertical word placement, reducing crowding
#     relative_scaling=0.5,  # Controls the scaling of words' size relative to their frequencies
#     contour_width=0  # Removes the contour for a cleaner look
# ).generate_from_frequencies(event_counts)

# # Plot the word cloud with a consistent background color
# plt.figure(figsize=(10, 5), facecolor='#f7f7f7')  # Match the figure background color
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.title('Word Cloud of Events', fontsize=16)
# plt.show()