In [118]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Assuming your DataFrame is named 'df' and the labels column is named 'labels'
df = pd.read_csv('Shortlisted_Influencer_Posts_With_Labels.csv')
df.head()

Unnamed: 0,post_id,image_ID,comments_count,likes_count,followers,author_name,caption,date,year,month,day,category,labels
0,kayaancontractor_100226_2868439159916464863_205_2,2022-06-25_13-43-36_UTC_1.jpg,2,205,0,kayaancontractor,“Got your nose!”🫶🏻 #caturday \n.\n#saturday #l...,2022-06-2,2022,6,2,Core Influencers,"Nose,Glasses,Skin,Lip,Vision care,Eyebrow,Mout..."
1,kayaancontractor_100226_2868439159916464863_205_2,2022-06-25_13-43-36_UTC_2.jpg,2,205,0,kayaancontractor,“Got your nose!”🫶🏻 #caturday \n.\n#saturday #l...,2022-06-2,2022,6,2,Core Influencers,"Nose,Glasses,Skin,Hairstyle,Vision care,Facial..."
2,kayaancontractor_100226_2868439159916464863_205_2,2022-06-25_13-43-36_UTC_3.jpg,2,205,0,kayaancontractor,“Got your nose!”🫶🏻 #caturday \n.\n#saturday #l...,2022-06-2,2022,6,2,Core Influencers,"Glasses,Vision care,Ear,Jaw,Carnivore,Felidae,..."
3,masoomminawala_1355818_3045266610416022503_836...,2023-02-24_13-08-10_UTC_1.jpg,881,83647,0,masoomminawala,🦁,2023-02-2,2023,2,2,Core Influencers,"Face,Fur clothing,Textile,Gesture,Happy,Flash ..."
4,masoomminawala_1355818_3045266610416022503_836...,2023-02-24_13-08-10_UTC_2.jpg,881,83647,0,masoomminawala,🦁,2023-02-2,2023,2,2,Core Influencers,"Font,Terrestrial plant,Symmetry,Darkness,Scien..."


In [119]:
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess

# Assuming your DataFrame is named 'df' and the post_id column is named 'post_id'
df = pd.read_csv('Shortlisted_Influencer_Posts_With_Labels.csv')

def preprocess_data(df):
    """
    Preprocesses the data by creating a bag-of-words representation.
    
    Args:
    df (pandas.DataFrame): DataFrame containing 'post_id' and 'labels' columns
    
    Returns:
    corpus (list): Bag-of-words representation of the data
    dictionary (gensim.corpora.Dictionary): Dictionary representation of the documents
    """
    post_labels = df.groupby('post_id')['labels'].apply(lambda x: ','.join(x)).reset_index()
    labels = [label.split(",") for label in post_labels['labels']]
    dictionary = Dictionary(labels)
    corpus = [dictionary.doc2bow(label) for label in labels]
    return corpus, dictionary, post_labels

def compute_coherence_scores(corpus, dictionary, start=2, end=10):
    """
    Computes coherence scores for different numbers of topics.
    
    Args:
    corpus (list): Bag-of-words representation of the data
    dictionary (gensim.corpora.Dictionary): Dictionary representation of the documents
    start (int): Starting number of topics
    end (int): Ending number of topics
    
    Returns:
    coherence_scores (list): List of tuples containing number of topics and corresponding coherence scores
    """
    coherence_scores = []
    for num_topics in range(start, end+1):
        lda_model = LdaModel(corpus=corpus,
                             id2word=dictionary,
                             num_topics=num_topics,
                             random_state=42,
                             passes=10,
                             per_word_topics=True)
        coherence_model_lda = CoherenceModel(model=lda_model, texts=labels, dictionary=dictionary, coherence='c_v')
        coherence_score = coherence_model_lda.get_coherence()
        coherence_scores.append((num_topics, coherence_score))
    return coherence_scores

def build_lda_model(corpus, dictionary, num_topics):
    """
    Builds the LDA model with the specified number of topics.
    
    Args:
    corpus (list): Bag-of-words representation of the data
    dictionary (gensim.corpora.Dictionary): Dictionary representation of the documents
    num_topics (int): Number of topics
    
    Returns:
    lda_model (gensim.models.LdaModel): LDA model
    """
    lda_model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         num_topics=num_topics,
                         random_state=42,
                         passes=10,
                         per_word_topics=True)
    return lda_model

def print_top_words(lda_model):
    """
    Prints the top words for each topic in the LDA model.
    
    Args:
    lda_model (gensim.models.LdaModel): LDA model
    """
    topics = lda_model.show_topics(num_topics=-1, num_words=25, formatted=False)
    for topic_id, words in topics:
        print(f"Topic {topic_id}:")
        top_words = [word for word, _ in words]
        print(top_words)


# Example usage
corpus, dictionary, posts = preprocess_data(df)
coherence_scores = compute_coherence_scores(corpus, dictionary)
optimal_num_topics = max(coherence_scores, key=lambda x: x[1])[0]
print(f"Optimal number of topics: {optimal_num_topics}")
final_lda_model = build_lda_model(corpus, dictionary, optimal_num_topics)
print_top_words(final_lda_model)




Optimal number of topics: 10
Topic 0:
['', 'Food', 'Tableware', 'Ingredient', 'Cuisine', 'Building', 'Travel', 'Dishware', 'Plate', 'Recipe', 'Table', 'Dish', 'Serveware', 'Drinkware', 'Smile', 'Road surface', 'City', 'Infrastructure', 'Retail', 'Plant', 'Natural foods', 'Cup', 'Window', 'Leisure', 'Cake']
Topic 1:
['Font', '', 'Electric blue', 'Event', 'Brand', 'Rectangle', 'Magenta', 'Advertising', 'Poster', 'Number', 'Logo', 'Circle', 'Graphics', 'Publication', 'Screenshot', 'Darkness', 'Terrestrial plant', 'Smile', 'Graphic design', 'Art', 'Product', 'Photo caption', 'Sleeve', 'Blazer', 'Organism']
Topic 2:
['', 'Smile', 'Happy', 'Fashion design', 'Flash photography', 'Skin', 'Gesture', 'Sleeve', 'Shoulder', 'Fun', 'Dress', 'Plant', 'Eyelash', 'Lip', 'Pink', 'Event', 'Hairstyle', 'Fashion', 'Neck', 'Entertainment', 'Eyewear', 'Hair', 'Face', 'Textile', 'Facial expression']
Topic 3:
['Water', '', 'Entertainment', 'Leg', 'Thigh', 'Plant', 'Purple', 'Performing arts', 'Leisure', 'Phot

## Interpretations

-------------------------------------

# Report on Instagram Influencer Topics Analysis

This report presents an analysis of topics derived from a collection of Instagram influencer posts. By applying topic modeling techniques, we identified nine distinct topics representing the predominant themes across the influencers' content. Each topic reflects the interests and content focus of influential Instagram users, offering valuable insights for businesses seeking to collaborate with relevant influencers.

## 1. Food and Travel Lifestyle:

- **Keywords:** Food, Cuisine, Recipe, Travel, Drinkware
- **Interpretation:** Influencers associated with this topic specialize in content related to culinary experiences and travel adventures. They often share posts showcasing gourmet dishes, culinary creations, and travel destinations. Collaborations with restaurants, hotels, and tourism boards are common in this category.

## 2. Brand Promotion and Advertising:

- **Keywords:** Brand, Advertising, Logo, Event, Graphic Design
- **Interpretation:** This topic emphasizes collaborations between influencers and brands for promotional activities and advertising campaigns. Influencers share branded content, participate in sponsored events, and engage in advertising partnerships, leveraging their reach to promote products and services.

## 3. Fashion and Entertainment:

- **Keywords:** Fashion, Dress, Hairstyle, Eyewear, Event
- **Interpretation:** Influencers in this category focus on fashion trends, styling tips, and attending social events. They often share content related to fashion design, outfit inspiration, and attending fashion shows. Collaborations with fashion brands for sponsored content and endorsements are prevalent.

## 4. Fitness and Lifestyle:

- **Keywords:** Fitness, Leisure, Outdoor, Wellness, Healthy
- **Interpretation:** This topic centers on content related to fitness, outdoor activities, and healthy lifestyle choices. Influencers share workout routines, wellness tips, and outdoor adventure experiences, often collaborating with fitness brands and wellness products for promotions.

## 5. Home Decor and Lifestyle:

- **Keywords:** Interior Design, Home Decor, Furniture, DIY, Leisure
- **Interpretation:** Influencers in this category specialize in home decor inspiration, DIY projects, and leisure activities at home. They share content related to interior design trends, home improvement tips, and lifestyle activities, collaborating with home decor brands and furniture retailers for partnerships.

## 6. Weddings and Events:

- **Keywords:** Wedding, Celebration, Photography, Event, Bride
- **Interpretation:** This topic revolves around wedding planning, bridal fashion, and event photography. Influencers share content related to wedding inspiration, event decor, and photography tips, often collaborating with wedding venues, bridal designers, and event planners for sponsored content.

## 7. Automotive and Lifestyle:

- **Keywords:** Car, Vehicle, Automotive, Travel, Leisure
- **Interpretation:** Influencers in this category focus on automotive design, vehicles, and lifestyle activities. They share content related to cars, automotive technology, and leisure activities like road trips, collaborating with automotive brands and travel companies for promotions.

## 8. Nature and Travel Lifestyle:

- **Keywords:** Nature, Travel, Adventure, Scenery, Outdoor
- **Interpretation:** This topic emphasizes outdoor adventures, nature photography, and travel experiences. Influencers share content showcasing natural scenery, travel destinations, and outdoor activities, collaborating with outdoor brands, travel agencies, and eco-tourism initiatives.

## 9. Fashion and Style:

- **Keywords:** Fashion, Style, Clothing, Accessories, Trend
- **Interpretation:** Influencers in this category specialize in fashion trends, styling tips, and outfit inspiration. They share content related to clothing, accessories, and fashion trends, collaborating with fashion brands, retailers, and accessory designers for sponsored content and endorsements.

## Conclusion:

In summary, the analysis of Instagram influencer topics provides valuable insights into the diverse interests and content focus of influential users on the platform. By understanding these topics, businesses can identify relevant collaborations and partnerships with influencers to reach their target audience effectively and engage in authentic brand promotion.


In [120]:
# # Assuming your DataFrame is named 'df' and the labels column is named 'labels'
# df = pd.read_csv('Shortlisted_Influencer_Posts_With_Labels.csv')

# # Import necessary libraries
# import pandas as pd
# from gensim.corpora import Dictionary
# from gensim.models import LdaModel
# from gensim.models import CoherenceModel
# from gensim.utils import simple_preprocess

# def preprocess_data(df_column):
#     """
#     Preprocesses the data by splitting comma-separated values and creating a bag-of-words representation.
    
#     Args:
#     df_column (pandas.Series): The column containing comma-separated values
    
#     Returns:
#     corpus (list): Bag-of-words representation of the data
#     dictionary (gensim.corpora.Dictionary): Dictionary representation of the documents
#     """
#     labels = [label.split(",") for label in df_column]
#     dictionary = Dictionary(labels)
#     corpus = [dictionary.doc2bow(label) for label in labels]
#     return corpus, dictionary

# def compute_coherence_scores(corpus, dictionary, start=2, end=10):
#     """
#     Computes coherence scores for different numbers of topics.
    
#     Args:
#     corpus (list): Bag-of-words representation of the data
#     dictionary (gensim.corpora.Dictionary): Dictionary representation of the documents
#     start (int): Starting number of topics
#     end (int): Ending number of topics
    
#     Returns:
#     coherence_scores (list): List of tuples containing number of topics and corresponding coherence scores
#     """
#     coherence_scores = []
#     for num_topics in range(start, end+1):
#         lda_model = LdaModel(corpus=corpus,
#                              id2word=dictionary,
#                              num_topics=num_topics,
#                              random_state=42,
#                              passes=10,
#                              per_word_topics=True)
#         coherence_model_lda = CoherenceModel(model=lda_model, texts=labels, dictionary=dictionary, coherence='c_v')
#         coherence_score = coherence_model_lda.get_coherence()
#         coherence_scores.append((num_topics, coherence_score))
#     return coherence_scores

# def build_lda_model(corpus, dictionary, num_topics):
#     """
#     Builds the LDA model with the specified number of topics.
    
#     Args:
#     corpus (list): Bag-of-words representation of the data
#     dictionary (gensim.corpora.Dictionary): Dictionary representation of the documents
#     num_topics (int): Number of topics
    
#     Returns:
#     lda_model (gensim.models.LdaModel): LDA model
#     """
#     lda_model = LdaModel(corpus=corpus,
#                          id2word=dictionary,
#                          num_topics=num_topics,
#                          random_state=42,
#                          passes=10,
#                          per_word_topics=True)
#     return lda_model

# def print_top_words(lda_model):
#     """
#     Prints the top words for each topic in the LDA model.
    
#     Args:
#     lda_model (gensim.models.LdaModel): LDA model
#     """
#     topics = lda_model.show_topics(num_topics=-1, num_words=25, formatted=False)
#     for topic_id, words in topics:
#         print(f"Topic {topic_id}:")
#         top_words = [word for word, _ in words]
#         print(top_words)

# # Example usage
# # Assuming your dataframe is named df and 'label' is the column containing comma-separated values
# df_column = df['labels']
# labels = [label.split(",") for label in df_column]
# corpus, dictionary = preprocess_data(df_column)
# coherence_scores = compute_coherence_scores(corpus, dictionary)
# optimal_num_topics = max(coherence_scores, key=lambda x: x[1])[0]
# print(f"Optimal number of topics: {optimal_num_topics}")
# final_lda_model = build_lda_model(corpus, dictionary, optimal_num_topics)
# print_top_words(final_lda_model)


In [121]:
posts

# add labels

Unnamed: 0,post_id,labels
0,aashnashroff_969148_3000403601659402518_25980_65,"Sky,Cloud,Leg,Flash photography,Happy,Travel,T..."
1,aashnashroff_969148_3001141559987517310_32578_110,"Clothing,Footwear,Plant,Christmas tree,Sky,Leg..."
2,aashnashroff_969148_3003120692652957440_11151_65,"Outerwear,Shoulder,Sky,Neck,Sleeve,Waist,Colla..."
3,aashnashroff_969148_3008835738892847055_18242_57,"Building,Window,Plant,Sky,Sleeve,Architecture,..."
4,aashnashroff_969148_3009102200601692335_30085_124,"White,Sky,Window,Sleeve,Overcoat,Travel,Waist,..."
...,...,...
265,yasminkarachiwala_1062568_2959765666402105645_...,"Forehead,Nose,Cheek,Smile,Skin,Lip,Chin,Eyebro..."
266,yasminkarachiwala_1062568_3053966065079253039_...,"Clothing,Cloud,Sky,Water,Daytime,Photograph,Sm..."
267,yasminkarachiwala_1062568_3059783891875022417_...,"Red,Smile,Microphone,Font,Thigh,Advertising,Ev..."
268,yasminkarachiwala_1062568_3062539524738785993_...,"Nose,Skin,Head,Lip,Chin,Hand,Eyebrow,Eyelash,M..."


In [122]:
import pandas as pd

def get_topics_for_documents(lda_model, corpus):
    """
    Returns the topics assigned to each document in the corpus.
    
    Args:
    lda_model (gensim.models.LdaModel): LDA model
    corpus (list): Bag-of-words representation of the data
    
    Returns:
    topics (list): List of dictionaries containing topics and their probabilities for each document
    """
    topics = []
    for i, topic_prob in enumerate(lda_model.get_document_topics(corpus)):
        doc_topics = {}
        for topic_id, prob in topic_prob:
            doc_topics[f"Topic_{topic_id}"] = prob
        topics.append(doc_topics)
    return topics

# After creating the final LDA model, call this function to get topics assigned to each document
document_topics = get_topics_for_documents(final_lda_model, corpus)


# Add the topics to the DataFrame
topic_columns = [f"Topic_{i}" for i in range(optimal_num_topics)]
df_topics = pd.DataFrame(document_topics, columns=topic_columns)

# Concatenate the original DataFrame with the DataFrame containing topics
posts_with_topics = pd.concat([posts, df_topics], axis=1)

# Now df_with_topics contains the original data along with the assigned topics and their probabilities
posts_with_topics

Unnamed: 0,post_id,labels,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9
0,aashnashroff_969148_3000403601659402518_25980_65,"Sky,Cloud,Leg,Flash photography,Happy,Travel,T...",,,0.154100,,,,,,0.835643,
1,aashnashroff_969148_3001141559987517310_32578_110,"Clothing,Footwear,Plant,Christmas tree,Sky,Leg...",,,,,,,,,0.979997,
2,aashnashroff_969148_3003120692652957440_11151_65,"Outerwear,Shoulder,Sky,Neck,Sleeve,Waist,Colla...",,,,,,,,,0.533387,0.448833
3,aashnashroff_969148_3008835738892847055_18242_57,"Building,Window,Plant,Sky,Sleeve,Architecture,...",0.155444,,0.160417,,,,,,0.677831,
4,aashnashroff_969148_3009102200601692335_30085_124,"White,Sky,Window,Sleeve,Overcoat,Travel,Waist,...",0.073728,,,,,,,,0.598547,0.312167
...,...,...,...,...,...,...,...,...,...,...,...,...
265,yasminkarachiwala_1062568_2959765666402105645_...,"Forehead,Nose,Cheek,Smile,Skin,Lip,Chin,Eyebro...",,0.215051,0.498244,,,,,,0.264827,
266,yasminkarachiwala_1062568_3053966065079253039_...,"Clothing,Cloud,Sky,Water,Daytime,Photograph,Sm...",,,,,,,,,0.983926,
267,yasminkarachiwala_1062568_3059783891875022417_...,"Red,Smile,Microphone,Font,Thigh,Advertising,Ev...",,0.746138,,,,,,,,0.187186
268,yasminkarachiwala_1062568_3062539524738785993_...,"Nose,Skin,Head,Lip,Chin,Hand,Eyebrow,Eyelash,M...",,,0.924996,,,,,,,



### Topic 0: Bridal Fashion and Beauty
- **Keywords**: Wedding dress, Bridal clothing, Gown, Fashion design, Beauty
- **Interpretation**: This topic focuses on influencers specializing in bridal fashion and beauty content. They curate content related to weddings, bridal attire, and beauty products, positioning themselves as experts in the bridal industry on Instagram.

### Topic 1: Travel and Adventure Lifestyle
- **Keywords**: Leisure, Travel, Fun, Sky, Smile
- **Interpretation**: This topic represents influencers who focus on promoting travel destinations, experiences, and adventurous lifestyles. They create content showcasing their travel adventures and lifestyle, attracting engagement from followers interested in travel and exploration.

### Topic 2: Event Hosting and Brand Collaboration
- **Keywords**: Event, Fashion design, Advertising, Product, Graphics
- **Interpretation**: This topic encompasses influencers who specialize in event hosting and brand collaborations. They curate content featuring sponsored events, brand partnerships, and promotional campaigns, leveraging their Instagram presence as a platform for business collaborations.

### Topic 3: Fashion Consulting and Styling
- **Keywords**: Eyewear, Fashion design, Formal wear, Sunglasses, Suit
- **Interpretation**: This topic focuses on influencers who provide fashion consulting and styling services. They offer fashion advice, showcase styling tips, and feature various fashion accessories, positioning themselves as fashion experts and consultants on Instagram.

### Topic 4: Lifestyle Branding and Personal Style
- **Keywords**: Flash photography, Sky, People in nature, Dress, Hairstyle
- **Interpretation**: This topic represents influencers who focus on lifestyle branding and showcasing their personal style. They curate content featuring outdoor lifestyle scenes, fashionable outfits, and unique hairstyles, building their personal brand identity and engaging with followers interested in lifestyle content.

### Topic 5: Fashion Product Promotion and Lookbooks
- **Keywords**: Fashion design, Dress, Shoe, Gown, Black hair
- **Interpretation**: This topic encompasses influencers who specialize in promoting fashion products and creating lookbooks. They curate content featuring clothing, shoes, and accessories, driving product sales and attracting brand partnerships through their Instagram presence.

These topics ensure that each aspect of the influencer business on Instagram is represented distinctly and comprehensively, without overlapping or missing any key components.

In [123]:
# add topic columns to df using post_id as key
df = pd.merge(df, posts_with_topics, on='post_id', how='left')
# add path
df['path'] = df.apply(lambda x: f"Data/{x['post_id']}/{x['image_ID']}", axis=1)
df

Unnamed: 0,post_id,image_ID,comments_count,likes_count,followers,author_name,caption,date,year,month,...,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,path
0,kayaancontractor_100226_2868439159916464863_205_2,2022-06-25_13-43-36_UTC_1.jpg,2,205,0,kayaancontractor,“Got your nose!”🫶🏻 #caturday \n.\n#saturday #l...,2022-06-2,2022,6,...,,0.973527,,,,,,,,Data/kayaancontractor_100226_28684391599164648...
1,kayaancontractor_100226_2868439159916464863_205_2,2022-06-25_13-43-36_UTC_2.jpg,2,205,0,kayaancontractor,“Got your nose!”🫶🏻 #caturday \n.\n#saturday #l...,2022-06-2,2022,6,...,,0.973527,,,,,,,,Data/kayaancontractor_100226_28684391599164648...
2,kayaancontractor_100226_2868439159916464863_205_2,2022-06-25_13-43-36_UTC_3.jpg,2,205,0,kayaancontractor,“Got your nose!”🫶🏻 #caturday \n.\n#saturday #l...,2022-06-2,2022,6,...,,0.973527,,,,,,,,Data/kayaancontractor_100226_28684391599164648...
3,masoomminawala_1355818_3045266610416022503_836...,2023-02-24_13-08-10_UTC_1.jpg,881,83647,0,masoomminawala,🦁,2023-02-2,2023,2,...,0.269014,0.696193,,,,,,,,Data/masoomminawala_1355818_304526661041602250...
4,masoomminawala_1355818_3045266610416022503_836...,2023-02-24_13-08-10_UTC_2.jpg,881,83647,0,masoomminawala,🦁,2023-02-2,2023,2,...,0.269014,0.696193,,,,,,,,Data/masoomminawala_1355818_304526661041602250...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1140,kayaancontractor_100226_3018217689202415643_65...,2023-01-18_05-26-48_UTC_3.jpg,25,652,0,kayaancontractor,Under the Gliricidia Bloom🌸 \n.\nPhotography: ...,2023-01-1,2023,1,...,,,,,,,,0.983926,,Data/kayaancontractor_100226_30182176892024156...
1141,kayaancontractor_100226_3018217689202415643_65...,2023-01-18_05-26-48_UTC_4.jpg,25,652,0,kayaancontractor,Under the Gliricidia Bloom🌸 \n.\nPhotography: ...,2023-01-1,2023,1,...,,,,,,,,0.983926,,Data/kayaancontractor_100226_30182176892024156...
1142,kayaancontractor_100226_3018217689202415643_65...,2023-01-18_05-26-48_UTC_5.jpg,25,652,0,kayaancontractor,Under the Gliricidia Bloom🌸 \n.\nPhotography: ...,2023-01-1,2023,1,...,,,,,,,,0.983926,,Data/kayaancontractor_100226_30182176892024156...
1143,yasminkarachiwala_1062568_2813329952305644743_...,2022-04-10_12-51-26_UTC_1.jpg,116,3897,0,yasminkarachiwala,"Make it simple, but significant💥\n\nMakeup- @m...",2022-04-1,2022,4,...,,,,,,,,,0.960867,Data/yasminkarachiwala_1062568_281332995230564...


## Topic 0 Post Examples

In [149]:
# Sort df by 'Topic_0' column in descending order
df.sort_values(by='Topic_1', ascending=False, inplace=True)

# Get the post_id of the first row
post_id = df.iloc[0]['post_id']

# Filter df to rows where 'post_id' matches the first post_id
filtered_df = df[df['post_id'] == post_id]

# Now you can work with filtered_df

import os
from IPython.display import display, HTML

html_output = "<div>"
for i in range(filtered_df.shape[0]):  # Corrected range usage
    # Check if the image path exists
    if os.path.exists(filtered_df.iloc[i]['path']):  # Corrected usage of iloc
        # Add image tag to the HTML output
        html_output += f'<img src="{filtered_df.iloc[i]["path"]}" width="200">'  # Corrected usage of iloc and quotes
    else:
        print(f"Image not found: {filtered_df.iloc[i]['path']}")
html_output += "</div>"

# Display the HTML output
display(HTML(html_output))

(10, 25)


In [None]:
from IPython.display import HTML

def sort_by_topic(df, topic_i):
    """
    Sorts the DataFrame by the specified topic and returns the top 5 images for that topic.
    
    Args:
    df (pandas.DataFrame): DataFrame containing the data
    topic_i (int): Index of the topic to sort by
    
    Returns:
    top_images (list): List of paths to the top 5 images for the specified topic
    """
    sorted_df = df.sort_values(by=f"Topic_{topic_i}", ascending=False)
    top_images = []
    html_output = "<div>"
    for i in range(5):
        post_id = sorted_df.iloc[i]['post_id']
        image_id = sorted_df.iloc[i]['image_ID']
        image_path = f"Data/{post_id}/{image_id}"
        top_images.append(image_path)
        # Check if the image path exists
        if os.path.exists(image_path):
            # Add image tag to the HTML output
            html_output += f'<img src="{image_path}" width="200">'
        else:
            print(f"Image not found: {image_path}")
    html_output += "</div>"
    # Display the HTML output
    display(HTML(html_output))
    return top_images

# Example usage
# Assuming df_with_topics is the DataFrame containing the data with topics
top_images_topic_0 = sort_by_topic(df, 1)
print(top_images_topic_0)


# Topic Modeling By Post

In [89]:
# Assuming your DataFrame is named 'df' and the labels column is named 'labels'
df = pd.read_csv('Shortlisted_Influencer_Posts_With_Labels.csv')

# # concat labels seperated by comma for each post
# df['labels'] = df['labels'].apply(lambda x: ','.join(x.split()))

# pivot table by post_id with number of image_ID
bam = df.pivot_table(index='post_id', values='image_ID', aggfunc='count')
bam['image_ID'].value_counts()


image_ID
1    270
Name: count, dtype: int64