In [1]:
import pandas as pd
import numpy as np
from wordhoard import Synonyms
from collections import Counter
import string
import re

In [2]:
class Social:
      

    def __init__(self,path):
         '''
         initialize instance of platform
         '''
         assert(isinstance(path,str)), "invalid file path "
         self.read_to_dataframe(path)
         self.preprocess_frame()


    def get_categories(self):
        '''
        Returns the categories in a dataframe
        '''
        return self.categories

    def preprocess_frame(self):
        '''
        clean up dataframe
        '''
        self.metrics=[]
        #remove nan
        self.df.dropna(0, inplace=True)

        #get attributes in csv
        headings= self.df.head(0)
        headings=(list(headings.columns))
        data=list(self.df.iloc[1,:])


        #convert numeric suffixes to numeric
        for i,dat in enumerate(data):
            if(isinstance(dat,str) and dat[0].isnumeric()):
                if(dat.lower().endswith('m') or dat.lower().endswith('b') or dat.lower().endswith('k') ):
                    self.df[headings[i]]=self.df[headings[i]].apply(self.value_to_float)
                    self.metrics.append(headings[i])
        
        #drop duplicate columns
        self.df.drop_duplicates(subset=[headings[1],headings[2]],inplace=True)

        self.categories=[heading.strip() for heading in headings]
        self.name_head=headings[2]

       
    def get_influencer_fromdf(self,df):
        '''
        pass a dataframe and extract influencer names
        '''
        return list(df[self.name_head])


    def read_to_dataframe(self,path):
        '''
        Reads a csv into a dataframe.
        param: 
        path (type: string) : file name 
        '''
        self.df=pd.read_csv(path)

    def get_category_items(self, category):
        '''
        gets items pertaining to a category
        param: 
        path (type: string) : file name 
        
        output:
        df (type: pd.dataframe) : output data frame column
        '''

        assert(category in self.categories),"inavlid category"
        assert(isinstance(self.df,pd.DataFrame)),"inavlid dataframe"

        #remove nans
        self.df[category] = self.df[category].replace(np.nan, 'other')

        #get entries in category
        subcategories=list(self.df[category])

        #get subcategories
        subcategories=Counter(subcategories)

        return list(subcategories.keys())
    
    # def process_subcategories(self, subcategories):
    
    #     assert(isinstance(subcategories,list)),"inavlid sub-category list"
    #     synonyms_dict = {}
    #     for word in subcategories:
    #         synonym = Synonyms(word)
    #         synonyms_results = synonym.find_synonyms()
    #         if synonyms_results is not None:
    #             synonyms_dict[word] = synonyms_results
    #         else:
    #             synonyms_dict[word] = 'no synonyms found'


    #     return synonyms_dict
    
    
    def get_subcategory_items(self, df, category, subcategory):
        '''
        gets items pertaining to a sub-category 
        param: 
        category (type: string) : main category
        subcategory (type: string) : subcategory under the specific category
        
        output:
        items (type: pd.dataframe) : output data frame column filtered by subcategory
        '''

        return df[df[category].str.contains(subcategory)] 
    
    def find_topn_influencers(self, dataframe,N):
        '''
        returns dataframe pertaining to top N influencers
        param: 
        dataframe (type: pd.DataFrame) : dataframe
        N (type: int) : number of influencer data needed
        
        output:
        dictionary of dataframe (type: pd.DataFrame): sorted top N influencer data
        '''
        top={}
        #return top N influencers based on each metric 
        for metric in self.metrics:
            df=dataframe.sort_values(by=[metric],ascending=False)
            top[metric]=df.head(N)
            
        return top


    #helper functions
    def value_to_float(self,x):
        if type(x) == float or type(x) == int:
            return x
        if 'K' in x:
            if len(x) > 1:
                return float(x.replace('K', '')) * 1000
            return 1000.0
        if 'M' in x:
            if len(x) > 1:
                return float(x.replace('M', '')) * 1000000
            return 1000000.0
        if 'B' in x:
            return float(x.replace('B', '')) * 1000000000
        return 0.0

In [3]:
instagram= Social("../data/Instagram/social media influencers - instagram sep-2022.csv")
print(instagram.categories)
instagram.df.head(5)

['S.no', 'Instagram name', 'Name', 'Subscribers', 'Audience country', 'Authentic engagement', 'Engagement average', 'Category_1', 'Category_2']


  self.df.dropna(0, inplace=True)


Unnamed: 0,S.no,Instagram name,Name,Subscribers,Audience country,Authentic engagement\n,Engagement average\r\n,Category_1,Category_2
1,2,kyliejenner,Kylie 🤍,368100000.0,United States,3500000.0,5500000.0,Fashion,Modeling
3,4,leomessi,Leo Messi,358600000.0,Indonesia,2700000.0,3500000.0,Sports with a ball,Family
4,5,zendaya,Zendaya,151100000.0,United States,4300000.0,5800000.0,Cinema & Actors/actresses,Fashion
10,11,kimkardashian,Kim Kardashian,329800000.0,United States,1200000.0,1600000.0,Fashion,Beauty
20,21,kendalljenner,Kendall,255400000.0,United States,852000.0,1300000.0,Modeling,Fashion


In [4]:
#visualize different categories of content
subcategories=instagram.get_category_items("Category_1")
print(subcategories)
#print(instagram.process_subcategories(subcategories))
subcategories=instagram.get_category_items("Audience country")
print(subcategories)

['Fashion', 'Sports with a ball', 'Cinema & Actors/actresses', 'Modeling', 'Finance & Economics', 'Clothing & Outfits', 'Lifestyle', 'Science', 'Beauty', 'Shows', 'Family', 'Fitness & Gym', 'Humor & Fun & Happiness', 'Nature & landscapes', 'Literature & Journalism', 'Computers & Gadgets', 'Photography', 'Education', 'Luxury', 'Winter sports', 'Business & Careers', 'Racing Sports', 'Cars & Motorbikes']
['United States', 'Indonesia', 'India', 'Brazil', 'Mexico', 'Argentina', 'Russia', 'Colombia', 'Turkey', 'Iran', 'Philippines', 'Italy', 'Egypt', 'South Korea', 'United Kingdom', 'Japan', 'Spain', 'Pakistan', 'Thailand', 'Romania', 'Germany', 'France']


In [5]:
#get dataframes of a particular subcategory
instagram.get_subcategory_items( instagram.df,'Category_1','Fashion').head(3)

Unnamed: 0,S.no,Instagram name,Name,Subscribers,Audience country,Authentic engagement\n,Engagement average\r\n,Category_1,Category_2
1,2,kyliejenner,Kylie 🤍,368100000.0,United States,3500000.0,5500000.0,Fashion,Modeling
10,11,kimkardashian,Kim Kardashian,329800000.0,United States,1200000.0,1600000.0,Fashion,Beauty
647,648,taylor_hill,Taylor Hill,20500000.0,United States,121000.0,180500.0,Fashion,Modeling


In [6]:
#get dataframes of a particular subcategory
instagram.get_subcategory_items( instagram.df,'Audience country','India').head(3)

Unnamed: 0,S.no,Instagram name,Name,Subscribers,Audience country,Authentic engagement\n,Engagement average\r\n,Category_1,Category_2
37,38,therock,Dwayne Johnson,335900000.0,India,375200.0,477500.0,Cinema & Actors/actresses,Fitness & Gym
39,40,narendramodi,Narendra Modi,69300000.0,India,1800000.0,2000000.0,Finance & Economics,Business & Careers
54,55,georginagio,Georgina Rodríguez,39100000.0,India,1500000.0,2100000.0,Lifestyle,Fashion


In [7]:
subframe=instagram.get_subcategory_items( instagram.df,'Category_1','Fashion')
dfs=instagram.find_topn_influencers(subframe,5)

In [8]:
print("top influencers based on "+instagram.metrics[0]+":" )
dfs[instagram.metrics[0]]

top influencers based on Subscribers:


Unnamed: 0,S.no,Instagram name,Name,Subscribers,Audience country,Authentic engagement\n,Engagement average\r\n,Category_1,Category_2
1,2,kyliejenner,Kylie 🤍,368100000.0,United States,3500000.0,5500000.0,Fashion,Modeling
10,11,kimkardashian,Kim Kardashian,329800000.0,United States,1200000.0,1600000.0,Fashion,Beauty
738,739,sonamkapoor,Sonam Kapoor Ahuja,33500000.0,India,80600.0,95300.0,Fashion,Modeling
647,648,taylor_hill,Taylor Hill,20500000.0,United States,121000.0,180500.0,Fashion,Modeling
742,743,troyesivan,troye sivan,13400000.0,United States,164100.0,235700.0,Fashion,Modeling


In [9]:
print("top influencers based on "+instagram.metrics[1]+":" )
dfs[instagram.metrics[1]]

top influencers based on Authentic engagement
:


Unnamed: 0,S.no,Instagram name,Name,Subscribers,Audience country,Authentic engagement\n,Engagement average\r\n,Category_1,Category_2
1,2,kyliejenner,Kylie 🤍,368100000.0,United States,3500000.0,5500000.0,Fashion,Modeling
10,11,kimkardashian,Kim Kardashian,329800000.0,United States,1200000.0,1600000.0,Fashion,Beauty
693,694,amadorat,N I C O L E A M A D O 🐉,8300000.0,Mexico,378800.0,423300.0,Fashion,Lifestyle
979,980,tejasswiprakash,Tejasswi Prakash,6300000.0,India,304400.0,376700.0,Fashion,Beauty
802,803,chaelincl,CL,10300000.0,Brazil,206300.0,272900.0,Fashion,Modeling


In [10]:
#Nested filtering based on audience country followed by product category 
subframe=instagram.get_subcategory_items( instagram.df,'Audience country','India')
subframe=instagram.get_subcategory_items( subframe,'Category_1','Fashion')
dfs=instagram.find_topn_influencers(subframe,5)

In [11]:
print("top influencers based on "+instagram.metrics[0]+":" )
print("List of top influencers:",instagram.get_influencer_fromdf(dfs[instagram.metrics[0]]))
dfs[instagram.metrics[0]]

top influencers based on Subscribers:
List of top influencers: ['Sonam Kapoor Ahuja', 'Tejasswi Prakash']


Unnamed: 0,S.no,Instagram name,Name,Subscribers,Audience country,Authentic engagement\n,Engagement average\r\n,Category_1,Category_2
738,739,sonamkapoor,Sonam Kapoor Ahuja,33500000.0,India,80600.0,95300.0,Fashion,Modeling
979,980,tejasswiprakash,Tejasswi Prakash,6300000.0,India,304400.0,376700.0,Fashion,Beauty
