# Analyze comments per post

In [6]:
import pandas as pd
from facebook_scraper import get_posts

## Get data from Facebook

In [5]:
#Function to retrieve data from timeline
def data_from_timeline(user_name, start_date):
    '''
    Capture post information from a specific account, from the start date up to
    the most recent available post.

    Posts are consumed in the order ``get_posts`` yields them, and scanning stops
    at the first post dated before ``start_date``.
    NOTE(review): the early stop assumes posts arrive newest-first; an older post
    surfaced at the top of the timeline (e.g. a pinned one) would end the scan
    early - confirm against the scraper's behaviour.

    Returns dataframe containing timestamp, post ID, post URL, post text, comment
    count, like count and share count (one row per post). When no post qualifies,
    an empty dataframe with those same columns is returned.

    Parameters:
     - user_name: the ID of the account without @ (str)
     - start_date: starting date 'yyyy-mm-dd' (str); posts made on this date are included
    '''

    #Output column name -> key of the post dict it is read from
    column_to_key = {'timestamp_post': 'time',
                     'post_id': 'post_id',
                     'post_url': 'post_url',
                     'post_text': 'text',
                     'comments': 'comments',
                     'likes': 'likes',
                     'shares': 'shares'
                     }

    #Parse the start date once instead of on every iteration
    first_day = pd.to_datetime(start_date).date()

    #Loop through posts and capture information
    records = []
    for post in get_posts(account=user_name, page_limit=None):

        posted_at = post.get('time')
        if posted_at is None:
            #A post without a timestamp cannot be compared with the start date:
            #skip it rather than crash or stop the scan
            continue

        if posted_at.date() < first_day:
            break

        #Capture needed elements
        records.append({column: post[key] for column, key in column_to_key.items()})

    #Create df from records; passing the columns keeps the schema stable when empty
    return pd.DataFrame(records, columns=list(column_to_key))

In [58]:
# Fetch the posts of the 'ministeriebz' account going back to 2020-07-01
df = data_from_timeline(user_name='ministeriebz', start_date='2020-07-01')

In [59]:
# Preview the first rows of the scraped posts
df.head()

Unnamed: 0,timestamp_post,post_id,post_url,post_text,comments,likes,shares
0,2020-10-25 14:31:23,2619278935049430,https://facebook.com/story.php?story_fbid=2619...,Je kunt weer solliciteren voor de opleiding In...,0,16,0
1,2020-10-24 08:59:45,2617907008519956,https://facebook.com/story.php?story_fbid=2617...,🇺🇳 Vanuit het Vredespaleis vieren we vandaag 7...,1,10,0
2,2020-10-23 17:26:19,2617272898583367,https://facebook.com/watch?v=1298247357191438,Jongeren wereldwijd worden hard getroffen door...,1,15,0
3,2020-10-23 11:45:23,2617020261941964,https://facebook.com/story.php?story_fbid=2617...,Vanaf vandaag kun je weer solliciteren voor de...,0,5,3
4,2020-10-23 09:02:19,2616928531951137,https://facebook.com/story.php?story_fbid=2616...,Tijdens een grote crisis werken een ambassade ...,0,16,0


In [60]:
# Preview the last rows to check how far back the scrape reached
df.tail()

Unnamed: 0,timestamp_post,post_id,post_url,post_text,comments,likes,shares
93,2020-07-07 19:32:00,2523738244603500,https://facebook.com/story.php?story_fbid=2523...,🇪🇸 Denk jij aan een vakantie in Spanje? Ga dan...,2,18,15
94,2020-07-07 13:51:00,2523528241291167,https://facebook.com/story.php?story_fbid=2523...,🍅 Hoe kweek je tomaten midden in de woestijn?\...,0,19,0
95,2020-07-05 13:43:06,2521840398126618,https://facebook.com/story.php?story_fbid=2521...,🔎 Buitenlandse Zaken zoekt senior HR-beleidsad...,3,11,0
96,2020-07-03 10:30:03,2520058454971479,https://facebook.com/story.php?story_fbid=2520...,Wil je naar Italië deze zomer? Neem dan deze t...,20,27,11
97,2020-07-02 18:24:36,2519535138357144,https://facebook.com/story.php?story_fbid=2519...,Tijdens de coronacrisis zat Anita Dam achter h...,0,51,12


In [61]:
# Check column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   timestamp_post  98 non-null     datetime64[ns]
 1   post_id         98 non-null     object        
 2   post_url        97 non-null     object        
 3   post_text       98 non-null     object        
 4   comments        98 non-null     int64         
 5   likes           98 non-null     int64         
 6   shares          98 non-null     int64         
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 5.5+ KB


## Pre-process data

In [7]:
#Filter posts based on specific text
def search_posts(dataf, search_text, case=True, regex=True):
    """
    Keep only the rows whose post text contains the given search text.

    Rows with a missing post text never match; they are dropped instead of
    making the boolean filter fail on NaN values.

    Returns a dataframe with the matching rows of ``dataf`` (original index kept).

    Parameters: 
     - dataf: dataframe to filter; must have a 'post_text' column
     - search_text: text to search for in post text (treated as a regular
       expression unless regex=False)
     - case: match case-sensitively (bool, default True)
     - regex: interpret search_text as a regular expression (bool, default True);
       pass False to search for literal text containing characters such as '.' or '?'
    """

    #na=False turns "no text" into "no match" so the mask is strictly boolean
    matches = dataf['post_text'].str.contains(search_text, case=case, na=False, regex=regex)
    return dataf[matches]

In [66]:
# Keep only the posts whose text mentions 'Duitsland'
clean_df = search_posts(df, 'Duitsland')

In [67]:
# Show the posts that matched the search
clean_df

Unnamed: 0,timestamp_post,post_id,post_url,post_text,comments,likes,shares
25,2020-10-07 20:49:27,2602898566687467,https://facebook.com/story.php?story_fbid=2602...,Reizigers naar Duitsland moeten een negatieve ...,25,19,0
29,2020-10-03 14:46:08,2599302417047082,https://facebook.com/story.php?story_fbid=2599...,"🇩🇪 Woon je in Nederland, maar werk je in Duits...",80,33,0
30,2020-10-02 20:52:49,2598682447109079,https://facebook.com/story.php?story_fbid=2598...,🇩🇪 Heb je plannen om naar Duitsland te gaan? L...,635,57,0
88,2020-07-13 16:28:11,2528553890788602,https://facebook.com/story.php?story_fbid=2528...,🇩🇪 Zin in vakantie bij onze oosterburen? Ga wi...,0,20,7


## Analyze

In [68]:
#Sum the comment counts over the filtered posts (clean_df), not the full timeline
clean_df['comments'].sum()

740