# Code for Retrieving All Posts

In this section, we will retrieve all posts, including shared posts. The response will be in JSON format, and we will extract the important elements from the response and create a DataFrame with them.

## Importing Required Libraries

Before we begin, let's import the necessary libraries that we'll be using:


In [None]:
import ast
import csv
import datetime
import io
import json
import os
import re
import sys
import time

import pandas as pd
import requests

# Remove pandas' display limits (rows, columns, column width) by setting each to None.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

## Loading the Access Token (will be used for all requests)


In [None]:
# Bearer token sent in the Authorization header of every request below.
# NOTE(review): "X" is a placeholder — supply a real token before running and
# avoid committing it to version control.
access_token = "X"

## Setting up the headers/parameters/api url to get all the posts

Note several things:
- `headers_posts`:
    - Don't change

- `posts_params`:
    - `author`: the URN of the organization whose posts we want to retrieve (e.g. `urn:li:organization:X`)
    - `count`: how many results (posts) do we want returned
    - `q`: don't change
- `posts_url`:
    - This is the endpoint to retrieve data on posts (not reactions just posts themselves)
    

In [None]:

# Headers for the posts request; the bearer token comes from `access_token`.
# NOTE(review): 'LinkedIn-Version' pins the API version — confirm '202301' is
# still accepted by the API before relying on it.
headers_posts = {
    'Authorization': f'Bearer {access_token}',
    'X-Restli-Protocol-Version': '2.0.0',
    'LinkedIn-Version': '202301',
    'X-RestLi-Method': 'FINDER',
}

# Query parameters: find posts by author, limited to `count` results.
posts_params = {
    'q': 'author',
    'author': 'urn:li:organization:X',
    'count': '100',
}

# Endpoint for the posts themselves (not their reactions).
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
posts_url = 'https://api.linkedin.com/rest/posts'

## Let's define the post data request function

In [None]:
# CSV file that get_data() writes the fetched posts to and the cells below read back.
filename = 'list_posts.csv'

def append_data_to_csv(filename, data):
    """Merge ``data`` into the CSV at ``filename``, keeping one row per ``id``.

    Args:
        filename: Path of the CSV file to create or extend.
        data: Anything ``pd.DataFrame`` accepts (here: a list of dicts).

    When the file already exists its rows are placed first, so on a duplicate
    ``id`` the previously stored row wins. The file is always rewritten with
    every field quoted.
    """
    fresh_rows = pd.DataFrame(data)

    if not os.path.exists(filename):
        merged = fresh_rows
    else:
        stored_rows = pd.read_csv(filename, quotechar='"')
        merged = pd.concat([stored_rows, fresh_rows]).drop_duplicates(
            subset='id', keep='first'
        )

    merged.to_csv(filename, index=False, encoding='utf-8', quoting=csv.QUOTE_ALL)

def get_data(max_attempts=5, timeout=30):
    """Fetch the posts from the API and merge them into the CSV cache.

    Sends a GET request to the module-level ``posts_url`` using
    ``headers_posts`` and ``posts_params``. Attempts that raise a
    ``requests.exceptions.RequestException`` are retried with exponential
    backoff; a response with a status code other than 200 is reported and
    not retried. On success the selected fields of every post are appended
    to the CSV named by the module-level ``filename``.

    Args:
        max_attempts: Maximum number of request attempts before giving up.
        timeout: Seconds to wait for the server before an attempt counts as
            failed (passed to ``requests.get``).

    Returns:
        None. Outcomes are reported with ``print``.
    """
    last_error = None
    for attempt in range(max_attempts):
        try:
            response_posts = requests.get(
                posts_url,
                headers=headers_posts,
                params=posts_params,
                timeout=timeout,
            )
            # A non-200 answer is reported and ends the function without retrying.
            if response_posts.status_code != 200:
                print(f"Request unsuccessful. Status code: {response_posts.status_code}")
                return
            response_posts_json = response_posts.json()
        except requests.exceptions.RequestException as exc:
            last_error = exc
            # Back off 2^attempt seconds, but do not sleep after the last attempt.
            if attempt < max_attempts - 1:
                time.sleep(2 ** attempt)
        else:
            print("Request successful!")
            break
    else:
        # Every attempt raised, so there is no response object to inspect here;
        # report the last exception instead.
        print(f"Request failed after {max_attempts} attempts: {last_error}")
        return

    # Extra protection in case the request succeeds but the response format has changed.
    if 'elements' not in response_posts_json:
        print("No 'elements' key in the response. Check the API or the status of the request.")
        return
    posts = response_posts_json['elements']

    fields_to_extract = ['createdAt', 'author', 'id', 'visibility']
    content_fields_to_extract = ['description', 'thumbnail', 'source', 'title']

    data = []
    for post in posts:
        post_data = {field: post.get(field) for field in fields_to_extract}
        content = post.get('content', {})
        if content:
            # `or {}` also covers an 'article' key that is present but null.
            article = content.get('article') or {}
            post_data.update(
                {f'content_{field}': article.get(field, None) for field in content_fields_to_extract}
            )
        data.append(post_data)

    append_data_to_csv(filename, data)

# Fetch from the API when there is no cached CSV or it was modified more than
# 60 seconds ago; otherwise load the cached CSV into posts_df.
if not os.path.exists(filename):
    get_data()
else:
    file_time = os.path.getmtime(filename)
    file_datetime = datetime.datetime.fromtimestamp(file_time)
    cache_age = datetime.datetime.now() - file_datetime

    if cache_age.total_seconds() <= 60:  # Cache is at most 1 minute old
        posts_df = pd.read_csv(filename, quotechar='"')
    else:
        get_data()


In [None]:
# Variant 1: freshness measured in seconds/minutes.
# get_data() runs when the file is missing or was modified more than 60 seconds ago.
if not os.path.exists(filename):
    get_data()
else:
    file_time = os.path.getmtime(filename)
    file_datetime = datetime.datetime.fromtimestamp(file_time)
    cache_age = datetime.datetime.now() - file_datetime

    if cache_age.total_seconds() <= 60:  # Cache is at most 1 minute old
        posts_df = pd.read_csv(filename)
    else:
        get_data()

# Variant 2: freshness measured in whole days.
# get_data() runs when the file is missing or was modified more than 3 days ago;
# change the 3 below to update every x days.
if not os.path.exists(filename):
    get_data()
else:
    file_time = os.path.getmtime(filename)
    file_datetime = datetime.datetime.fromtimestamp(file_time)
    cache_age = datetime.datetime.now() - file_datetime

    if cache_age.days <= 3:
        posts_df = pd.read_csv(filename)
    else:
        get_data()

## We now have a list of Post Ids that we will use to retrieve metadata per post ID.

It's important to understand that the API responds with all the posts that appear on the wall, which includes shares and internal content. We have designed a function to separate and save internal, external and full content.

In [None]:
# Combined post list that process_data() reads.
filename="list_posts.csv"

def append_new_data(filename, df_new):
    """Merge the rows of ``df_new`` into the CSV at ``filename``, one row per ``id``.

    Args:
        filename: Path of the CSV file to create or extend.
        df_new: DataFrame holding the rows to add; must have an ``id`` column
            when the file already exists.

    Rows already stored in the file come first, so on a duplicate ``id`` the
    stored row is kept and the new one is discarded.
    """
    if not os.path.exists(filename):
        # Nothing stored yet: the new rows become the file as-is.
        df_new.to_csv(filename, index=False)
        return

    stored_rows = pd.read_csv(filename)
    merged = pd.concat([stored_rows, df_new]).drop_duplicates(subset='id', keep='first')
    merged.to_csv(filename, index=False)

def process_data():
    """Split the combined post list into share and ugcPost CSV files.

    Reads the CSV named by the module-level ``filename``, selects the rows
    whose ``id`` starts with ``urn:li:share:`` or ``urn:li:ugcPost:``
    (followed by at least one character) and merges each selection into
    ``shares.csv`` and ``UGCpost.csv`` through ``append_new_data``. Rows with
    a missing ``id`` end up in neither file.
    """
    posts_internal_external = pd.read_csv(filename)

    share_pattern = r'urn:li:share:.+'
    ugcPost_pattern = r'urn:li:ugcPost:.+'

    # Cast to str first: a missing id would otherwise put NaN into the masks
    # (which boolean indexing rejects), and an all-missing column would not
    # support the .str accessor at all.
    post_ids = posts_internal_external['id'].astype(str)
    mask_share = post_ids.str.match(share_pattern)
    mask_ugcPost = post_ids.str.match(ugcPost_pattern)

    df_share_new = posts_internal_external[mask_share]
    df_ugcPost_new = posts_internal_external[mask_ugcPost]

    append_new_data('shares.csv', df_share_new)
    append_new_data('UGCpost.csv', df_ugcPost_new)
    
# Run the split now so shares.csv and UGCpost.csv exist for the cells below.
process_data()

## Let's load the required CSV that holds the IDs we need metadata for.

Note several things:
- `If we want only organic post data`:
    - load the df with csv file name "UGCpost.csv"
- `If we want only shared post data`:
    - load the df with csv file name "shares.csv"
- `If we want everything together`:
    - load the df with csv file name "list_posts.csv"

Only 1 of the following 3 cells should be active (run) at a time; the other 2 must be frozen for the script to work correctly, because each cell overwrites `posts_df` and the last one executed wins.

### Only UGC posts (internal posts)

In [None]:
# Option 1: work only with the rows that process_data() saved as ugcPost ids.
posts_df = pd.read_csv('UGCpost.csv')

### Only share posts (shared posts)

In [None]:
# Option 2: work only with the rows that process_data() saved as share ids.
posts_df = pd.read_csv('shares.csv')

### All post (shares and ugc posts)

In [None]:
# Option 3: work with the full post list (shares and ugcPosts together).
posts_df = pd.read_csv('list_posts.csv')

## We now convert the timestamps we receive from the API request to a format that is readable by us.

In [None]:
# Interpret createdAt as a number of milliseconds since the epoch and convert it to datetimes.
posts_df['createdAt'] = pd.to_datetime(posts_df['createdAt'], unit='ms')

In [None]:
# Unique, non-null post ids as a plain list; these drive the metadata requests below.
list_post_id = posts_df['id'].dropna().unique().tolist()

## Setting up the headers, parameters, and API URL to get all the post metadata

Note several things:
- `metadata_headers`:
    - Don't change

- `metadata_url`:
    - This is the endpoint to retrieve the social metadata (reaction and comment summaries) of a single post. The post ID is appended to it inside a for loop, so one request is sent for each post in the list.
- `LinkedIn-Version`:
    - This is the API version. This might change with new endpoints needing a newer version, in any case we must include a valid version in the header.


In [None]:
# Headers shared by every social-metadata request; the token comes from `access_token`.
metadata_headers = {
    'Authorization': f'Bearer {access_token}',
    'Connection': 'Keep-Alive',
    'Content-Type': 'application/json',
    'LinkedIn-Version': '202301',
}

## Now we have the code to iterate through the post_id list and record the social reactions in a dataframe

We have also added functionality that saves this dataframe to a CSV file and, on future runs, loads this file and only sends requests for the posts we don't have yet. This is a temporary measure because we're not allowed unlimited requests by the API. If we move to production, we can delete the reading of the saved CSV and request new data every day.


In [None]:
filename = 'post_with_metadata_response.csv'

# Reuse the responses saved by earlier runs so only the missing posts are requested.
if os.path.exists(filename):
    df_responses = pd.read_csv(filename)
else:
    df_responses = pd.DataFrame(columns=['id', 'response'])

fetched_ids = df_responses['id'].tolist()
# A set makes the "already fetched?" check O(1) per post instead of a list scan.
fetched_id_set = set(fetched_ids)
new_post_ids = [post_id for post_id in list_post_id if post_id not in fetched_id_set]

if not new_post_ids:  # Check if the list is empty
    print("All the posts have metadata, there's no need to request any additional information.")
else:
    counter = 0
    for post_id in new_post_ids:
        metadata_url = f'https://api.linkedin.com/rest/socialMetadata/{post_id}'
        try:
            metadata_response = requests.get(metadata_url, headers=metadata_headers, timeout=30)
        except requests.exceptions.RequestException as exc:
            # Stop requesting, but still reach the save below so the responses
            # collected so far in this run are not lost.
            print(f"Request for post ID {post_id} failed: {exc}")
            break

        if metadata_response.status_code != 200:
            print(f"Failed to retrieve metadata for post ID: {post_id}, you've requested {counter} posts today and you probably reached maximum API quota for today.")
            break

        response_data = metadata_response.json()
        df_responses.loc[len(df_responses)] = {'id': post_id, 'response': response_data}
        counter += 1

# Always persist, including the partial progress made before a failure.
df_responses.to_csv(filename, index=False)

## We are going to adjust the newly created dataframe to have the data correctly structured

Our code from before will just update the csv with 2 columns, id and response. In this code, we take that csv and structure it correctly then overwrite the csv.

In [None]:
# Reload the saved id/response pairs; after this the response column holds text, not dicts.
df_responses = pd.read_csv('post_with_metadata_response.csv')

In [None]:
def _parse_stored_response(raw):
    """Turn one stored ``response`` cell back into a Python object.

    Values that are not strings are returned unchanged. Strings are parsed as
    JSON first; if that fails they are parsed as a Python literal, which is
    the form ``to_csv`` produces for a dict (single quotes, True/False/None).
    """
    if not isinstance(raw, str):
        return raw
    try:
        return json.loads(raw)
    except ValueError:
        # Safer and more faithful than swapping quote characters: handles
        # apostrophes inside values as well as True/False/None.
        return ast.literal_eval(raw)


# Parsing cell by cell also works on an empty frame, where indexing row 0 would fail.
df_responses['response'] = df_responses['response'].apply(_parse_stored_response)

# Flatten the nested responses into columns and put the post id back next to them.
df_responses_normalized = pd.json_normalize(df_responses['response'])
df_responses_with_post_id = pd.concat([df_responses['id'], df_responses_normalized], axis=1)

In [None]:
# Columns that are not needed in the final table.
columns_to_drop = [
    'entity',
    'reactionSummaries.PRAISE.reactionType',
    'reactionSummaries.APPRECIATION.reactionType',
    'reactionSummaries.LIKE.reactionType',
    'reactionSummaries.INTEREST.reactionType',
    'reactionSummaries.EMPATHY.reactionType',
    'reactionSummaries.ENTERTAINMENT.reactionType',
    'reactionSummaries.MAYBE.reactionType',
]

# errors='ignore' skips any listed column that is absent from the dataframe.
df_responses_with_post_id.drop(columns=columns_to_drop, errors='ignore', inplace=True)

In [None]:
# Flattened API column name -> readable column name.
column_rename_mapping = {
    'reactionSummaries.PRAISE.count': 'Praise Reaction Count',
    'reactionSummaries.APPRECIATION.count': 'Appreciation Reaction Count',
    'reactionSummaries.LIKE.count': 'Like Reaction Count',
    'reactionSummaries.INTEREST.count': 'Interest Reaction Count',
    'reactionSummaries.EMPATHY.count': 'Empathy Reaction Count',
    'reactionSummaries.ENTERTAINMENT.count': 'Entertainment Reaction Count',
    'commentSummary.count': 'Comment Count',
    'commentSummary.topLevelCount': 'Top Level Comment Count',
    'reactionSummaries.MAYBE.count': 'Maybe Reaction Count',
}

# rename() leaves alone any mapping key that is not an existing column.
df_responses_with_post_id.rename(columns=column_rename_mapping, inplace=True)

## We will format the data further and then merge dataframes to have all the data in one csv file

- **First**
  - We merge post metadata with post data

- **Second**
  - Check if there's a difference between the loaded metadata csv and the one created by merging the two previous dataframes

- **Third**
  - If a change is detected, we overwrite with the newest information, if the dfs are the same, we do nothing.
 


In [None]:
# Merge the post data with the per-post social metadata; the outer join keeps
# ids that appear in only one of the two dataframes.
metadata_per_post = pd.merge(posts_df, df_responses_with_post_id, on='id', how='outer')

# Specify file path
file_path = "Raw Data CSVs//metadata_per_post.csv" 

# Make sure the target folder exists, otherwise the write below fails.
output_dir = os.path.dirname(file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Check if the file already exists
if os.path.isfile(file_path):
    # Load the existing data
    existing_df = pd.read_csv(file_path)

    # Compare like with like: send the new data through the same CSV round trip
    # the stored data went through. Comparing the in-memory frame directly
    # reports a difference whenever dtypes changed on load (e.g. the datetime
    # column comes back as text).
    candidate_df = pd.read_csv(io.StringIO(metadata_per_post.to_csv(index=False)))

    if not candidate_df.equals(existing_df):
        # If differences are found, update (overwrite) the file
        metadata_per_post.to_csv(file_path, index=False)
        print("CSV file updated.")
    else:
        print("No differences found. CSV file not updated.")
else:
    # If the file doesn't exist, create it
    metadata_per_post.to_csv(file_path, index=False)
    print("CSV file created.")