# 1.0 Import Dependencies

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

import datetime as dt

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# 2.0 Load Data

In [157]:
#Load all data from bacth 1 untul 5
data_batch_1 = pd.read_csv(r'data_batch_1.csv')
data_batch_2 = pd.read_csv(r'data_batch_2.csv')
data_batch_3 = pd.read_csv(r'data_batch_3.csv')
data_batch_4 = pd.read_csv(r'data_batch_4.csv')
data_batch_5 = pd.read_csv(r'data_batch_5.csv')
data_batch_6 = pd.read_csv(r'data_batch_6.csv')

#general information data
data_general = pd.read_csv(r'influencer_names_final.csv')

In [158]:
#concat all data
data_detail = pd.concat([data_batch_1,
           data_batch_2,
           data_batch_3,
           data_batch_4,
           data_batch_5,
           data_batch_6],
          ignore_index=True)

In [159]:
#join with general data
df = pd.merge(data_detail, data_general, how='left')

In [160]:
#top 5 our data
df.head()

Unnamed: 0,link,likes,comment_counts,dates,captions,type_posts,username,following,post,followers
0,https://www.instagram.com/p/CCXR2t6BN_6/,154552,0,1594174009,"•\nWho Said We Are Old...No We Are Blessed,\n....",GraphImage,princessyahrini,649,4769,33500000
1,https://www.instagram.com/p/CCNHaLhBndL/,52771,9,1593833275,“ How We Miss Tokyo So Much “\n\n_____________...,GraphVideo,princessyahrini,649,4769,33500000
2,https://www.instagram.com/p/CCFU-mFhKJi/,97386,0,1593571666,"•\nOh...Hi July,\nGood Morning !\n\n__________...",GraphImage,princessyahrini,649,4769,33500000
3,https://www.instagram.com/p/CCDUy0rhY4U/,43953,5,1593505188,"From Our Honeymoon,\nSummer 2019 !\n\n________...",GraphVideo,princessyahrini,649,4769,33500000
4,https://www.instagram.com/p/CB4WqVRBAOP/,145632,0,1593136341,•\nSelamat pagi !\n\n__________________ 𝓢𝓨𝓡___...,GraphImage,princessyahrini,649,4769,33500000


In [161]:
#Initially i have 65.998 rows and 10 features
df.shape

(65998, 10)

# 3.0 Data Preprocessing Stage

## 3.1 Convert Feature Dates
- because the dates format is in epochs, so we must convert it to general datetime format
- the new format is year-month-day-hour when the post is being posted by influencers

In [162]:
df['dates'] = df['dates'].apply(lambda x: dt.datetime.fromtimestamp(x).strftime('%Y-%m-%d-%H'))

In [163]:
print(df['dates'].min())
print(df['dates'].max())

2014-12-01-23
2020-07-12-20


## 3.2 Cleansing Features Captions

In [164]:
df['captions'] = df['captions'].replace(r'[\n]', '', regex=True)

In [165]:
df.isnull().sum()

link                0
likes               0
comment_counts      0
dates               0
captions          303
type_posts          0
username            0
following           0
post                0
followers           0
dtype: int64

In [166]:
#fill missing value in features captions
df['captions'] = df['captions'].fillna('No Captions')

# 4.0 Feature Engineering Stage 1
- this is my own definition related to engagement.
    - likes_engagement = number of likes each post / number of followers
    - comments_engagement = number of comments each post / number of followers
    - total_engagement = likes_engagement + comments_engagement
    - our objective is to predict total engagemnt

## 4.1 Feature dates

In [167]:
#separte for future use
df['year'] = df['dates'].apply(lambda x: x[:4])
df['month'] = df['dates'].apply(lambda x: x[5:7])
df['day'] = df['dates'].apply(lambda x: x[8:10])
df['hour'] = df['dates'].apply(lambda x: x[11:])

#create year_month features
df['year_month'] = df['year'] + df['month']

In [168]:
#because the count of post before 2018 is small, we only filter post that >= 2018
#and our objective is to predict avg engagment rate in July 2020, so we must exclude July 2020 too.
df_2 = df[df['year']>='2018'].reset_index(drop=True)
df_3 = df_2[df_2['year_month']!='202007'].reset_index(drop=True)

## 4.2 Feature Engagement

In [169]:
df_3['likes_engagement'] = df_3['likes'] / df_3['followers'] * 100
df_3['comments_engagement'] = df_3['comment_counts'] / df_3['followers'] * 100

df_3['total_engagement'] = df_3['likes_engagement'] + df_3['comments_engagement']

## 4.3 Feature Captions

In [170]:
#make lower case
df_3['captions'] = df_3['captions'].str.lower()

#delete whitespace
df_3['captions'] = df_3['captions'].str.strip()

In [171]:
#feature length of captions
df_3['len_capt'] = df_3['captions'].apply(lambda x: len(x))

#create feature length of captions without punctuation
df_3['len_capt_wo_punct'] = df_3['captions'].str.replace(r'[^\w\s]', '', regex=True).apply(lambda x: len(x))

#feature length of punctuation only
df_3['len_capt_punct'] = df_3['captions'].str.replace(r'[\w\s]', '', regex=True).apply(lambda x: len(x))

In [175]:
#feature number of words in each captions
df_3['n_words'] = df_3['captions'].apply(lambda x: len(str(x).split(" ")))

#average char per words
df_3['avg_char_words'] = df_3['n_words'] / df_3['len_capt']

#number of numeric char in captions
df_3['n_numeric'] = df_3['captions'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

In [176]:
df_3.head()

Unnamed: 0,link,likes,comment_counts,dates,captions,type_posts,username,following,post,followers,year,month,day,hour,year_month,likes_engagement,comments_engagement,total_engagement,len_capt,len_capt_wo_punct,len_capt_punct,n_words,avg_char_words,n_numeric
0,https://www.instagram.com/p/CCDUy0rhY4U/,43953,5,2020-06-30-15,"from our honeymoon,summer 2019 !______________...",GraphVideo,princessyahrini,649,4769,33500000,2020,6,30,15,202006,0.131203,1.5e-05,0.131218,81,78,3,6,0.074074,1
1,https://www.instagram.com/p/CB4WqVRBAOP/,145632,0,2020-06-26-08,•selamat pagi !__________________ 𝓢𝓨𝓡_________...,GraphImage,princessyahrini,649,4769,33500000,2020,6,26,8,202006,0.434722,0.0,0.434722,72,69,3,4,0.055556,0
2,https://www.instagram.com/p/CB0IFkKh0Py/,79662,65,2020-06-24-17,“ i have nothing “_________________ 𝓢𝓨𝓡_______...,GraphVideo,princessyahrini,649,4769,33500000,2020,6,24,17,202006,0.237797,0.000194,0.237991,71,68,3,6,0.084507,0
3,https://www.instagram.com/p/CBzudkKBc5v/,76461,0,2020-06-24-13,•a day well spent with my beautiful soul siste...,GraphImage,princessyahrini,649,4769,33500000,2020,6,24,13,202006,0.228242,0.0,0.228242,230,219,11,23,0.1,0
4,https://www.instagram.com/p/CBvH-ZaB5c5/,141927,0,2020-06-22-18,"•alhamdulillah,amin allahuma amin ... amin ya ...",GraphVideo,princessyahrini,649,4769,33500000,2020,6,22,18,202006,0.423663,0.0,0.423663,115,108,7,9,0.078261,0


In [177]:
#from our stage 1 preprocessing and feature engineering, we can create 14 new features.
df_3.shape

(52675, 24)

In [178]:
df_3.to_csv('data_preprocessing_1.csv', index=False)