# **Importing Libraries and Loading the Dataset**

In [1]:
# Importing necessary libraries for data analysis and visualization
import pandas as pd                      # For data manipulation and analysis
import matplotlib.pyplot as plt          # For data visualization
import numpy as np                       # For numerical operations
import seaborn as sns                    # For enhanced data visualization
from scipy.stats import zscore           # For calculating Z-scores
import statsmodels.api as sm             # For regression modeling
from statsmodels.stats.outliers_influence import variance_inflation_factor  # For calculating VIF
from statsmodels.tools.tools import add_constant  # To add intercept term for regression
import re                                # For regular expression operations

In [2]:
# Load the Instagram dataset from an Excel file
data=pd.read_excel('instagram_data.xlsx')

In [3]:
# Display the first few rows of the dataset to understand its structure
data.head()

Unnamed: 0,USERNAME,FOLLOWERS,FOLLOWING,LIKES,COMMENTS,TEXT,DATE,"TYPE(1 PHOTO,2 VIDEO)",USERS IN PHOTO,LINK,list_of_tags,number_of_tags,list_of_mentions,number_of_mentions
0,georgiou82,1134619,915,18560,95,Every day is a #newday and every experience is...,2017-02-11 11:05:10,1,1,https://www.instagram.com/p/BQXZpSQjiPj,#newday #newera,2,,0
1,georgiou82,1134619,915,17710,86,#goodnight !!! The #weekend is ahead!!! Have a...,2017-02-11 00:33:10,1,1,https://www.instagram.com/p/BQWRUV6j8b-,#goodnight #weekend #goodone,3,,0
2,georgiou82,1134619,915,11959,79,#tb #2007 #stous31dromous #megatv #greece #new...,2017-02-09 15:51:47,1,3,https://www.instagram.com/p/BQSw2wMj0Bw,#tb #2007 #stous31dromous #megatv #greece #new...,7,,0
3,georgiou82,1134619,915,13608,37,Cold weather brings people together.... #filmi...,2017-02-08 18:21:57,1,3,https://www.instagram.com/p/BQQdPrLDeRz,#filming #mprousko4,2,@stamosts @kinolis2,2
4,georgiou82,1134619,915,15842,62,The youngest hearts might be the oldest souls....,2017-02-08 13:03:14,1,0,https://www.instagram.com/p/BQP4xY7jAXb,#lifelessons #andreasgeorgiou #tb #peru,4,,0


In [4]:
# Display the last few rows of the dataset to understand its structure
data.tail()

Unnamed: 0,USERNAME,FOLLOWERS,FOLLOWING,LIKES,COMMENTS,TEXT,DATE,"TYPE(1 PHOTO,2 VIDEO)",USERS IN PHOTO,LINK,list_of_tags,number_of_tags,list_of_mentions,number_of_mentions
19676,kourtideo,17993,7501,615,0,Earlier today ???❤️ \| #ktima #nisis #top,2017-04-22 23:28:29,1,-,https://www.instagram.com/p/BTM3fOADPxz/,#ktima #nisis #top,3,,0
19677,kourtideo,17993,7501,505,0,☕️?? \| #Sunday #relax #espresso,2017-04-23 20:36:10,1,-,https://www.instagram.com/p/BTPIkBQDbnt/,#Sunday #relax #espresso,3,,0
19678,kourtideo,17993,7501,829,0,That's what is about ??☀️?? \| #nammos #mykono...,2017-04-24 18:42:39,1,-,https://www.instagram.com/p/BTRgXe2DA4z/,#nammos #mykonos #summer #loading #Kourtour,5,,0
19679,kourtideo,17993,7501,660,1,Relaxing after work ☕️☀️? \| #coffee #sourtouk...,2017-04-25 17:01:32,1,-,https://www.instagram.com/p/BTT5lykj_UD/,#coffee #sourtouki #glyfada #ark,4,,0
19680,kourtideo,17993,7501,598,0,?☀️?? \| #lake #ippokrateiospoliteia,2017-04-26 19:05:23,1,-,https://www.instagram.com/p/BTWsjzXj0M4/,#lake #ippokrateiospoliteia,2,,0


In [5]:
# Display the basic information about the dataset, including data types and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19681 entries, 0 to 19680
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   USERNAME               19681 non-null  object        
 1   FOLLOWERS              19681 non-null  int64         
 2   FOLLOWING              19681 non-null  int64         
 3   LIKES                  19681 non-null  int64         
 4   COMMENTS               19681 non-null  int64         
 5   TEXT                   19675 non-null  object        
 6   DATE                   19681 non-null  datetime64[ns]
 7   TYPE(1 PHOTO,2 VIDEO)  19681 non-null  int64         
 8   USERS IN PHOTO         19681 non-null  object        
 9   LINK                   19681 non-null  object        
 10  list_of_tags           13862 non-null  object        
 11  number_of_tags         19681 non-null  int64         
 12  list_of_mentions       6746 non-null   object        
 13  n

# **Data Cleaning and Preprocessing**

In [6]:
# Calculate the total number of records in the dataset
print(f"Total records: {len(data)}")

Total records: 19681


In [7]:
# Define numeric and categorical features for analysis
numeric_features = [
    'FOLLOWERS', 'FOLLOWING', 'LIKES', 'COMMENTS', 
    'number_of_tags', 'number_of_mentions'
]

In [8]:
# Calculate descriptive statistics
numeric_stats = data[numeric_features].describe()
numeric_stats

Unnamed: 0,FOLLOWERS,FOLLOWING,LIKES,COMMENTS,number_of_tags,number_of_mentions
count,19681.0,19681.0,19681.0,19681.0,19681.0,19681.0
mean,62564.13,1489.766831,2497.766983,39.825111,6.737005,0.723591
std,104234.9,2252.675356,5574.988136,447.972795,8.782144,1.704316
min,17993.0,0.0,0.0,0.0,0.0,0.0
25%,23299.0,174.0,420.0,1.0,0.0,0.0
50%,36699.0,506.0,1073.0,5.0,3.0,0.0
75%,62791.0,1367.0,2683.0,17.0,10.0,1.0
max,1134619.0,7586.0,158338.0,26011.0,41.0,34.0


In [9]:
# Define the list of categorical variables
categorical_columns = ['USERNAME', 'TEXT', 'DATE', 'USERS IN PHOTO', 'LINK', 'list_of_tags', 'list_of_mentions']

# Check unique values for categorical features
unique_counts_dict = data[categorical_columns].nunique()
unique_counts_dataframe = unique_counts_dict.reset_index()
unique_counts_dataframe.columns = ['Category', 'Unique_Values']

# Sort the dataframe by the number of unique values
sorted_unique_counts = unique_counts_dataframe.sort_values(by='Unique_Values', ascending=True)
sorted_unique_counts

Unnamed: 0,Category,Unique_Values
3,USERS IN PHOTO,22
0,USERNAME,1094
6,list_of_mentions,4357
5,list_of_tags,9683
1,TEXT,17390
2,DATE,19538
4,LINK,19681


In [10]:
# Handle missing values by dropping rows or columns
print("Missing values:\n", data.isnull().sum())

Missing values:
 USERNAME                     0
FOLLOWERS                    0
FOLLOWING                    0
LIKES                        0
COMMENTS                     0
TEXT                         6
DATE                         0
TYPE(1 PHOTO,2 VIDEO)        0
USERS IN PHOTO               0
LINK                         0
list_of_tags              5819
number_of_tags               0
list_of_mentions         12935
number_of_mentions           0
dtype: int64


In [11]:
# Check for duplicate records
duplicates = data.duplicated().sum()
print(f"Number of duplicate records: {duplicates}")

Number of duplicate records: 0


In [12]:
# Drop missing values and irrelevant columns
data = data.dropna(subset=['TEXT'])

In [13]:
data = data.dropna(axis=1)

In [14]:
# Check for missing values
print("Missing values:\n", data.isnull().sum())

Missing values:
 USERNAME                 0
FOLLOWERS                0
FOLLOWING                0
LIKES                    0
COMMENTS                 0
TEXT                     0
DATE                     0
TYPE(1 PHOTO,2 VIDEO)    0
USERS IN PHOTO           0
LINK                     0
number_of_tags           0
number_of_mentions       0
dtype: int64


In [15]:
# Add new features: Month, Day of the Week, and Post Timing
data['Month'] = data['DATE'].dt.month
data['Day_of_Week'] = data['DATE'].dt.day_name()

# Categorize post timing into Morning, Afternoon, Evening, and Night
def categorize_post_timing(hour):
    if 8 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 16:
        return 'Afternoon'
    elif 16 <= hour < 20:
        return 'Evening'
    elif 20 <= hour <= 23 or 0 <= hour < 8:
        return 'Night'

data['Post_Timing'] = data['DATE'].dt.hour.apply(categorize_post_timing)

In [16]:
# Calculate the length of the text in each post
data['Text_Length'] = data['TEXT'].apply(lambda x: len(x.strip()))

In [17]:
data.head()

Unnamed: 0,USERNAME,FOLLOWERS,FOLLOWING,LIKES,COMMENTS,TEXT,DATE,"TYPE(1 PHOTO,2 VIDEO)",USERS IN PHOTO,LINK,number_of_tags,number_of_mentions,Month,Day_of_Week,Post_Timing,Text_Length
0,georgiou82,1134619,915,18560,95,Every day is a #newday and every experience is...,2017-02-11 11:05:10,1,1,https://www.instagram.com/p/BQXZpSQjiPj,2,0,2,Saturday,Morning,56
1,georgiou82,1134619,915,17710,86,#goodnight !!! The #weekend is ahead!!! Have a...,2017-02-11 00:33:10,1,1,https://www.instagram.com/p/BQWRUV6j8b-,3,0,2,Saturday,Night,64
2,georgiou82,1134619,915,11959,79,#tb #2007 #stous31dromous #megatv #greece #new...,2017-02-09 15:51:47,1,3,https://www.instagram.com/p/BQSw2wMj0Bw,7,0,2,Thursday,Afternoon,60
3,georgiou82,1134619,915,13608,37,Cold weather brings people together.... #filmi...,2017-02-08 18:21:57,1,3,https://www.instagram.com/p/BQQdPrLDeRz,2,2,2,Wednesday,Evening,103
4,georgiou82,1134619,915,15842,62,The youngest hearts might be the oldest souls....,2017-02-08 13:03:14,1,0,https://www.instagram.com/p/BQP4xY7jAXb,4,0,2,Wednesday,Afternoon,91


In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19675 entries, 0 to 19680
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   USERNAME               19675 non-null  object        
 1   FOLLOWERS              19675 non-null  int64         
 2   FOLLOWING              19675 non-null  int64         
 3   LIKES                  19675 non-null  int64         
 4   COMMENTS               19675 non-null  int64         
 5   TEXT                   19675 non-null  object        
 6   DATE                   19675 non-null  datetime64[ns]
 7   TYPE(1 PHOTO,2 VIDEO)  19675 non-null  int64         
 8   USERS IN PHOTO         19675 non-null  object        
 9   LINK                   19675 non-null  object        
 10  number_of_tags         19675 non-null  int64         
 11  number_of_mentions     19675 non-null  int64         
 12  Month                  19675 non-null  int32         
 13  Day_of

In [21]:
# Rename the 'USERS IN PHOTO' column for better clarity
data.rename(columns={'USERS IN PHOTO': 'Tagged_Users_Count'}, inplace=True)
# Convert Tagged_Users_Count to integer after replacing invalid values
data['Tagged_Users_Count'] = data['Tagged_Users_Count'].replace('-', 0)
data['Tagged_Users_Count'] = data['Tagged_Users_Count'].astype(int)

In [22]:
# Checking unique values in the Month column
data['Month'].unique()

array([ 2,  1, 12, 11, 10,  9,  8,  7,  4,  5,  3])

In [23]:
# Checking unique values in the Day of Week column
data['Day_of_Week'].unique()

array(['Saturday', 'Thursday', 'Wednesday', 'Tuesday', 'Monday', 'Friday',
       'Sunday'], dtype=object)

In [24]:
# Checking unique values in the Post Timing column
data['Post_Timing'].unique()

array(['Morning', 'Night', 'Afternoon', 'Evening'], dtype=object)

In [25]:
# Checking unique values in the TYPE (PHOTO, VIDEO) column
data['TYPE(1 PHOTO,2 VIDEO)'].unique()

array([1, 2], dtype=int64)

In [26]:
# Creating the Month Name column from the Month column
data['Month_Name'] = pd.to_datetime(data['Month'], format='%m').dt.month_name()

# Replacing 1 by 'photo' and 2 by 'video' in the TYPE column
data['TYPE(1 PHOTO,2 VIDEO)'] = data['TYPE(1 PHOTO,2 VIDEO)'].map({1: 'photo', 2: 'video'})


In [27]:
# Create dummy variables for categorical columns
data = pd.get_dummies(data, columns=['Month_Name', 'Day_of_Week', 'Post_Timing', 'TYPE(1 PHOTO,2 VIDEO)'], drop_first=True)

In [28]:
data.head()

Unnamed: 0,USERNAME,FOLLOWERS,FOLLOWING,LIKES,COMMENTS,TEXT,DATE,Tagged_Users_Count,LINK,number_of_tags,...,Day_of_Week_Monday,Day_of_Week_Saturday,Day_of_Week_Sunday,Day_of_Week_Thursday,Day_of_Week_Tuesday,Day_of_Week_Wednesday,Post_Timing_Evening,Post_Timing_Morning,Post_Timing_Night,"TYPE(1 PHOTO,2 VIDEO)_video"
0,georgiou82,1134619,915,18560,95,Every day is a #newday and every experience is...,2017-02-11 11:05:10,1,https://www.instagram.com/p/BQXZpSQjiPj,2,...,False,True,False,False,False,False,False,True,False,False
1,georgiou82,1134619,915,17710,86,#goodnight !!! The #weekend is ahead!!! Have a...,2017-02-11 00:33:10,1,https://www.instagram.com/p/BQWRUV6j8b-,3,...,False,True,False,False,False,False,False,False,True,False
2,georgiou82,1134619,915,11959,79,#tb #2007 #stous31dromous #megatv #greece #new...,2017-02-09 15:51:47,3,https://www.instagram.com/p/BQSw2wMj0Bw,7,...,False,False,False,True,False,False,False,False,False,False
3,georgiou82,1134619,915,13608,37,Cold weather brings people together.... #filmi...,2017-02-08 18:21:57,3,https://www.instagram.com/p/BQQdPrLDeRz,2,...,False,False,False,False,False,True,True,False,False,False
4,georgiou82,1134619,915,15842,62,The youngest hearts might be the oldest souls....,2017-02-08 13:03:14,0,https://www.instagram.com/p/BQP4xY7jAXb,4,...,False,False,False,False,False,True,False,False,False,False


In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19675 entries, 0 to 19680
Data columns (total 33 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   USERNAME                     19675 non-null  object        
 1   FOLLOWERS                    19675 non-null  int64         
 2   FOLLOWING                    19675 non-null  int64         
 3   LIKES                        19675 non-null  int64         
 4   COMMENTS                     19675 non-null  int64         
 5   TEXT                         19675 non-null  object        
 6   DATE                         19675 non-null  datetime64[ns]
 7   Tagged_Users_Count           19675 non-null  int32         
 8   LINK                         19675 non-null  object        
 9   number_of_tags               19675 non-null  int64         
 10  number_of_mentions           19675 non-null  int64         
 11  Month                        19675 non-null  i

In [31]:
columns_to_convert = ['TYPE(1 PHOTO,2 VIDEO)_video','Post_Timing_Evening','Post_Timing_Morning','Post_Timing_Night','Day_of_Week_Monday','Day_of_Week_Saturday','Day_of_Week_Sunday', 'Day_of_Week_Thursday','Day_of_Week_Tuesday','Day_of_Week_Wednesday','Month_Name_August','Month_Name_December','Month_Name_February','Month_Name_January','Month_Name_July','Month_Name_March','Month_Name_May','Month_Name_November','Month_Name_October','Month_Name_September']
data[columns_to_convert] = data[columns_to_convert].astype(int)

In [32]:
data.head()

Unnamed: 0,USERNAME,FOLLOWERS,FOLLOWING,LIKES,COMMENTS,TEXT,DATE,Tagged_Users_Count,LINK,number_of_tags,...,Day_of_Week_Monday,Day_of_Week_Saturday,Day_of_Week_Sunday,Day_of_Week_Thursday,Day_of_Week_Tuesday,Day_of_Week_Wednesday,Post_Timing_Evening,Post_Timing_Morning,Post_Timing_Night,"TYPE(1 PHOTO,2 VIDEO)_video"
0,georgiou82,1134619,915,18560,95,Every day is a #newday and every experience is...,2017-02-11 11:05:10,1,https://www.instagram.com/p/BQXZpSQjiPj,2,...,0,1,0,0,0,0,0,1,0,0
1,georgiou82,1134619,915,17710,86,#goodnight !!! The #weekend is ahead!!! Have a...,2017-02-11 00:33:10,1,https://www.instagram.com/p/BQWRUV6j8b-,3,...,0,1,0,0,0,0,0,0,1,0
2,georgiou82,1134619,915,11959,79,#tb #2007 #stous31dromous #megatv #greece #new...,2017-02-09 15:51:47,3,https://www.instagram.com/p/BQSw2wMj0Bw,7,...,0,0,0,1,0,0,0,0,0,0
3,georgiou82,1134619,915,13608,37,Cold weather brings people together.... #filmi...,2017-02-08 18:21:57,3,https://www.instagram.com/p/BQQdPrLDeRz,2,...,0,0,0,0,0,1,1,0,0,0
4,georgiou82,1134619,915,15842,62,The youngest hearts might be the oldest souls....,2017-02-08 13:03:14,0,https://www.instagram.com/p/BQP4xY7jAXb,4,...,0,0,0,0,0,1,0,0,0,0


In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19675 entries, 0 to 19680
Data columns (total 33 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   USERNAME                     19675 non-null  object        
 1   FOLLOWERS                    19675 non-null  int64         
 2   FOLLOWING                    19675 non-null  int64         
 3   LIKES                        19675 non-null  int64         
 4   COMMENTS                     19675 non-null  int64         
 5   TEXT                         19675 non-null  object        
 6   DATE                         19675 non-null  datetime64[ns]
 7   Tagged_Users_Count           19675 non-null  int32         
 8   LINK                         19675 non-null  object        
 9   number_of_tags               19675 non-null  int64         
 10  number_of_mentions           19675 non-null  int64         
 11  Month                        19675 non-null  i

# **Descriptive Statistics and Outlier Detection**

In [35]:
from scipy.stats import zscore

# Columns to analyze for outliers
columns_of_interest = [
    'FOLLOWERS', 'FOLLOWING', 'LIKES', 'COMMENTS',
    'Tagged_Users_Count', 'number_of_tags',
    'number_of_mentions', 'Text_Length'
]

In [36]:
# Calculate Z-scores for numeric features to identify outliers
z_scores = data[columns_of_interest].apply(zscore)
# Create a mask for rows where any column's Z-score exceeds 3
outlier_mask = (z_scores.abs() > 3).any(axis=1)
# Filter out the outliers
data_cleaned = data[~outlier_mask]

# Display the number of rows before and after outlier removal
print(f"Original number of rows: {data.shape[0]}")
print(f"Number of rows after outlier removal: {data_cleaned.shape[0]}")


Original number of rows: 19675
Number of rows after outlier removal: 18276


# **Correlation Analysis**

In [37]:
# Select relevant columns from the dataset for calculating the correlation matrix
# These columns include numeric features and dummy variables representing categorical data
columns_for_correlation_matrix = data[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September']]
# Compute the correlation matrix to evaluate the relationships between the selected features
correlation_matrix = columns_for_correlation_matrix.corr()
# Display the correlation matrix
correlation_matrix

Unnamed: 0,FOLLOWERS,FOLLOWING,Tagged_Users_Count,number_of_tags,number_of_mentions,Text_Length,"TYPE(1 PHOTO,2 VIDEO)_video",Post_Timing_Evening,Post_Timing_Morning,Post_Timing_Night,...,Month_Name_August,Month_Name_December,Month_Name_February,Month_Name_January,Month_Name_July,Month_Name_March,Month_Name_May,Month_Name_November,Month_Name_October,Month_Name_September
FOLLOWERS,1.0,-0.093496,0.145211,-0.043321,-0.000932,-0.007532,-0.007427,0.001737,-0.015098,-0.015618,...,0.305785,0.039529,0.153487,0.228232,0.222446,-0.005591,-0.021694,0.222446,0.28722,0.385464
FOLLOWING,-0.093496,1.0,0.021503,0.10725,0.102484,0.056984,-0.054276,-0.055901,0.010759,0.083858,...,-0.007506,-0.005398,-4.6e-05,-0.008688,-0.00546,0.000514,0.014776,-0.00546,-0.00705,-0.009462
Tagged_Users_Count,0.145211,0.021503,1.0,0.11228,0.076426,0.084958,-0.099426,0.013546,-0.012141,-0.0259,...,0.002451,-0.0018,-0.001107,0.010644,-0.003118,-0.005147,-0.003442,0.025918,0.021403,0.051475
number_of_tags,-0.043321,0.10725,0.11228,1.0,0.107736,0.602237,-0.0056,0.007898,-0.019475,0.066102,...,0.033767,-0.001508,-0.016149,-0.01134,0.017416,-0.00352,0.00889,-0.012626,-0.008195,-0.003276
number_of_mentions,-0.000932,0.102484,0.076426,0.107736,1.0,0.279413,0.110804,-0.002946,-0.002453,0.03959,...,-0.010458,-0.005244,-0.011531,0.002315,-0.004901,-0.00677,0.000847,0.000677,-0.002006,0.005201
Text_Length,-0.007532,0.056984,0.084958,0.602237,0.279413,1.0,0.032277,-0.003092,0.007051,0.043394,...,0.004088,-0.001325,-0.016318,-0.010335,0.00593,-0.006744,0.005063,-0.011668,-0.011553,-0.011648
"TYPE(1 PHOTO,2 VIDEO)_video",-0.007427,-0.054276,-0.099426,-0.0056,0.110804,0.032277,1.0,0.024033,-0.01368,0.001728,...,-0.003712,-0.004025,-0.005742,-0.002751,-0.006972,-0.005196,0.003549,-0.006972,-0.009002,-0.012081
Post_Timing_Evening,0.001737,-0.055901,0.013546,0.007898,-0.002946,-0.003092,0.024033,1.0,-0.214828,-0.450582,...,0.002726,-0.007202,0.004952,-0.003413,-0.001551,-0.009298,-0.01906,0.003911,0.00505,0.000467
Post_Timing_Morning,-0.015098,0.010759,-0.012141,-0.019475,-0.002453,0.007051,-0.01368,-0.214828,1.0,-0.284635,...,-0.0055,0.008141,0.00174,-0.010176,0.006775,0.013788,0.045456,-0.000553,-0.010176,-0.000958
Post_Timing_Night,-0.015618,0.083858,-0.0259,0.066102,0.03959,0.043394,0.001728,-0.450582,-0.284635,1.0,...,0.009457,-0.001034,-0.003848,0.016719,0.008035,-0.005728,-0.016885,0.003123,-0.002312,0.002573


In [38]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant


# Calculate Variance Inflation Factor (VIF) for independent variables
X = data[[
    'FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length']]

# Add a constant term to the model for the intercept
X = add_constant(X)

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif_data)

              Feature       VIF
0               const  2.685209
1           FOLLOWERS  1.034455
2           FOLLOWING  1.030143
3  Tagged_Users_Count  1.041214
4      number_of_tags  1.609585
5  number_of_mentions  1.106235
6         Text_Length  1.695642


# **Linear Regression for Likes and Comments**

## Regression Analysis for Predicting Likes

In [39]:
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

# Define the dependent variable (DV) and independent variables (IVs)
IVs = data[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count','number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September']]
DV = data[['LIKES']]

# Add a constant to the model (for the intercept)
IVs_with_const = sm.add_constant(IVs)

# Fit the linear regression model
model = sm.OLS(DV, IVs_with_const).fit()

# Get the summary of the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  LIKES   R-squared:                       0.438
Model:                            OLS   Adj. R-squared:                  0.438
Method:                 Least Squares   F-statistic:                     590.0
Date:                Tue, 10 Dec 2024   Prob (F-statistic):               0.00
Time:                        20:36:17   Log-Likelihood:            -1.9194e+05
No. Observations:               19675   AIC:                         3.839e+05
Df Residuals:                   19648   BIC:                         3.841e+05
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

## Standardized Regression Analysis for Predicting Likes

In [40]:
from scipy.stats.mstats import zscore

# Define the dependent variable (DV) and independent variables (IVs)
IVs = data[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count','number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September']]
DV = data[['LIKES']]

# Standardize the dependent variable (DV) and independent variables (IVs)
DV_standardized = zscore(DV)
IVs_standardized = zscore(IVs)

# Add a constant to the standardized independent variables
IVs_with_const = sm.add_constant(IVs_standardized)

# Fit the OLS model using Statsmodels
standardized_model = sm.OLS(DV_standardized, IVs_with_const).fit()

# Get the summary of the regression results
print(standardized_model.summary())

                            OLS Regression Results                            
Dep. Variable:                  LIKES   R-squared:                       0.438
Model:                            OLS   Adj. R-squared:                  0.438
Method:                 Least Squares   F-statistic:                     590.0
Date:                Tue, 10 Dec 2024   Prob (F-statistic):               0.00
Time:                        20:36:29   Log-Likelihood:                -22241.
No. Observations:               19675   AIC:                         4.454e+04
Df Residuals:                   19648   BIC:                         4.475e+04
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

## Regression Analysis for Predicting Comments

In [41]:
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

# Define the dependent variable (DV) and independent variables (IVs)
IVs = data[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count','number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September']]
DV = data[['COMMENTS']]

# Add a constant to the model (for the intercept)
IVs_with_const = sm.add_constant(IVs)

# Fit the linear regression model
model = sm.OLS(DV, IVs_with_const).fit()

# Get the summary of the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               COMMENTS   R-squared:                       0.070
Model:                            OLS   Adj. R-squared:                  0.069
Method:                 Least Squares   F-statistic:                     57.08
Date:                Tue, 10 Dec 2024   Prob (F-statistic):          5.40e-286
Time:                        20:36:39   Log-Likelihood:            -1.4731e+05
No. Observations:               19675   AIC:                         2.947e+05
Df Residuals:                   19648   BIC:                         2.949e+05
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

## Standardized Regression Analysis for Predicting Comments

In [42]:
from scipy.stats.mstats import zscore

# Define the dependent variable (DV) and independent variables (IVs)
IVs = data[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count','number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September']]
DV = data[['COMMENTS']]

# Standardize the dependent variable (DV) and independent variables (IVs)
DV_standardized = zscore(DV)
IVs_standardized = zscore(IVs)

# Add a constant to the standardized independent variables
IVs_with_const = sm.add_constant(IVs_standardized)

# Fit the OLS model using Statsmodels
standardized_model = sm.OLS(DV_standardized, IVs_with_const).fit()

# Get the summary of the regression results
print(standardized_model.summary())

                            OLS Regression Results                            
Dep. Variable:               COMMENTS   R-squared:                       0.070
Model:                            OLS   Adj. R-squared:                  0.069
Method:                 Least Squares   F-statistic:                     57.08
Date:                Tue, 10 Dec 2024   Prob (F-statistic):          5.40e-286
Time:                        20:36:54   Log-Likelihood:                -27201.
No. Observations:               19675   AIC:                         5.446e+04
Df Residuals:                   19648   BIC:                         5.467e+04
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

# **Creating and Analyzing New Variables**

In [43]:
data_new = data

In [44]:
# Create new features: Count of question marks and count of exclamation marks
data_new['Count_of_question_mark'] = data_new['TEXT'].str.count(r'\?')
data_new['Count_of_exclamation_markt'] = data_new['TEXT'].str.count(r'\!')

In [45]:
# Create new features: Count of keywords for engagement
import re

# List of keywords related to engagement
keywords_for_engagement = [
    'love', 'amazing', 'incredible', 'awesome', 'fantastic', 'beautiful', 'wonderful', 'perfect', 
    'stunning', 'jaw-dropping', 'heartwarming', 'inspiring', 'exciting', 'powerful', 'fun', 
    'comment', 'share', 'like', 'follow', 'tag', 'subscribe', 'join', 'click', 'swipe', 'check', 
    'explore', 'recommend', 'vote', 'participate', 'donate', 'challenge', 'contest', 'win',
    'reply', 'respond', 'engage', 'message', 'interact', 'shoutout', 'mention', 'post', 'update', 
    'buzz', 'reaction', 'spotlight', 'feature', 'giveaway', 'free', 'discount', 'exclusive', 
    'special', 'deal', 'offer', 'price', 'sale', 'coupon', 'access', 'invitation', 'limited',
    'please', 'help', 'support', 'favor', 'contribute', 'spread', 'share', 'like', 'comment', 
    '#love', '#instagood', '#follow', '#likeforlike', '#commentforcomment', '#explorepage', 
    '#foryou', '#igers', '#viral', '#fashion', '#style', '#cute', '#funny', '#fitness', '#motivation'
]

data_new['keywords_for_engagement'] = data_new['TEXT'].apply(lambda x: sum([1 for word in keywords_for_engagement if re.search(r'\b' + re.escape(word) + r'\b', x, re.IGNORECASE)]))
data_new['keywords_for_engagement'].unique()

array([0, 1, 2, 3, 4, 5, 6, 8, 7, 9], dtype=int64)

In [46]:
# Create new features: Count of emojis for engagement
import re

# List of emojis related to engagement
emojis_for_engagement = [
    '❤️', '🧡', '💛', '💚', '💙', '💜', '🖤', '🤍', '💖', '💘', '💝', '💗', '💓', '💞', '💕', 
    '😊', '😁', '😍', '😋', '😃', '🤩', '😎', '🙌', '✨', '🌟', '🌈', '🤗', '💪', '🥳', '🎉', 
    '🥰', '🌸', '👏', '🏆', '🎯', '🥇', '💥', '🔥', '💯', '💬', '🎉', '🎈', '🏅', '🎁', '🏆', 
    '🎶', '🌟', '🥂', '🍾', '😘', '😽', '💋', '🥰', '💖', '💕', '💑', '👩‍❤️‍👩', '👨‍❤️‍👨', 
    '🚀', '🌍', '🌎', '🌏', '🔥', '✨', '📸', '🎥', '🏖️', '📝', '📲', '📢', '🔔', '🛎️', '🔑', 
    '🚨', '🏁', '🆗', '⬇️'
]

# Create a new column that counts the occurrence of any engagement-related emojis in the TEXT column, case-insensitive
data_new['emojis_for_engagement'] = data_new['TEXT'].apply(lambda x: sum([1 for emoji in emojis_for_engagement if emoji in x]))

# Display the updated dataframe with the new column
data_new['emojis_for_engagement'].unique()

array([0, 1, 2, 3, 4], dtype=int64)

In [47]:
data_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19675 entries, 0 to 19680
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   USERNAME                     19675 non-null  object        
 1   FOLLOWERS                    19675 non-null  int64         
 2   FOLLOWING                    19675 non-null  int64         
 3   LIKES                        19675 non-null  int64         
 4   COMMENTS                     19675 non-null  int64         
 5   TEXT                         19675 non-null  object        
 6   DATE                         19675 non-null  datetime64[ns]
 7   Tagged_Users_Count           19675 non-null  int32         
 8   LINK                         19675 non-null  object        
 9   number_of_tags               19675 non-null  int64         
 10  number_of_mentions           19675 non-null  int64         
 11  Month                        19675 non-null  i

## Correlation Matrix Analysis for New Variables

In [48]:
correlation_matrix_columns = data_new[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September', 'Count_of_question_mark', 'keywords_for_engagement', 'emojis_for_engagement']]
correlation_matrix = correlation_matrix_columns.corr()
correlation_matrix

Unnamed: 0,FOLLOWERS,FOLLOWING,Tagged_Users_Count,number_of_tags,number_of_mentions,Text_Length,"TYPE(1 PHOTO,2 VIDEO)_video",Post_Timing_Evening,Post_Timing_Morning,Post_Timing_Night,...,Month_Name_January,Month_Name_July,Month_Name_March,Month_Name_May,Month_Name_November,Month_Name_October,Month_Name_September,Count_of_question_mark,keywords_for_engagement,emojis_for_engagement
FOLLOWERS,1.0,-0.093496,0.145211,-0.043321,-0.000932,-0.007532,-0.007427,0.001737,-0.015098,-0.015618,...,0.228232,0.222446,-0.005591,-0.021694,0.222446,0.28722,0.385464,-0.044039,0.002734,-0.009812
FOLLOWING,-0.093496,1.0,0.021503,0.10725,0.102484,0.056984,-0.054276,-0.055901,0.010759,0.083858,...,-0.008688,-0.00546,0.000514,0.014776,-0.00546,-0.00705,-0.009462,0.030003,0.052091,-0.024599
Tagged_Users_Count,0.145211,0.021503,1.0,0.11228,0.076426,0.084958,-0.099426,0.013546,-0.012141,-0.0259,...,0.010644,-0.003118,-0.005147,-0.003442,0.025918,0.021403,0.051475,0.024505,0.067118,0.041293
number_of_tags,-0.043321,0.10725,0.11228,1.0,0.107736,0.602237,-0.0056,0.007898,-0.019475,0.066102,...,-0.01134,0.017416,-0.00352,0.00889,-0.012626,-0.008195,-0.003276,0.133918,0.39667,0.075297
number_of_mentions,-0.000932,0.102484,0.076426,0.107736,1.0,0.279413,0.110804,-0.002946,-0.002453,0.03959,...,0.002315,-0.004901,-0.00677,0.000847,0.000677,-0.002006,0.005201,0.15597,0.094327,0.015842
Text_Length,-0.007532,0.056984,0.084958,0.602237,0.279413,1.0,0.032277,-0.003092,0.007051,0.043394,...,-0.010335,0.00593,-0.006744,0.005063,-0.011668,-0.011553,-0.011648,0.194904,0.326648,0.079328
"TYPE(1 PHOTO,2 VIDEO)_video",-0.007427,-0.054276,-0.099426,-0.0056,0.110804,0.032277,1.0,0.024033,-0.01368,0.001728,...,-0.002751,-0.006972,-0.005196,0.003549,-0.006972,-0.009002,-0.012081,-0.001586,-0.047633,-0.031936
Post_Timing_Evening,0.001737,-0.055901,0.013546,0.007898,-0.002946,-0.003092,0.024033,1.0,-0.214828,-0.450582,...,-0.003413,-0.001551,-0.009298,-0.01906,0.003911,0.00505,0.000467,-0.004083,0.021948,-0.002971
Post_Timing_Morning,-0.015098,0.010759,-0.012141,-0.019475,-0.002453,0.007051,-0.01368,-0.214828,1.0,-0.284635,...,-0.010176,0.006775,0.013788,0.045456,-0.000553,-0.010176,-0.000958,-0.018319,0.017968,0.001938
Post_Timing_Night,-0.015618,0.083858,-0.0259,0.066102,0.03959,0.043394,0.001728,-0.450582,-0.284635,1.0,...,0.016719,0.008035,-0.005728,-0.016885,0.003123,-0.002312,0.002573,0.036754,-0.022956,0.006212


# **Re-run regression with new variables**

## Regression Analysis for Predicting Likes with New Variables

In [49]:
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

# Define the dependent variable (DV) and independent variables (IVs)
IVs = data_new[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September', 'Count_of_exclamation_markt','Count_of_question_mark', 'keywords_for_engagement', 'emojis_for_engagement']]
DV = data_new[['LIKES']]

# Add a constant to the model (for the intercept)
IVs_with_const = sm.add_constant(IVs)

# Fit the linear regression model
model = sm.OLS(DV, IVs_with_const).fit()

# Get the summary of the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  LIKES   R-squared:                       0.439
Model:                            OLS   Adj. R-squared:                  0.438
Method:                 Least Squares   F-statistic:                     512.8
Date:                Tue, 10 Dec 2024   Prob (F-statistic):               0.00
Time:                        20:43:09   Log-Likelihood:            -1.9192e+05
No. Observations:               19675   AIC:                         3.839e+05
Df Residuals:                   19644   BIC:                         3.842e+05
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

## Standardized Regression Analysis for Predicting Likes with New Variables

In [50]:
from scipy.stats.mstats import zscore

# Define the dependent variable (DV) and independent variables (IVs)
IVs = data_new[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September', 'Count_of_exclamation_markt','Count_of_question_mark', 'keywords_for_engagement', 'emojis_for_engagement']]
DV = data_new[['LIKES']]

# Standardize the dependent variable (DV) and independent variables (IVs)
DV_standardized = zscore(DV)
IVs_standardized = zscore(IVs)

# Add a constant to the standardized independent variables
IVs_with_const = sm.add_constant(IVs_standardized)

# Fit the OLS model using Statsmodels
standardized_model = sm.OLS(DV_standardized, IVs_with_const).fit()

# Get the summary of the regression results
print(standardized_model.summary())

                            OLS Regression Results                            
Dep. Variable:                  LIKES   R-squared:                       0.439
Model:                            OLS   Adj. R-squared:                  0.438
Method:                 Least Squares   F-statistic:                     512.8
Date:                Tue, 10 Dec 2024   Prob (F-statistic):               0.00
Time:                        20:44:17   Log-Likelihood:                -22228.
No. Observations:               19675   AIC:                         4.452e+04
Df Residuals:                   19644   BIC:                         4.476e+04
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

## Regression Analysis for Predicting Comments with New Variables

In [51]:
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

# Define the dependent variable (DV) and independent variables (IVs)
IVs = data_new[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September', 'Count_of_exclamation_markt','Count_of_question_mark', 'keywords_for_engagement', 'emojis_for_engagement']]
DV = data_new[['COMMENTS']]

# Add a constant to the model (for the intercept)
IVs_with_const = sm.add_constant(IVs)

# Fit the linear regression model
model = sm.OLS(DV, IVs_with_const).fit()

# Get the summary of the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               COMMENTS   R-squared:                       0.081
Model:                            OLS   Adj. R-squared:                  0.080
Method:                 Least Squares   F-statistic:                     57.88
Date:                Tue, 10 Dec 2024   Prob (F-statistic):               0.00
Time:                        20:44:37   Log-Likelihood:            -1.4720e+05
No. Observations:               19675   AIC:                         2.945e+05
Df Residuals:                   19644   BIC:                         2.947e+05
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

## Standardized Regression Analysis for Predicting Comments with New Variables


In [52]:
from scipy.stats.mstats import zscore

# Define the dependent variable (DV) and independent variables (IVs)
IVs = data_new[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September', 'Count_of_exclamation_markt','Count_of_question_mark', 'keywords_for_engagement', 'emojis_for_engagement']]
DV = data_new[['COMMENTS']]

# Standardize the dependent variable (DV) and independent variables (IVs)
DV_standardized = zscore(DV)
IVs_standardized = zscore(IVs)

# Add a constant to the standardized independent variables
IVs_with_const = sm.add_constant(IVs_standardized)

# Fit the OLS model using Statsmodels
standardized_model = sm.OLS(DV_standardized, IVs_with_const).fit()

# Get the summary of the regression results
print(standardized_model.summary())

                            OLS Regression Results                            
Dep. Variable:               COMMENTS   R-squared:                       0.081
Model:                            OLS   Adj. R-squared:                  0.080
Method:                 Least Squares   F-statistic:                     57.88
Date:                Tue, 10 Dec 2024   Prob (F-statistic):               0.00
Time:                        20:44:51   Log-Likelihood:                -27084.
No. Observations:               19675   AIC:                         5.423e+04
Df Residuals:                   19644   BIC:                         5.448e+04
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

In [53]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19675 entries, 0 to 19680
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   USERNAME                     19675 non-null  object        
 1   FOLLOWERS                    19675 non-null  int64         
 2   FOLLOWING                    19675 non-null  int64         
 3   LIKES                        19675 non-null  int64         
 4   COMMENTS                     19675 non-null  int64         
 5   TEXT                         19675 non-null  object        
 6   DATE                         19675 non-null  datetime64[ns]
 7   Tagged_Users_Count           19675 non-null  int32         
 8   LINK                         19675 non-null  object        
 9   number_of_tags               19675 non-null  int64         
 10  number_of_mentions           19675 non-null  int64         
 11  Month                        19675 non-null  i

# **Weekend Analysis**

In [54]:
# Extract the day of the week (e.g., Monday, Tuesday) from the 'DATE' column and store it in a new column called 'Day_of_Week'
data['Day_of_Week'] = data['DATE'].dt.day_name()

In [55]:
# Create a new column 'Is_Weekend' to indicate whether the post was made on a weekend (1 for Saturday/Sunday, 0 for weekdays)
data['Is_Weekend'] = data['Day_of_Week'].apply(lambda x: 1 if x in ['Saturday', 'Sunday'] else 0)

In [56]:
data.head()

Unnamed: 0,USERNAME,FOLLOWERS,FOLLOWING,LIKES,COMMENTS,TEXT,DATE,Tagged_Users_Count,LINK,number_of_tags,...,Post_Timing_Evening,Post_Timing_Morning,Post_Timing_Night,"TYPE(1 PHOTO,2 VIDEO)_video",Count_of_question_mark,Count_of_exclamation_markt,keywords_for_engagement,emojis_for_engagement,Day_of_Week,Is_Weekend
0,georgiou82,1134619,915,18560,95,Every day is a #newday and every experience is...,2017-02-11 11:05:10,1,https://www.instagram.com/p/BQXZpSQjiPj,2,...,0,1,0,0,0,0,0,0,Saturday,1
1,georgiou82,1134619,915,17710,86,#goodnight !!! The #weekend is ahead!!! Have a...,2017-02-11 00:33:10,1,https://www.instagram.com/p/BQWRUV6j8b-,3,...,0,0,1,0,0,10,0,0,Saturday,1
2,georgiou82,1134619,915,11959,79,#tb #2007 #stous31dromous #megatv #greece #new...,2017-02-09 15:51:47,3,https://www.instagram.com/p/BQSw2wMj0Bw,7,...,0,0,0,0,0,0,0,0,Thursday,0
3,georgiou82,1134619,915,13608,37,Cold weather brings people together.... #filmi...,2017-02-08 18:21:57,3,https://www.instagram.com/p/BQQdPrLDeRz,2,...,1,0,0,0,0,0,0,0,Wednesday,0
4,georgiou82,1134619,915,15842,62,The youngest hearts might be the oldest souls....,2017-02-08 13:03:14,0,https://www.instagram.com/p/BQP4xY7jAXb,4,...,0,0,0,0,0,0,0,0,Wednesday,0


# **Regression Analysis for Question 3: Weekend vs. Weekday Impact on Likes and Comments**

## Regression Analysis for Predicting Likes with Weekend Indicator

In [58]:
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

# Define the dependent variable (DV) and independent variables (IVs)
IVs = data[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night','Is_Weekend','Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September']]
DV = data[['LIKES']]

# Add a constant to the model (for the intercept)
IVs_with_const = sm.add_constant(IVs)

# Fit the linear regression model
model = sm.OLS(DV, IVs_with_const).fit()

# Get the summary of the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  LIKES   R-squared:                       0.438
Model:                            OLS   Adj. R-squared:                  0.438
Method:                 Least Squares   F-statistic:                     730.4
Date:                Tue, 10 Dec 2024   Prob (F-statistic):               0.00
Time:                        20:46:21   Log-Likelihood:            -1.9194e+05
No. Observations:               19675   AIC:                         3.839e+05
Df Residuals:                   19653   BIC:                         3.841e+05
Df Model:                          21                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

## Regression Analysis for Predicting Comments with Weekend Indicator

In [59]:
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

# Define the dependent variable (DV) and independent variables (IVs)
IVs = data[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night','Is_Weekend','Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September']]
DV = data[['COMMENTS']]

# Add a constant to the model (for the intercept)
IVs_with_const = sm.add_constant(IVs)

# Fit the linear regression model
model = sm.OLS(DV, IVs_with_const).fit()

# Get the summary of the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               COMMENTS   R-squared:                       0.070
Model:                            OLS   Adj. R-squared:                  0.069
Method:                 Least Squares   F-statistic:                     70.24
Date:                Tue, 10 Dec 2024   Prob (F-statistic):          1.13e-288
Time:                        20:46:23   Log-Likelihood:            -1.4732e+05
No. Observations:               19675   AIC:                         2.947e+05
Df Residuals:                   19653   BIC:                         2.949e+05
Df Model:                          21                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

# **Micro and Macro Influencers Analysis**

In [60]:
# Divide data into micro and macro influencers
micro_influencers = data_new[data_new['FOLLOWERS'] < 50000]
macro_influencers = data_new[data_new['FOLLOWERS'] >= 50000]

## Regression for micro influencers

### Regression Analysis for Predicting Likes (Micro Influencers)

In [61]:
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

# Define the dependent variable (DV) and independent variables (IVs)
IVs = micro_influencers[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September', 'Count_of_exclamation_markt','Count_of_question_mark', 'keywords_for_engagement', 'emojis_for_engagement']]
DV = micro_influencers[['LIKES']]

# Add a constant to the model (for the intercept)
IVs_with_const = sm.add_constant(IVs)

# Fit the linear regression model
model = sm.OLS(DV, IVs_with_const).fit()

# Get the summary of the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  LIKES   R-squared:                       0.097
Model:                            OLS   Adj. R-squared:                  0.096
Method:                 Least Squares   F-statistic:                     56.51
Date:                Tue, 10 Dec 2024   Prob (F-statistic):          5.99e-268
Time:                        20:46:36   Log-Likelihood:            -1.1405e+05
No. Observations:               13134   AIC:                         2.282e+05
Df Residuals:                   13108   BIC:                         2.284e+05
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

### Standardized Regression Analysis for Predicting Likes (Micro Influencers)

In [62]:
from scipy.stats.mstats import zscore

# Define the dependent variable (DV) and independent variables (IVs)
IVs = micro_influencers[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September', 'Count_of_exclamation_markt','Count_of_question_mark', 'keywords_for_engagement', 'emojis_for_engagement']]
DV = micro_influencers[['LIKES']]

# Standardize the dependent variable (DV) and independent variables (IVs)
DV_standardized = zscore(DV)
IVs_standardized = zscore(IVs)

# Add a constant to the standardized independent variables
IVs_with_const = sm.add_constant(IVs_standardized)
# Fill missing values for specific columns
IVs_with_const['Month_Name_August'] = IVs_with_const['Month_Name_August'].fillna(0)  # Replace NaN with 0
IVs_with_const['Month_Name_July'] = IVs_with_const['Month_Name_July'].fillna(0)
IVs_with_const['Month_Name_November'] = IVs_with_const['Month_Name_November'].fillna(0)
IVs_with_const['Month_Name_September'] = IVs_with_const['Month_Name_September'].fillna(0)
IVs_with_const['Month_Name_October'] = IVs_with_const['Month_Name_October'].fillna(0)
# Fit the OLS model using Statsmodels
standardized_model = sm.OLS(DV_standardized, IVs_with_const).fit()

# Get the summary of the regression results
print(standardized_model.summary())

                            OLS Regression Results                            
Dep. Variable:                  LIKES   R-squared:                       0.097
Model:                            OLS   Adj. R-squared:                  0.096
Method:                 Least Squares   F-statistic:                     56.51
Date:                Tue, 10 Dec 2024   Prob (F-statistic):          5.99e-268
Time:                        20:46:49   Log-Likelihood:                -17964.
No. Observations:               13134   AIC:                         3.598e+04
Df Residuals:                   13108   BIC:                         3.617e+04
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

### Regression Analysis for Predicting Comments (Micro Influencers)

In [63]:
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

# Define the dependent variable (DV) and independent variables (IVs)
IVs = micro_influencers[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September', 'Count_of_exclamation_markt','Count_of_question_mark', 'keywords_for_engagement', 'emojis_for_engagement']]
DV = micro_influencers[['COMMENTS']]

# Add a constant to the model (for the intercept)
IVs_with_const = sm.add_constant(IVs)

# Fit the linear regression model
model = sm.OLS(DV, IVs_with_const).fit()

# Get the summary of the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               COMMENTS   R-squared:                       0.015
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     8.063
Date:                Tue, 10 Dec 2024   Prob (F-statistic):           2.79e-29
Time:                        20:46:49   Log-Likelihood:                -76757.
No. Observations:               13134   AIC:                         1.536e+05
Df Residuals:                   13108   BIC:                         1.538e+05
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

### Standardized Regression Analysis for Predicting Comments (Micro Influencers)

In [64]:
from scipy.stats.mstats import zscore

# Define the dependent variable (DV) and independent variables (IVs)
IVs = micro_influencers[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September', 'Count_of_exclamation_markt','Count_of_question_mark', 'keywords_for_engagement', 'emojis_for_engagement']]
DV = micro_influencers[['COMMENTS']]

# Standardize the dependent variable (DV) and independent variables (IVs)
DV_standardized = zscore(DV)
IVs_standardized = zscore(IVs)

# Add a constant to the standardized independent variables
IVs_with_const = sm.add_constant(IVs_standardized)
# Fill missing values for specific columns
IVs_with_const['Month_Name_August'] = IVs_with_const['Month_Name_August'].fillna(0)  # Replace NaN with 0
IVs_with_const['Month_Name_July'] = IVs_with_const['Month_Name_July'].fillna(0)
IVs_with_const['Month_Name_November'] = IVs_with_const['Month_Name_November'].fillna(0)
IVs_with_const['Month_Name_September'] = IVs_with_const['Month_Name_September'].fillna(0)
IVs_with_const['Month_Name_October'] = IVs_with_const['Month_Name_October'].fillna(0)
# Fit the OLS model using Statsmodels
standardized_model = sm.OLS(DV_standardized, IVs_with_const).fit()

# Get the summary of the regression results
print(standardized_model.summary())

                            OLS Regression Results                            
Dep. Variable:               COMMENTS   R-squared:                       0.015
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     8.063
Date:                Tue, 10 Dec 2024   Prob (F-statistic):           2.79e-29
Time:                        20:46:50   Log-Likelihood:                -18536.
No. Observations:               13134   AIC:                         3.712e+04
Df Residuals:                   13108   BIC:                         3.732e+04
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

## Regression for macro influencers

### Regression Analysis for Predicting Likes (Macro Influencers)

In [65]:
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

# Define the dependent variable (DV) and independent variables (IVs)
IVs = macro_influencers[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September', 'Count_of_exclamation_markt','Count_of_question_mark', 'keywords_for_engagement', 'emojis_for_engagement']]
DV = macro_influencers[['LIKES']]

# Add a constant to the model (for the intercept)
IVs_with_const = sm.add_constant(IVs)

# Fit the linear regression model
model = sm.OLS(DV, IVs_with_const).fit()

# Get the summary of the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  LIKES   R-squared:                       0.421
Model:                            OLS   Adj. R-squared:                  0.419
Method:                 Least Squares   F-statistic:                     163.3
Date:                Tue, 10 Dec 2024   Prob (F-statistic):               0.00
Time:                        20:46:50   Log-Likelihood:                -66942.
No. Observations:                6541   AIC:                         1.339e+05
Df Residuals:                    6511   BIC:                         1.341e+05
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

### Standardized Regression Analysis for Predicting Likes (Macro Influencers)

In [66]:
from scipy.stats.mstats import zscore

# Define the dependent variable (DV) and independent variables (IVs)
IVs = macro_influencers[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September', 'Count_of_exclamation_markt','Count_of_question_mark', 'keywords_for_engagement', 'emojis_for_engagement']]
DV = macro_influencers[['LIKES']]

# Standardize the dependent variable (DV) and independent variables (IVs)
DV_standardized = zscore(DV)
IVs_standardized = zscore(IVs)

# Add a constant to the standardized independent variables
IVs_with_const = sm.add_constant(IVs_standardized)
# Fill missing values for specific columns
IVs_with_const['Month_Name_August'] = IVs_with_const['Month_Name_August'].fillna(0)  # Replace NaN with 0
IVs_with_const['Month_Name_July'] = IVs_with_const['Month_Name_July'].fillna(0)
IVs_with_const['Month_Name_November'] = IVs_with_const['Month_Name_November'].fillna(0)
IVs_with_const['Month_Name_September'] = IVs_with_const['Month_Name_September'].fillna(0)
IVs_with_const['Month_Name_October'] = IVs_with_const['Month_Name_October'].fillna(0)
IVs_with_const['Month_Name_March'] = IVs_with_const['Month_Name_March'].fillna(0)

# Fit the OLS model using Statsmodels
standardized_model = sm.OLS(DV_standardized, IVs_with_const).fit()

# Get the summary of the regression results
print(standardized_model.summary())

                            OLS Regression Results                            
Dep. Variable:                  LIKES   R-squared:                       0.421
Model:                            OLS   Adj. R-squared:                  0.419
Method:                 Least Squares   F-statistic:                     163.3
Date:                Tue, 10 Dec 2024   Prob (F-statistic):               0.00
Time:                        20:46:51   Log-Likelihood:                -7493.3
No. Observations:                6541   AIC:                         1.505e+04
Df Residuals:                    6511   BIC:                         1.525e+04
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

### Regression Analysis for Predicting Comments (Macro Influencers)

In [67]:
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

# Define the dependent variable (DV) and independent variables (IVs)
IVs = macro_influencers[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September', 'Count_of_exclamation_markt','Count_of_question_mark', 'keywords_for_engagement', 'emojis_for_engagement']]
DV = macro_influencers[['COMMENTS']]

# Add a constant to the model (for the intercept)
IVs_with_const = sm.add_constant(IVs)

# Fit the linear regression model
model = sm.OLS(DV, IVs_with_const).fit()

# Get the summary of the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               COMMENTS   R-squared:                       0.120
Model:                            OLS   Adj. R-squared:                  0.116
Method:                 Least Squares   F-statistic:                     30.66
Date:                Tue, 10 Dec 2024   Prob (F-statistic):          4.59e-157
Time:                        20:46:53   Log-Likelihood:                -52297.
No. Observations:                6541   AIC:                         1.047e+05
Df Residuals:                    6511   BIC:                         1.049e+05
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

### Standardized Regression Analysis for Predicting Comments (Micro Influencers)

In [68]:
from scipy.stats.mstats import zscore

# Define the dependent variable (DV) and independent variables (IVs)
IVs = macro_influencers[['FOLLOWERS', 'FOLLOWING', 'Tagged_Users_Count', 'number_of_tags', 'number_of_mentions', 'Text_Length', 'TYPE(1 PHOTO,2 VIDEO)_video',
                                            'Post_Timing_Evening', 'Post_Timing_Morning', 'Post_Timing_Night', 'Day_of_Week_Monday','Day_of_Week_Saturday',
                                            'Day_of_Week_Sunday', 'Day_of_Week_Thursday', 'Day_of_Week_Wednesday', 'Day_of_Week_Tuesday', 'Month_Name_August',
                                            'Month_Name_December', 'Month_Name_February', 'Month_Name_January', 'Month_Name_July', 'Month_Name_March',
                                            'Month_Name_May', 'Month_Name_November', 'Month_Name_October', 'Month_Name_September', 'Count_of_exclamation_markt','Count_of_question_mark', 'keywords_for_engagement', 'emojis_for_engagement']]
DV = macro_influencers[['COMMENTS']]

# Standardize the dependent variable (DV) and independent variables (IVs)
DV_standardized = zscore(DV)
IVs_standardized = zscore(IVs)

# Add a constant to the standardized independent variables
IVs_with_const = sm.add_constant(IVs_standardized)
# Fill missing values for specific columns

IVs_with_const['Month_Name_March'] = IVs_with_const['Month_Name_March'].fillna(0)

# Fit the OLS model using Statsmodels
standardized_model = sm.OLS(DV_standardized, IVs_with_const).fit()

# Get the summary of the regression results
print(standardized_model.summary())

                            OLS Regression Results                            
Dep. Variable:               COMMENTS   R-squared:                       0.120
Model:                            OLS   Adj. R-squared:                  0.116
Method:                 Least Squares   F-statistic:                     30.66
Date:                Tue, 10 Dec 2024   Prob (F-statistic):          4.59e-157
Time:                        20:46:53   Log-Likelihood:                -8862.6
No. Observations:                6541   AIC:                         1.779e+04
Df Residuals:                    6511   BIC:                         1.799e+04
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             