In [7]:
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np

In [2]:
def create_regressor_columns_string(columns):
  # Join all independent variables to define the formula used by the model
  regressor_columns = list(filter(lambda x: x != 'impression_count',columns))
  regressor_columns_string = "+".join(regressor_columns)
  return regressor_columns_string

In [38]:
def regression(path):
    # Load data
    regression_df_pd = pd.read_csv(path)
    regression_df_pd = regression_df_pd.drop('tweet_text', axis=1)
    
    # Apply log transformation to dependent variable
    regression_df_pd['impression_count'] = regression_df_pd['impression_count'].apply(lambda x: np.log(1+x))

    # Define formula without cross terms
    regressor_columns_string = create_regressor_columns_string(regression_df_pd.columns)

    # Fit model
    mod = smf.ols(formula=f'impression_count ~ {regressor_columns_string}', data=regression_df_pd)
    res = mod.fit()

    # Get p-values
    p_values = res.pvalues
    nb_variables_before = len(p_values)
    
    # Filter p-values < 0.05
    p_values = p_values[p_values < 0.05]
    nb_variables_after = len(p_values)

    # Get significant variables
    significant_variables = list(p_values.index)
    # Remove intercept
    significant_variables.remove('Intercept')

    print(f'Number of discarded variables: {nb_variables_before - nb_variables_after}')
    print(f'Significant variables ({len(significant_variables)}): {significant_variables}')

    # Apply log transformation to significant variables if "dummy" is not in the name
    for var in significant_variables:
        if 'dummy' not in var:
            regression_df_pd[var] = regression_df_pd[var].apply(lambda x: np.log(1+x))

    # Perform new regression with significant variables
    regressor_columns_string_new = create_regressor_columns_string(significant_variables)
    mod_new = smf.ols(formula=f'impression_count ~ {regressor_columns_string_new}', data=regression_df_pd)
    res_new = mod_new.fit()

    return res, res_new

In [39]:
path_frech_celebrities = '../data/regression/french_celebrities_regression_df.csv'
res_french_celebrities = regression(path_frech_celebrities)

display(res_french_celebrities[0].summary())
display(res_french_celebrities[1].summary())

Number of discarded variables: 8
Significant variables (15): ['dummy_entertainmentleisurebusiness', 'dummy_francepolitics', 'dummy_jeanlucmlenchon', 'dummy_nikosaliagas', 'dummy_staderennaisfc', 'dummy_tweet_period_afternoon', 'dummy_tweet_period_morning', 'dummy_tweet_period_night', 'hashtags_count', 'mentions_count', 'tweet_external_urls_count', 'tweet_length', 'tweet_medias_count', 'tweet_sentiment', 'followers_count']


0,1,2,3
Dep. Variable:,impression_count,R-squared:,0.397
Model:,OLS,Adj. R-squared:,0.396
Method:,Least Squares,F-statistic:,505.5
Date:,"Sun, 14 May 2023",Prob (F-statistic):,0.0
Time:,15:49:20,Log-Likelihood:,-42875.0
No. Observations:,16151,AIC:,85790.0
Df Residuals:,16129,BIC:,85960.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.4278,0.050,28.773,0.000,1.331,1.525
dummy_celebrities,-0.2276,0.446,-0.510,0.610,-1.102,0.647
dummy_entertainment,-0.1144,0.131,-0.871,0.384,-0.372,0.143
dummy_entertainmentleisurebusiness,-0.3983,0.114,-3.508,0.000,-0.621,-0.176
dummy_francepolitics,0.5358,0.184,2.919,0.004,0.176,0.896
dummy_jeanlucmlenchon,-0.3570,0.159,-2.245,0.025,-0.669,-0.045
dummy_nikosaliagas,-1.3781,0.118,-11.696,0.000,-1.609,-1.147
dummy_politicalfigures,-0.1088,0.150,-0.726,0.468,-0.402,0.185
dummy_soccer,-0.3641,0.335,-1.086,0.278,-1.022,0.293

0,1,2,3
Omnibus:,754.685,Durbin-Watson:,1.418
Prob(Omnibus):,0.0,Jarque-Bera (JB):,409.995
Skew:,0.228,Prob(JB):,9.35e-90
Kurtosis:,2.366,Cond. No.,9.06e+19


0,1,2,3
Dep. Variable:,impression_count,R-squared:,0.442
Model:,OLS,Adj. R-squared:,0.441
Method:,Least Squares,F-statistic:,911.5
Date:,"Sun, 14 May 2023",Prob (F-statistic):,0.0
Time:,15:49:20,Log-Likelihood:,-42253.0
No. Observations:,16151,AIC:,84540.0
Df Residuals:,16136,BIC:,84650.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-3.974e+11,4.54e+11,-0.876,0.381,-1.29e+12,4.92e+11
dummy_entertainmentleisurebusiness,-0.2885,0.102,-2.835,0.005,-0.488,-0.089
dummy_francepolitics,1.0016,0.111,9.018,0.000,0.784,1.219
dummy_jeanlucmlenchon,-0.7716,0.158,-4.878,0.000,-1.082,-0.462
dummy_nikosaliagas,-1.4667,0.119,-12.364,0.000,-1.699,-1.234
dummy_staderennaisfc,4.0929,0.124,33.038,0.000,3.850,4.336
dummy_tweet_period_afternoon,3.974e+11,4.54e+11,0.876,0.381,-4.92e+11,1.29e+12
dummy_tweet_period_morning,3.974e+11,4.54e+11,0.876,0.381,-4.92e+11,1.29e+12
dummy_tweet_period_night,3.974e+11,4.54e+11,0.876,0.381,-4.92e+11,1.29e+12

0,1,2,3
Omnibus:,268.185,Durbin-Watson:,1.424
Prob(Omnibus):,0.0,Jarque-Bera (JB):,261.841
Skew:,0.284,Prob(JB):,1.3900000000000002e-57
Kurtosis:,2.742,Cond. No.,470000000000000.0


In [57]:
regression_stats_table = res.summary().tables[1].data[1:]
header = res.summary().tables[1].data[0]

regression_stats_table = pd.DataFrame(regression_stats_table, columns=header)
regression_stats_table['coef'] = regression_stats_table['coef'].astype(float)
regression_stats_table['P>|t|'] = regression_stats_table['P>|t|'].astype(float)
regression_stats_table_significant = regression_stats_table[regression_stats_table['P>|t|'] < 0.05]
significant_regression_columns = regression_stats_table[''].values.tolist()[1:]

['dummy_celebrities',
 'dummy_entertainment',
 'dummy_entertainmentleisurebusiness',
 'dummy_francepolitics',
 'dummy_jeanlucmlenchon',
 'dummy_nikosaliagas',
 'dummy_politicalfigures',
 'dummy_soccer',
 'dummy_sports',
 'dummy_sportsfitnessbusiness',
 'dummy_staderennaisfc',
 'dummy_tvhosts',
 'dummy_tvstars',
 'dummy_tweet_period_afternoon',
 'dummy_tweet_period_morning',
 'dummy_tweet_period_night',
 'hashtags_count',
 'mentions_count',
 'tweet_external_urls_count',
 'tweet_length',
 'tweet_medias_count',
 'tweet_sentiment',
 'followers_count']

In [None]:
regressor_columns = list(filter(lambda x: x not in [
                         'tweet_external_urls_count', 'dummy_tweet_period_night', 'dummy_joemanchin', 'dummy_chrismurphy', 'dummy_financialservicesbusiness', 'dummy_inflationintheunitedstates', 'dummy_joebiden', 'dummy_politicalfigures', 'dummy_northcarolina', 'dummy_tedcruz', 'dummy_sportsfitnessbusiness', 'dummy_unitedstatescongress', 'dummy_markwarner', 'dummy_politicalnews', 'dummy_chriscoons'], regression_df_pd.columns))

regressor_columns_string = create_regressor_columns_string(
    regressor_columns)
mod = smf.ols(
    formula=f'impression_count ~ {regressor_columns_string}', data=regression_df_pd)
res = mod.fit()
print(res.summary())