In [74]:
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np
import os

In [2]:
def create_regressor_columns_string(columns):
  # Join all independent variables to define the formula used by the model
  regressor_columns = list(filter(lambda x: x != 'impression_count',columns))
  regressor_columns_string = "+".join(regressor_columns)
  return regressor_columns_string

In [121]:
def perform_regression(regression_df, significant_variables):
    # Define formula without cross terms
    regressor_columns_string = create_regressor_columns_string(significant_variables)
    # Fit model
    mod = smf.ols(formula=f'impression_count ~ {regressor_columns_string}', data=regression_df)
    res = mod.fit()
    return res

In [122]:
def keep_significant_var(p_values):
    # Filter p-values < 0.05
    p_values = p_values[p_values < 0.05]

    # Get significant variables
    significant_variables = list(p_values.index)
    # Remove intercept if significant
    try:
        significant_variables.remove('Intercept')
    except:
        pass

    return significant_variables

In [138]:
def regression(path, threshold=1e2):
    # Load data
    regression_df_pd = pd.read_csv(path)
    regression_df_pd = regression_df_pd.drop('tweet_text', axis=1)
    
    #======================#
    #   FIRST REGRESSION   #
    #======================#
    significant_variables = list(regression_df_pd.columns)
    res = perform_regression(regression_df_pd, significant_variables)
    p_values = res.pvalues
    nb_variables_before = len(p_values)
    
    #======================#
    #   SECOND REGRESSION  #
    #======================#
    # Filter out data if impression_count < threshold
    regression_df_pd = regression_df_pd[regression_df_pd['impression_count'] >= threshold].copy()

    # Perform new regression with significant variables
    res_threshold = perform_regression(regression_df_pd, significant_variables)

    #======================#
    #   THIRD REGRESSION   #
    #======================#
    # Apply log transformation to independent variables
    for var in regression_df_pd.columns:
            regression_df_pd[var] = regression_df_pd[var].apply(lambda x: np.log(1+x))

    # Perform new regression with significant variables
    res_log = perform_regression(regression_df_pd, significant_variables)
    p_values = res_log.pvalues

    #======================#
    #   FINAL RESULTS      #
    #======================#
    # Filter p-values < 0.05
    significant_variables = keep_significant_var(p_values)
    nb_variables_after = len(significant_variables)

    # Perform new regression with significant variables
    res_final = perform_regression(regression_df_pd, significant_variables)

    print(f'Number of discarded variables: {nb_variables_before - nb_variables_after}')
    print(f'Significant variables ({len(significant_variables)}): {significant_variables}')

    #======================#
    #   SAVE AS HTML       #
    #======================#
    # Save first res as html
    res_html = res.summary().as_html()
    name = path.split('/')[-1].split('.')[0][:-3]
    path_root = os.path.join(os.getcwd(), "data", "regression", "html_regression")
    path = os.path.join(path_root, name+".html")
    with open(path, 'w') as f:
        f.write(res_html)

    # Save second res as html
    res_thresh_html = res_threshold.summary().as_html()
    path_opti = os.path.join(path_root, name+"_threshold.html")
    with open(path_opti, 'w') as f:
        f.write(res_thresh_html)

    # Save third res as html
    res_log_html = res_log.summary().as_html()
    path_log = os.path.join(path_root, name+"_log.html")
    with open(path_log, 'w') as f:
        f.write(res_log_html)

    # Save final res as html
    res_final_html = res_final.summary().as_html()
    path_final = os.path.join(path_root, name+"_final.html")
    with open(path_final, 'w') as f:
        f.write(res_final_html)

    return res, res_threshold, res_log, res_final

In [139]:
path_french_celebrities = '../data/regression/french_celebrities_regression_df.csv'
res_french_celebrities = regression(path_french_celebrities)

display(res_french_celebrities[0].summary())
display(res_french_celebrities[1].summary())
display(res_french_celebrities[2].summary())
display(res_french_celebrities[3].summary())

Number of discarded variables: 9
Significant variables (15): ['dummy_entertainmentleisurebusiness', 'dummy_jeanlucmlenchon', 'dummy_nikosaliagas', 'dummy_politicalfigures', 'dummy_soccer', 'dummy_sportsfitnessbusiness', 'dummy_tweet_period_afternoon', 'dummy_tweet_period_morning', 'dummy_tweet_period_night', 'hashtags_count', 'mentions_count', 'tweet_external_urls_count', 'tweet_length', 'tweet_medias_count', 'followers_count']


0,1,2,3
Dep. Variable:,impression_count,R-squared:,0.183
Model:,OLS,Adj. R-squared:,0.182
Method:,Least Squares,F-statistic:,172.6
Date:,"Sun, 14 May 2023",Prob (F-statistic):,0.0
Time:,17:29:48,Log-Likelihood:,-221800.0
No. Observations:,16151,AIC:,443700.0
Df Residuals:,16129,BIC:,443800.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.71e+04,3213.965,-5.321,0.000,-2.34e+04,-1.08e+04
dummy_celebrities,1.24e+04,2.89e+04,0.429,0.668,-4.42e+04,6.9e+04
dummy_entertainment,-3.593e+04,8510.562,-4.222,0.000,-5.26e+04,-1.92e+04
dummy_entertainmentleisurebusiness,2.427e+04,7351.944,3.302,0.001,9862.635,3.87e+04
dummy_francepolitics,8.179e+04,1.19e+04,6.879,0.000,5.85e+04,1.05e+05
dummy_jeanlucmlenchon,-1.865e+05,1.03e+04,-18.111,0.000,-2.07e+05,-1.66e+05
dummy_nikosaliagas,-1.017e+05,7631.240,-13.326,0.000,-1.17e+05,-8.67e+04
dummy_politicalfigures,-6093.0430,9699.898,-0.628,0.530,-2.51e+04,1.29e+04
dummy_soccer,1.282e+05,2.17e+04,5.903,0.000,8.57e+04,1.71e+05

0,1,2,3
Omnibus:,42710.957,Durbin-Watson:,1.413
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1471275351.624
Skew:,31.333,Prob(JB):,0.0
Kurtosis:,1480.279,Cond. No.,9.06e+19


0,1,2,3
Dep. Variable:,impression_count,R-squared:,0.256
Model:,OLS,Adj. R-squared:,0.253
Method:,Least Squares,F-statistic:,116.5
Date:,"Sun, 14 May 2023",Prob (F-statistic):,0.0
Time:,17:29:48,Log-Likelihood:,-100730.0
No. Observations:,7151,AIC:,201500.0
Df Residuals:,7129,BIC:,201700.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1794.5796,6871.938,-0.261,0.794,-1.53e+04,1.17e+04
dummy_celebrities,6406.9282,7.13e+04,0.090,0.928,-1.33e+05,1.46e+05
dummy_entertainment,-2.613e+04,2.06e+04,-1.268,0.205,-6.65e+04,1.43e+04
dummy_entertainmentleisurebusiness,3.112e+04,1.8e+04,1.724,0.085,-4260.464,6.65e+04
dummy_francepolitics,7.381e+04,2.43e+04,3.033,0.002,2.61e+04,1.22e+05
dummy_jeanlucmlenchon,-2.003e+05,2.01e+04,-9.952,0.000,-2.4e+05,-1.61e+05
dummy_nikosaliagas,-1.658e+05,1.94e+04,-8.542,0.000,-2.04e+05,-1.28e+05
dummy_politicalfigures,3800.6064,2.03e+04,0.187,0.852,-3.6e+04,4.36e+04
dummy_soccer,2.278e+05,4.49e+04,5.075,0.000,1.4e+05,3.16e+05

0,1,2,3
Omnibus:,16283.414,Durbin-Watson:,1.512
Prob(Omnibus):,0.0,Jarque-Bera (JB):,149286541.979
Skew:,21.629,Prob(JB):,0.0
Kurtosis:,709.513,Cond. No.,9.5e+20


0,1,2,3
Dep. Variable:,impression_count,R-squared:,0.615
Model:,OLS,Adj. R-squared:,0.613
Method:,Least Squares,F-statistic:,541.1
Date:,"Sun, 14 May 2023",Prob (F-statistic):,0.0
Time:,17:29:48,Log-Likelihood:,-11870.0
No. Observations:,7151,AIC:,23780.0
Df Residuals:,7129,BIC:,23940.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-3.3186,0.178,-18.680,0.000,-3.667,-2.970
dummy_celebrities,-0.1320,0.413,-0.320,0.749,-0.941,0.677
dummy_entertainment,0.1190,0.120,0.995,0.320,-0.115,0.354
dummy_entertainmentleisurebusiness,0.7316,0.105,6.991,0.000,0.526,0.937
dummy_francepolitics,0.0567,0.138,0.409,0.682,-0.215,0.328
dummy_jeanlucmlenchon,-0.7481,0.124,-6.047,0.000,-0.991,-0.506
dummy_nikosaliagas,-1.9281,0.123,-15.739,0.000,-2.168,-1.688
dummy_politicalfigures,0.6004,0.118,5.093,0.000,0.369,0.831
dummy_soccer,0.7371,0.259,2.848,0.004,0.230,1.244

0,1,2,3
Omnibus:,179.116,Durbin-Watson:,1.416
Prob(Omnibus):,0.0,Jarque-Bera (JB):,294.213
Skew:,0.232,Prob(JB):,1.3e-64
Kurtosis:,3.879,Cond. No.,1.36e+16


0,1,2,3
Dep. Variable:,impression_count,R-squared:,0.614
Model:,OLS,Adj. R-squared:,0.613
Method:,Least Squares,F-statistic:,810.0
Date:,"Sun, 14 May 2023",Prob (F-statistic):,0.0
Time:,17:29:48,Log-Likelihood:,-11877.0
No. Observations:,7151,AIC:,23780.0
Df Residuals:,7136,BIC:,23890.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-3.1673,0.165,-19.152,0.000,-3.491,-2.843
dummy_entertainmentleisurebusiness,0.7801,0.085,9.200,0.000,0.614,0.946
dummy_jeanlucmlenchon,-0.6761,0.115,-5.881,0.000,-0.901,-0.451
dummy_nikosaliagas,-1.8192,0.118,-15.464,0.000,-2.050,-1.589
dummy_politicalfigures,0.6753,0.073,9.259,0.000,0.532,0.818
dummy_soccer,0.6963,0.155,4.503,0.000,0.393,0.999
dummy_sportsfitnessbusiness,0.3082,0.143,2.148,0.032,0.027,0.589
dummy_tweet_period_afternoon,-0.7250,0.050,-14.603,0.000,-0.822,-0.628
dummy_tweet_period_morning,-0.7320,0.051,-14.292,0.000,-0.832,-0.632

0,1,2,3
Omnibus:,175.389,Durbin-Watson:,1.411
Prob(Omnibus):,0.0,Jarque-Bera (JB):,296.625
Skew:,0.218,Prob(JB):,3.88e-65
Kurtosis:,3.898,Cond. No.,6410000000000000.0
