In [74]:
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np
import os

In [2]:
def create_regressor_columns_string(columns):
  # Join all independent variables to define the formula used by the model
  regressor_columns = list(filter(lambda x: x != 'impression_count',columns))
  regressor_columns_string = "+".join(regressor_columns)
  return regressor_columns_string

In [79]:
def regression(path):
    # Load data
    regression_df_pd = pd.read_csv(path)
    regression_df_pd = regression_df_pd.drop('tweet_text', axis=1)
    
    # Apply log transformation to independent variables
    for var in regression_df_pd.columns:
            regression_df_pd[var] = regression_df_pd[var].apply(lambda x: np.log(1+x))

    # Define formula without cross terms
    regressor_columns_string = create_regressor_columns_string(regression_df_pd.columns)

    # Fit model
    mod = smf.ols(formula=f'impression_count ~ {regressor_columns_string}', data=regression_df_pd)
    res = mod.fit()

    # Get p-values
    p_values = res.pvalues
    nb_variables_before = len(p_values)
    
    # Filter p-values < 0.05
    p_values = p_values[p_values < 0.05]
    nb_variables_after = len(p_values)

    # Get significant variables
    significant_variables = list(p_values.index)
    # Remove intercept if significant
    try:
        significant_variables.remove('Intercept')
    except:
        pass

    print(f'Number of discarded variables: {nb_variables_before - nb_variables_after}')
    print(f'Significant variables ({len(significant_variables)}): {significant_variables}')

    # Perform new regression with significant variables
    regressor_columns_string_new = create_regressor_columns_string(significant_variables)
    mod_new = smf.ols(formula=f'impression_count ~ {regressor_columns_string_new}', data=regression_df_pd)
    res_new = mod_new.fit()

    # Save res as html
    res_html = res.summary().as_html()
    name = path.split('/')[-1].split('.')[0][:-3]
    path = os.path.join("../data/regression/html_regression", name, ".html")
    with open(path, 'w') as f:
        f.write(res_html)

    # Save res_new as html
    res_new_html = res_new.summary().as_html()
    path_opti = os.path.join("../data/regression/html_regression", name+"_opti.html")
    with open(path_opti, 'w') as f:
        f.write(res_html)

    return res, res_new

In [80]:
path_french_celebrities = '../data/regression/french_celebrities_regression_df.csv'
res_french_celebrities = regression(path_french_celebrities)

display(res_french_celebrities[0].summary())
display(res_french_celebrities[1].summary())

Number of discarded variables: 10
Significant variables (14): ['dummy_entertainment', 'dummy_entertainmentleisurebusiness', 'dummy_francepolitics', 'dummy_jeanlucmlenchon', 'dummy_nikosaliagas', 'dummy_politicalfigures', 'dummy_staderennaisfc', 'hashtags_count', 'mentions_count', 'tweet_external_urls_count', 'tweet_length', 'tweet_medias_count', 'tweet_sentiment', 'followers_count']


FileNotFoundError: [Errno 2] No such file or directory: '../data/regression/html_regression/french_celebrities_regression/.html'

In [63]:
path_us_celebrities = '../data/regression/american_celebrities_regression_df.csv'
res_french_celebrities = regression(path_us_celebrities)

display(res_french_celebrities[0].summary())
display(res_french_celebrities[1].summary())

Number of discarded variables: 19
Significant variables (36): ['dummy_americanfootball', 'dummy_barstoolsports', 'dummy_businesspersonalities', 'dummy_celebrities', 'dummy_competitionshows', 'dummy_entertainment', 'dummy_events', 'dummy_famouscomedians', 'dummy_gamingbusiness', 'dummy_jaketapper', 'dummy_johncusack', 'dummy_kenjeong', 'dummy_lebronjames', 'dummy_music', 'dummy_nba', 'dummy_nbabasketball', 'dummy_nbaplayers', 'dummy_news', 'dummy_pftcommenter', 'dummy_politics', 'dummy_pop', 'dummy_sportsfigures', 'dummy_talkshows', 'dummy_technologybusiness', 'dummy_thomassanders', 'dummy_tvmoviesrelatedentertainment', 'dummy_tweet_period_afternoon', 'dummy_tweet_period_night', 'dummy_wizkhalifa', 'dummy_youtubers', 'mentions_count', 'tweet_external_urls_count', 'tweet_length', 'tweet_medias_count', 'tweet_sentiment', 'followers_count']


0,1,2,3
Dep. Variable:,impression_count,R-squared:,0.549
Model:,OLS,Adj. R-squared:,0.548
Method:,Least Squares,F-statistic:,603.0
Date:,"Sun, 14 May 2023",Prob (F-statistic):,0.0
Time:,16:08:01,Log-Likelihood:,-73150.0
No. Observations:,26825,AIC:,146400.0
Df Residuals:,26770,BIC:,146900.0
Df Model:,54,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0914,0.275,3.975,0.000,0.553,1.630
dummy_americanfootball,2.6065,0.228,11.408,0.000,2.159,3.054
dummy_barstoolsports,6.1525,1.783,3.451,0.001,2.658,9.647
dummy_basketball,0.4776,0.365,1.310,0.190,-0.237,1.192
dummy_businessfinance,-0.2959,0.621,-0.477,0.634,-1.513,0.921
dummy_businesspersonalities,1.3596,0.241,5.643,0.000,0.887,1.832
dummy_celebrities,2.0258,0.151,13.452,0.000,1.731,2.321
dummy_competitionshows,0.7256,0.338,2.144,0.032,0.062,1.389
dummy_digitalcreators,-0.0541,0.637,-0.085,0.932,-1.303,1.195

0,1,2,3
Omnibus:,51.717,Durbin-Watson:,1.436
Prob(Omnibus):,0.0,Jarque-Bera (JB):,52.333
Skew:,0.101,Prob(JB):,4.32e-12
Kurtosis:,3.08,Cond. No.,1390000000000000.0


0,1,2,3
Dep. Variable:,impression_count,R-squared:,0.546
Model:,OLS,Adj. R-squared:,0.546
Method:,Least Squares,F-statistic:,895.3
Date:,"Sun, 14 May 2023",Prob (F-statistic):,0.0
Time:,16:08:01,Log-Likelihood:,-73229.0
No. Observations:,26825,AIC:,146500.0
Df Residuals:,26788,BIC:,146800.0
Df Model:,36,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5314,0.311,1.708,0.088,-0.078,1.141
dummy_americanfootball,2.5510,0.200,12.779,0.000,2.160,2.942
dummy_barstoolsports,6.1596,1.787,3.447,0.001,2.658,9.662
dummy_businesspersonalities,2.3488,0.206,11.414,0.000,1.945,2.752
dummy_celebrities,1.8885,0.145,13.044,0.000,1.605,2.172
dummy_competitionshows,0.9518,0.201,4.732,0.000,0.558,1.346
dummy_entertainment,-0.9081,0.140,-6.504,0.000,-1.182,-0.634
dummy_events,0.5926,0.190,3.116,0.002,0.220,0.965
dummy_famouscomedians,0.4533,0.136,3.343,0.001,0.188,0.719

0,1,2,3
Omnibus:,40.976,Durbin-Watson:,1.432
Prob(Omnibus):,0.0,Jarque-Bera (JB):,41.164
Skew:,0.093,Prob(JB):,1.15e-09
Kurtosis:,3.048,Cond. No.,1760.0
