# REGRESSION

In [35]:
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

sns.set_style('darkgrid')

## HELPER FUNCTIONS

In [2]:
def create_regressor_columns_string(columns):
  # Join all independent variables to define the formula used by the model
  regressor_columns = list(filter(lambda x: x != 'impression_count',columns))
  regressor_columns_string = "+".join(regressor_columns)
  return regressor_columns_string

In [3]:
def perform_regression(regression_df, significant_variables):
    # Define formula without cross terms
    regressor_columns_string = create_regressor_columns_string(significant_variables)
    # Fit model
    mod = smf.ols(formula=f'impression_count ~ {regressor_columns_string}', data=regression_df)
    res = mod.fit()
    return res

In [None]:
def keep_significant_var(p_values):
    # Filter p-values < 0.05
    p_values = p_values[p_values < 0.05]

    # Get significant variables
    significant_variables = list(p_values.index)
    # Remove intercept if significant
    try:
        significant_variables.remove('Intercept')
    except:
        pass

    return significant_variables

## REGRESSIONS WITH LOG TRANSFORMED VARIABLES AND THRESHOLDING

In [8]:
def regression(path, threshold=1e2):
    # Load data
    regression_df_pd = pd.read_csv(path)
    regression_df_pd = regression_df_pd.drop('tweet_text', axis=1)
    
    #======================#
    #   FIRST REGRESSION   #
    #======================#
    significant_variables = list(regression_df_pd.columns)
    res = perform_regression(regression_df_pd, significant_variables)
    p_values = res.pvalues
    nb_variables_before = len(p_values)
    
    #======================#
    #   SECOND REGRESSION  #
    #======================#
    # Filter out data if impression_count < threshold
    regression_df_pd = regression_df_pd[regression_df_pd['impression_count'] >= threshold].copy()

    # Perform new regression with significant variables
    res_threshold = perform_regression(regression_df_pd, significant_variables)

    #======================#
    #   THIRD REGRESSION   #
    #======================#
    # Apply log transformation to independent variables
    for var in regression_df_pd.columns:
            regression_df_pd[var] = regression_df_pd[var].apply(lambda x: np.log(1+x))

    # Perform new regression with significant variables
    res_log = perform_regression(regression_df_pd, significant_variables)
    p_values = res_log.pvalues

    #======================#
    #   FINAL RESULTS      #
    #======================#
    # Filter p-values < 0.05
    significant_variables = keep_significant_var(p_values)
    nb_variables_after = len(significant_variables)

    # Perform new regression with significant variables
    res_final = perform_regression(regression_df_pd, significant_variables)

    print(f'Number of discarded variables: {nb_variables_before - nb_variables_after}')
    print(f'Significant variables ({len(significant_variables)}): {significant_variables}')

    #======================#
    #   SAVE AS HTML       #
    #======================#
    # Save first res as html
    res_html = res.summary().as_html()
    name = path.split('/')[-1].split('.')[0][:-3]
    path_root = os.path.join('/'.join(os.getcwd().split('/')[:-1]), "data", "regression", "html_regression")
    path = os.path.join(path_root, name+".html")
    with open(path, 'w') as f:
        f.write(res_html)

    # Save second res as html
    res_thresh_html = res_threshold.summary().as_html()
    path_opti = os.path.join(path_root, name+"_threshold.html")
    with open(path_opti, 'w') as f:
        f.write(res_thresh_html)

    # Save third res as html
    res_log_html = res_log.summary().as_html()
    path_log = os.path.join(path_root, name+"_log.html")
    with open(path_log, 'w') as f:
        f.write(res_log_html)

    # Save final res as html
    res_final_html = res_final.summary().as_html()
    path_final = os.path.join(path_root, name+"_final.html")
    with open(path_final, 'w') as f:
        f.write(res_final_html)

    return res, res_threshold, res_log, res_final

In [14]:
path_data = '../data/regression/american_celebrities_regression_df.csv'
res_reg = regression(path_data)

display(res_reg[0].summary())
display(res_reg[1].summary())
display(res_reg[2].summary())
display(res_reg[3].summary())

Number of discarded variables: 22
Significant variables (34): ['dummy_americanfootball', 'dummy_barstoolsports', 'dummy_basketball', 'dummy_businesspersonalities', 'dummy_competitionshows', 'dummy_events', 'dummy_foodbeveragebusiness', 'dummy_gamingbusiness', 'dummy_johncusack', 'dummy_kenjeong', 'dummy_lebronjames', 'dummy_moviestv', 'dummy_nbaplayers', 'dummy_pftcommenter', 'dummy_politicalfigures', 'dummy_pop', 'dummy_sports', 'dummy_sportsfigures', 'dummy_sportsicons', 'dummy_techpersonalities', 'dummy_television', 'dummy_thomassanders', 'dummy_tvmoviesrelatedentertainment', 'dummy_tweet_period_afternoon', 'dummy_tweet_period_morning', 'dummy_tweet_period_night', 'dummy_wizkhalifa', 'dummy_youtubers', 'hashtags_count', 'mentions_count', 'tweet_external_urls_count', 'tweet_length', 'tweet_medias_count', 'followers_count']


0,1,2,3
Dep. Variable:,impression_count,R-squared:,0.124
Model:,OLS,Adj. R-squared:,0.122
Method:,Least Squares,F-statistic:,70.14
Date:,"Thu, 25 May 2023",Prob (F-statistic):,0.0
Time:,09:19:41,Log-Likelihood:,-438310.0
No. Observations:,26825,AIC:,876700.0
Df Residuals:,26770,BIC:,877200.0
Df Model:,54,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,9.923e+04,4.11e+04,2.413,0.016,1.86e+04,1.8e+05
dummy_americanfootball,7.472e+05,1.29e+05,5.798,0.000,4.95e+05,1e+06
dummy_barstoolsports,1.208e+06,1.01e+06,1.197,0.231,-7.69e+05,3.18e+06
dummy_basketball,-5.867e+04,2.06e+05,-0.284,0.776,-4.63e+05,3.46e+05
dummy_businessfinance,-2.551e+05,3.52e+05,-0.726,0.468,-9.44e+05,4.34e+05
dummy_businesspersonalities,-3.883e+05,1.36e+05,-2.852,0.004,-6.55e+05,-1.21e+05
dummy_celebrities,-2.402e+05,8.44e+04,-2.846,0.004,-4.06e+05,-7.48e+04
dummy_competitionshows,-1.92e+05,1.91e+05,-1.004,0.315,-5.67e+05,1.83e+05
dummy_digitalcreators,-2.685e+05,3.61e+05,-0.745,0.456,-9.75e+05,4.38e+05

0,1,2,3
Omnibus:,56983.358,Durbin-Watson:,1.926
Prob(Omnibus):,0.0,Jarque-Bera (JB):,340487068.426
Skew:,18.603,Prob(JB):,0.0
Kurtosis:,553.677,Cond. No.,1.27e+18


0,1,2,3
Dep. Variable:,impression_count,R-squared:,0.137
Model:,OLS,Adj. R-squared:,0.134
Method:,Least Squares,F-statistic:,49.07
Date:,"Thu, 25 May 2023",Prob (F-statistic):,0.0
Time:,09:19:41,Log-Likelihood:,-278370.0
No. Observations:,16812,AIC:,556900.0
Df Residuals:,16757,BIC:,557300.0
Df Model:,54,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.454e+05,6.42e+04,2.266,0.023,1.96e+04,2.71e+05
dummy_americanfootball,7.251e+05,1.89e+05,3.835,0.000,3.55e+05,1.1e+06
dummy_barstoolsports,1.903e+06,1.54e+06,1.237,0.216,-1.11e+06,4.92e+06
dummy_basketball,-2.605e+04,3.34e+05,-0.078,0.938,-6.81e+05,6.29e+05
dummy_businessfinance,-2.604e+05,5.01e+05,-0.520,0.603,-1.24e+06,7.22e+05
dummy_businesspersonalities,-4.25e+05,1.87e+05,-2.276,0.023,-7.91e+05,-5.89e+04
dummy_celebrities,-1.282e+05,1.21e+05,-1.059,0.290,-3.65e+05,1.09e+05
dummy_competitionshows,-1.42e+05,3.01e+05,-0.472,0.637,-7.32e+05,4.48e+05
dummy_digitalcreators,-2.317e+05,5.62e+05,-0.412,0.680,-1.33e+06,8.7e+05

0,1,2,3
Omnibus:,32085.42,Durbin-Watson:,1.915
Prob(Omnibus):,0.0,Jarque-Bera (JB):,83652872.871
Skew:,14.738,Prob(JB):,0.0
Kurtosis:,347.311,Cond. No.,1.32e+18


0,1,2,3
Dep. Variable:,impression_count,R-squared:,0.628
Model:,OLS,Adj. R-squared:,0.627
Method:,Least Squares,F-statistic:,524.2
Date:,"Thu, 25 May 2023",Prob (F-statistic):,0.0
Time:,09:19:41,Log-Likelihood:,-31155.0
No. Observations:,16812,AIC:,62420.0
Df Residuals:,16757,BIC:,62850.0
Df Model:,54,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.0012,0.143,-7.020,0.000,-1.281,-0.722
dummy_americanfootball,1.3086,0.112,11.635,0.000,1.088,1.529
dummy_barstoolsports,6.0492,0.913,6.629,0.000,4.261,7.838
dummy_basketball,0.8883,0.198,4.483,0.000,0.500,1.277
dummy_businessfinance,0.5657,0.297,1.904,0.057,-0.017,1.148
dummy_businesspersonalities,-1.3005,0.111,-11.708,0.000,-1.518,-1.083
dummy_celebrities,-2.353e-05,0.073,-0.000,1.000,-0.144,0.144
dummy_competitionshows,-0.4762,0.179,-2.664,0.008,-0.827,-0.126
dummy_digitalcreators,-0.1045,0.333,-0.313,0.754,-0.758,0.549

0,1,2,3
Omnibus:,263.718,Durbin-Watson:,1.382
Prob(Omnibus):,0.0,Jarque-Bera (JB):,406.094
Skew:,-0.165,Prob(JB):,6.569999999999999e-89
Kurtosis:,3.686,Cond. No.,1370000000000000.0


0,1,2,3
Dep. Variable:,impression_count,R-squared:,0.625
Model:,OLS,Adj. R-squared:,0.625
Method:,Least Squares,F-statistic:,849.0
Date:,"Thu, 25 May 2023",Prob (F-statistic):,0.0
Time:,09:19:41,Log-Likelihood:,-31216.0
No. Observations:,16812,AIC:,62500.0
Df Residuals:,16778,BIC:,62760.0
Df Model:,33,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.1871,0.134,-8.849,0.000,-1.450,-0.924
dummy_americanfootball,1.2657,0.112,11.279,0.000,1.046,1.486
dummy_barstoolsports,6.0548,0.915,6.617,0.000,4.261,7.848
dummy_basketball,1.1006,0.115,9.592,0.000,0.876,1.325
dummy_businesspersonalities,-1.2675,0.104,-12.165,0.000,-1.472,-1.063
dummy_competitionshows,-0.3885,0.151,-2.572,0.010,-0.685,-0.092
dummy_events,0.2646,0.097,2.719,0.007,0.074,0.455
dummy_foodbeveragebusiness,0.3013,0.105,2.860,0.004,0.095,0.508
dummy_gamingbusiness,0.3223,0.113,2.845,0.004,0.100,0.544

0,1,2,3
Omnibus:,247.162,Durbin-Watson:,1.378
Prob(Omnibus):,0.0,Jarque-Bera (JB):,371.254
Skew:,-0.163,Prob(JB):,2.42e-81
Kurtosis:,3.651,Cond. No.,1570000000000000.0


## COEFFICIENTS PLOTS

In [191]:
# Assign significant levels stars to each variable name
def assign_stars(row):
    p_value = float(row['P>|t|'])
    name = row[0].replace('_', ' ').title()
    if p_value <= 0.01:
        return name + ' ***'
    elif p_value <= 0.05:
        return name + ' **'
    elif p_value <= 0.1:
        return name + ' *'
    else:
        return name

In [192]:
# Define function to output plot of the model coefficients

def coefplot(results):
    ### PREPARE DATA FOR PLOTTING
    # Create dataframe of results summary 
    coef_df = pd.DataFrame(results.summary().tables[1].data)
    
    # Add column names and drop the extra row with column labels
    coef_df.columns = coef_df.iloc[0]
    coef_df=coef_df.drop(0)

    # Rename column 0 and append * ** or *** for significance levels
    coef_df["index"] = coef_df.apply(lambda x: assign_stars(x), axis=1)
    coef_df = coef_df.drop(coef_df.columns[0], axis=1)

    # Set index to variable names
    coef_df = coef_df.set_index(coef_df["index"])
    display(coef_df)

    # Change datatype from object to float
    coef_df = coef_df.astype(float)

    # Get errors; (coef - lower bound of conf interval)
    errors = coef_df['coef'] - coef_df['[0.025']
    coef_df['errors'] = errors

    # Sort values by coef ascending
    coef_df = coef_df.sort_values(by=['coef'])

    ### PLOT COEFFICIENTS
    variables = list(coef_df.index.values)
    coef_df['variables'] = variables

    # Define figure, axes, and plot
    fig, ax = plt.subplots(figsize=(15, 10))
    
    # Error bars for 95% confidence interval
    # Can increase capsize to add whiskers
    coef_df.plot(x='variables', y='coef', kind='bar', ax=ax, color='none', fontsize=15, ecolor='steelblue', capsize=0, yerr='errors', legend=False)
    
    # Coefficients
    ax.scatter(x=np.arange(coef_df.shape[0]), marker='o', s=80, y=coef_df['coef'], color='steelblue')
    
    # Line to define zero on the y-axis
    ax.axhline(y=0, linestyle='--', color='red', linewidth=1)
    
    # Set title & labels
    #plt.title('Coefficients of Features - 95% Confidence Intervals',fontsize=20)
    ax.set_ylabel('Coefficients',fontsize=15)
    ax.set_xlabel('',fontsize=15)

    # Rotate y ticks and move to the right side
    ax.yaxis.tick_right()
    plt.yticks(rotation=90, fontsize=15)
    
    return plt.show()

In [193]:
coefplot(res_reg[2])

Unnamed: 0_level_0,coef,std err,t,P>|t|,[0.025,0.975],index
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Intercept ***,-1.0012,0.143,-7.02,0.0,-1.281,-0.722,Intercept ***
Dummy Americanfootball ***,1.3086,0.112,11.635,0.0,1.088,1.529,Dummy Americanfootball ***
Dummy Barstoolsports ***,6.0492,0.913,6.629,0.0,4.261,7.838,Dummy Barstoolsports ***
Dummy Basketball ***,0.8883,0.198,4.483,0.0,0.5,1.277,Dummy Basketball ***
Dummy Businessfinance *,0.5657,0.297,1.904,0.057,-0.017,1.148,Dummy Businessfinance *
Dummy Businesspersonalities ***,-1.3005,0.111,-11.708,0.0,-1.518,-1.083,Dummy Businesspersonalities ***
Dummy Celebrities,-2.353e-05,0.073,-0.0,1.0,-0.144,0.144,Dummy Celebrities
Dummy Competitionshows ***,-0.4762,0.179,-2.664,0.008,-0.827,-0.126,Dummy Competitionshows ***
Dummy Digitalcreators,-0.1045,0.333,-0.313,0.754,-0.758,0.549,Dummy Digitalcreators
Dummy Elonmusk,1.7524,1.145,1.53,0.126,-0.492,3.997,Dummy Elonmusk


ValueError: could not convert string to float: 'Intercept ***'