# Ols

In [2]:
from ols import *
THRESHOLD = 1 # Threshold to remove outliers

def process_country2region(data):
    country_groups = {
    'North America': ['CANADA', 'UNITED STATES'],
    'Central and South America': ['MEXICO', 'BRAZIL', 'CHILE', 'ECUADOR'],
    'North Europe': ['NORWAY', 'FINLAND', 'DENMARK', 'SWEDEN',  'LITHUANIA', 'LATVIA', 'ESTONIA', 'UNITED KINGDOM', 'IRELAND'],
    'South Europe': ['ITALY', 'SPAIN', 'PORTUGAL', 'GREECE', 'CROATIA', 'CYPRUS', 'SLOVENIA', 'GIBRALTAR', 'MALTA', 'ALBANIA'],
    'East Europe': ['CZECH REPUBLIC', 'POLAND', 'BOSNIA-HERZEGOVINA',  'ROMANIA', 'HUNGARY', 'BULGARIA', 'SLOVAKIA', 'SERBIA', 'GEORGIA', 'KOSOVO', 'KOSOVO (*)', 'UKRAINE', 'RUSSIA, FEDERATION OF', 'NORTH MACEDONIA'],
    'West Europe': ['AUSTRIA', 'GERMANY', 'SWITZERLAND', 'BELGIUM', 'LUXEMBOURG', 'FRANCE', 'NETHERLANDS', 'MONACO'],
    'East Asia': ['JAPAN', 'CHINA', 'HONG KONG', 'TAIWAN', 'KOREA, REPUBLIC OF', 'VIETNAM', 'MALAYSIA', 'THAILAND'],
    'South Asia' : ['INDIA'],
    'West Asia' : ['TURKEY'],
    'Middle east' : ['UNITED ARAB EMIRATES', 'QATAR', 'ISRAEL', 'LEBANON', 'PALESTINE (*)'],
    'Oeania' : [ 'SINGAPORE', 'AUSTRALIA','NEW ZEALAND', 'INDONESIA'],
    'Other': ['LIBERIA', 'RUANDA', 'SOUTH AFRICA', 'BERMUDA', 'GHANA', 'KAZAKHSTAN', 'EGYPT', 'SENEGAL', 'KENYA', 'MAYOTTE', 'MOROCCO', 'TUNISIA', 'NIGERIA', 'TOGO', 'GABON']
    }
    country_aff = {}
    for region in country_groups.keys():
        for c in country_groups[region]:
            country_aff [c] = region
    data['TR Country'] = data['TR Country'].apply(lambda x : country_aff[x])
    # Remove outliers
    data = remove_outliers(data, 'TR Country', THRESHOLD)
    # One hot encoding
    data = one_hot_encod(data, 'TR Country', 'East Europe')
    
    return data

def process_coutry_eu_member(data):
    eu_members = {'AUSTRIA', 'GERMANY', 'BELGIUM', 'LUXEMBOURG', 'FRANCE', 'NETHERLANDS',
                   'CZECH REPUBLIC', 'POLAND',   'ROMANIA', 'HUNGARY', 'BULGARIA', 'SLOVAKIA',
                   'ITALY', 'SPAIN', 'PORTUGAL', 'GREECE', 'CROATIA', 'CYPRUS', 'SLOVENIA', 'MALTA',
                   'FINLAND', 'DENMARK', 'SWEDEN',  'LITHUANIA', 'LATVIA', 'ESTONIA', 'IRELAND' }
    data['TR Country'] = data['TR Country'].apply(lambda x : 'Head of office in EU country' if x in eu_members else 'N')
    # Remove outliers
    data = remove_outliers(data, 'TR Country', THRESHOLD)
    # One hot encoding
    data = one_hot_encod(data, 'TR Country', 'N')
    return data

def process_category(data):
    # Remove outliers
    data = remove_outliers(data, 'Category of registration', THRESHOLD)
    # One hot encoding
    data = one_hot_encod(data, 'Category of registration', 'Companies and groups')
    return data

def process_sectors(data):
    # Remove outliers
    data = remove_outliers(data, 'NACE', THRESHOLD)
    # One hot encoding
    data = one_hot_encod(data, 'NACE', 'C - Manufacturing')
    return data

def process_data(data, traget, variables):
    basetable = data[[target] + variables].dropna()

    # Get columns with similar values
    sim_val_cols = get_similar_value_cols(basetable, percent=80)
    basetable = basetable.drop(columns = sim_val_cols)

    #Process categorical variables
    # if 'TR Country' in basetable.columns:
    #     basetable = process_country2region(basetable)
    if 'TR Country' in basetable.columns:
        basetable = process_coutry_eu_member(basetable)
    if 'NACE' in basetable.columns:
        basetable = process_sectors(basetable)
    if 'Category of registration' in basetable.columns:
        basetable = process_category(basetable)

    # Transform skewed numerical data
    columns = list(get_skewed_columns(basetable.select_dtypes(include='number')).index)
    basetable = transform_skewed(basetable, columns,'log')

    #Normalize data
    basetable = pd.DataFrame(preprocessing.StandardScaler().fit_transform(basetable) , columns = basetable.columns, index = basetable.index)

    variables = [col for col in basetable.columns if col != target]
    return basetable[target], basetable[variables]



In [6]:
lobbying = ['Members FTE']
financials = ['Assets']
country = ['TR Country']
sector = ['NACE']
level = ['Level European', 'Level Regional/Local', 'Level Global',
       'Level National']
category = ['Category of registration']

data  = pd.read_csv('../out/reg_data.csv')
#data['Hypercoreness_sub']= data['Hypercoreness_sub'].fillna(0)
target = 'Hypercoreness_False'
variables = lobbying  + financials + level + country + sector
y, X = process_data(data, target, variables )
results = sm.OLS(y ,X.assign(const = 1)).fit()
print(results.summary())

Total columns with majority singular value shares:  1 ['Level Regional/Local']
removed category:  []
number of outliers:  0
removed category:  ['S - Other service activities', 'E - Water supply, sewerage, waste management and remediation activities', 'R - Arts, entertainment and recreation', 'I - Accommodation and food service activities', 'L - Real estate activities', 'F - Construction', 'O - Public administration and defence, compulsory social security', 'A - Agriculture, forestry and fishing', 'P - Education', 'Q - Human health and social work activities']
number of outliers:  61
                             OLS Regression Results                            
Dep. Variable:     Hypercoreness_False   R-squared:                       0.398
Model:                             OLS   Adj. R-squared:                  0.390
Method:                  Least Squares   F-statistic:                     54.81
Date:                 Thu, 17 Jul 2025   Prob (F-statistic):          2.40e-117
Time:     

In [13]:
#data['Hypercoreness_sub_False']= data['Hypercoreness_sub_False'].fillna(0)

df_regressions = pd.DataFrame()
variables = lobbying + financials +sector +level + country
#variables = category
for target in  ['Hypercoreness_False', 'Hypercoreness_sub_False']:
    variables = lobbying  +level + country
    y, X = process_data(data, target, variables )
    results = sm.OLS(y ,X.assign(const = 1)).fit()
    print(results.summary())

    df = pd.DataFrame(results.params.round(3), columns = [target])
    df.loc[(0.01<results.pvalues) & (results.pvalues <0.05)] = df.astype(str) + "*"
    df.loc[(0.001<results.pvalues) & (results.pvalues <0.01)] = df.astype(str) + "**"
    df.loc[results.pvalues <0.001] = df.astype(str) + "***"
    df.loc[results.pvalues >0.05] = df.astype(str) +  ' '
        # Add extra rows for statistics
    df.loc['Observations'] = int(results.nobs)
    df.loc['R-squared'] = str(results.rsquared.round(3))
    df.loc['AIC'] = str(results.aic.round(3))
    df.loc['BIC'] = str(results.bic.round(3))

    df_regressions = pd.concat([df_regressions, df], axis = 1)

df_regressions.columns = df_regressions.columns.str.replace('_', ' ')
print(df_regressions.to_latex(na_rep = ''))

Total columns with majority singular value shares:  1 ['Level Regional/Local']
removed category:  []
number of outliers:  0
                             OLS Regression Results                            
Dep. Variable:     Hypercoreness_False   R-squared:                       0.213
Model:                             OLS   Adj. R-squared:                  0.212
Method:                  Least Squares   F-statistic:                     280.3
Date:                 Thu, 17 Jul 2025   Prob (F-statistic):          4.55e-266
Time:                         17:25:51   Log-Likelihood:                -6736.3
No. Observations:                 5185   AIC:                         1.348e+04
Df Residuals:                     5179   BIC:                         1.352e+04
Df Model:                            5                                         
Covariance Type:             nonrobust                                         
                                   coef    std err          t      P>|t|    

  df.loc[results.pvalues <0.001] = df.astype(str) + "***"
  df.loc[(0.01<results.pvalues) & (results.pvalues <0.05)] = df.astype(str) + "*"
