In [1]:
!git clone https://github.com/anonymousindividual007/Multi-environment-Topic-Models

Cloning into 'Multi-environment-Topic-Models'...
remote: Enumerating objects: 72, done.[K
remote: Counting objects: 100% (72/72), done.[K
remote: Compressing objects: 100% (72/72), done.[K
remote: Total 72 (delta 35), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (72/72), 37.16 MiB | 8.04 MiB/s, done.
Resolving deltas: 100% (35/35), done.
Updating files: 100% (13/13), done.


In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import statsmodels.api as sm
import zipfile
import os

In [3]:
zip_file_path_1 = '/content/Multi-environment-Topic-Models/VTM_train_topic_proportions_lda.csv.zip'
csv_file_name_1 = 'VTM_train_topic_proportions_lda.csv'

zip_file_path_2 = '/content/Multi-environment-Topic-Models/eatm2_topic_proportions.csv.zip'
csv_file_name_2 = 'eatm2_topic_proportions.csv'

zip_file_path_3 = '/content/Multi-environment-Topic-Models/ideology_energy_ci.csv.zip'
csv_file_name_3 = 'ideology_energy_ci.csv'

temp_dir = '/content/temp_dir'

if not os.path.exists(temp_dir):
    os.makedirs(temp_dir)

with zipfile.ZipFile(zip_file_path_1, 'r') as zip_ref:
    zip_ref.extract(csv_file_name_1, temp_dir)

with zipfile.ZipFile(zip_file_path_2, 'r') as zip_ref:
    zip_ref.extract(csv_file_name_2, temp_dir)

with zipfile.ZipFile(zip_file_path_3, 'r') as zip_ref:
    zip_ref.extract(csv_file_name_3, temp_dir)

csv_file_path_1 = os.path.join(temp_dir, csv_file_name_1)
csv_file_path_2 = os.path.join(temp_dir, csv_file_name_2)
csv_file_path_3 = os.path.join(temp_dir, csv_file_name_3)

vtm_topics = pd.read_csv(csv_file_path_1)
eatm2_topic = pd.read_csv(csv_file_path_2)
data = pd.read_csv(csv_file_path_3)

In [4]:
eatm2_topic = eatm2_topic.drop(columns=['Unnamed: 0'])

In [5]:
def process_row_eatm2(row):
    if row['Topic_28'] == max(row):
        return 1
    else:
        return 0

eatm2_topic['binary 28'] = eatm2_topic.apply(process_row_eatm2, axis=1)

In [6]:
def process_row_vtm_15(row):
    if row['15'] == max(row):
        return 1
    else:
        return 0
vtm_topics['binary 15'] = vtm_topics.apply(process_row_vtm_15, axis=1)

In [7]:
def process_row_vtm(row):
    # Calculate the maximum value among all densities
    max_value = max(row)
    # Check if the maximum value corresponds to either '15' or '21'
    if row['15'] == max_value or row['21'] == max_value:
        return 1
    else:
        return 0

vtm_topics['binary 15 and 21'] = vtm_topics.apply(process_row_vtm, axis=1)

In [8]:
# Create an indicator variable for the 'source' column
data['source_indicator'] = data['source'].apply(lambda x: 1 if x == 'Republican' else 0)

In [9]:
data

Unnamed: 0.1,Unnamed: 0,text,source,is_energy,source_indicator
0,0,"One Mark Critts is plenty, but there seems to...",Republican,False,1
1,1,His name is Matt Shaner. He's a successful bu...,Republican,False,1
2,2,Me llamo Gabriel Gomez y no soy un poltico. N...,Republican,False,1
3,3,"Mark Begich says, When you think about the wo...",Republican,False,1
4,4,I'm Joe Heck and I approve this message. John...,Republican,False,1
...,...,...,...,...,...
25877,25877,People in Washington think New Mexico is noth...,Democratic,False,0
25878,25878,Governor George Ryan scandal political scanda...,Democratic,False,0
25879,25879,We are veterans. We served our country. World ...,Democratic,False,0
25880,25880,"Under George Ryan, the deficit grew to a reco...",Democratic,False,0


In [10]:
merged_df_vtm = data.merge(vtm_topics, left_index=True, right_index=True)
merged_df_eatm2 = data.merge(eatm2_topic, left_index=True, right_index=True)


In [11]:
# Sampling
energy_true_sample_2 = merged_df_eatm2[merged_df_eatm2['is_energy'] == True].sample(n=700, random_state=1)
energy_false_sample_2 = merged_df_eatm2[merged_df_eatm2['is_energy'] == False].sample(n=700, random_state=1)
final_eatm2 = pd.concat([energy_true_sample_2, energy_false_sample_2])

In [12]:
# Sampling
energy_true_sample_1 = merged_df_vtm[merged_df_vtm['is_energy'] == True].sample(n=700, random_state=1)
energy_false_sample_1 = merged_df_vtm[merged_df_vtm['is_energy'] == False].sample(n=700, random_state=1)
final_vtm = pd.concat([energy_true_sample_1, energy_false_sample_1])

In [13]:
final_vtm['combine1521'] = final_vtm['21'] + final_vtm['15']

In [14]:
baseline_prob = 0.5


In [15]:
def adjust_probability(row, baseline_prob=0.5, seed=None):
    if row['is_energy']:
        noise = np.random.uniform(0, 0.1)
        adjusted_prob = min(baseline_prob + 0.2 + noise, 1)
        return np.random.binomial(1, adjusted_prob)
    else:
        return np.random.binomial(1, baseline_prob)

# Initialize oucomes with a fixed seed for reproducibility
np.random.seed(100)
final_vtm['outcome'] = np.random.binomial(1, baseline_prob, size=len(final_vtm))
final_eatm2['outcome']=np.random.binomial(1, baseline_prob, size=len(final_eatm2))
final_vtm['outcome'] = final_vtm.apply(adjust_probability, axis=1)
final_eatm2['outcome'] = final_eatm2.apply(adjust_probability, axis=1)



In [16]:
X_2 = final_eatm2[['binary 28', 'source_indicator']]
y_2 = final_eatm2['outcome']
X_2 = sm.add_constant(X_2)
model_eatm2 = sm.OLS(y_2, X_2).fit(cov_type='HC3')
print(model_eatm2.summary())

                            OLS Regression Results                            
Dep. Variable:                outcome   R-squared:                       0.025
Model:                            OLS   Adj. R-squared:                  0.024
Method:                 Least Squares   F-statistic:                     19.72
Date:                Wed, 22 May 2024   Prob (F-statistic):           3.59e-09
Time:                        13:51:41   Log-Likelihood:                -956.42
No. Observations:                1400   AIC:                             1919.
Df Residuals:                    1397   BIC:                             1935.
Df Model:                           2                                         
Covariance Type:                  HC3                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const                0.5976      0.018  

In [17]:
vtm_X_2 = final_vtm[['binary 15', 'source_indicator']]
vtm_y_2 = final_vtm['outcome']
vX_2 = sm.add_constant(vtm_X_2)
model_vtm_2 = sm.OLS(vtm_y_2, vX_2).fit(cov_type='HC3')
print(model_vtm_2.summary())

                            OLS Regression Results                            
Dep. Variable:                outcome   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     2.627
Date:                Wed, 22 May 2024   Prob (F-statistic):             0.0726
Time:                        13:51:41   Log-Likelihood:                -983.24
No. Observations:                1400   AIC:                             1972.
Df Residuals:                    1397   BIC:                             1988.
Df Model:                           2                                         
Covariance Type:                  HC3                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const                0.6183      0.018  

In [18]:
vtm_X = final_vtm[['binary 15 and 21','source_indicator']]
vtm_y = final_vtm['outcome']
vX = sm.add_constant(vtm_X)
model_vtm = sm.OLS(vtm_y, vX).fit(cov_type='HC3')
print(model_vtm.summary())

                            OLS Regression Results                            
Dep. Variable:                outcome   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     2.748
Date:                Wed, 22 May 2024   Prob (F-statistic):             0.0644
Time:                        13:51:41   Log-Likelihood:                -983.09
No. Observations:                1400   AIC:                             1972.
Df Residuals:                    1397   BIC:                             1988.
Df Model:                           2                                         
Covariance Type:                  HC3                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const                0.6204      0.017  