In [241]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import numpy as np
import sys

from scipy import stats
from pingouin import kruskal

import seaborn as sns
import pylab as plt
pd.set_option('display.max_columns', None)

In [142]:
# One shared VADER analyzer for the whole notebook; get_vader_senti_scores()
# reads this module-level name rather than taking the analyzer as an argument.
sid_obj = SentimentIntensityAnalyzer()

In [299]:
def get_normalized_dist(dist):
    """Min-max rescale ``dist`` so its smallest value maps to 0 and its largest to 1.

    ``dist`` must provide ``.min()``, ``.max()`` and element-wise arithmetic
    (the notebook passes pandas Series). A constant input has zero range, so
    the division is by zero and the result is not finite.
    """
    lowest = dist.min()
    value_range = dist.max() - lowest
    return (dist - lowest) / value_range

def get_normalized_senti(dist, avg_senti):
    """Re-centre sentiment scores by subtracting the baseline ``avg_senti``."""
    centred = dist - avg_senti
    return centred

def get_vader_senti_scores(df):
    """Score every entry of ``df.text`` with VADER and store the results on ``df``.

    Uses the module-level ``sid_obj`` analyzer. Four columns are written onto
    ``df`` itself -- ``vad_neg``, ``vad_neu``, ``vad_pos`` and ``vad_comp`` --
    taken from the ``'neg'``, ``'neu'``, ``'pos'`` and ``'compound'`` keys of
    each ``polarity_scores`` result.

    Returns the same (mutated) ``df`` object so the call can be nested.
    """
    scores = [sid_obj.polarity_scores(headline) for headline in df.text]

    # Plain lists are assigned positionally, so the row order of ``df`` is kept
    # whatever its index looks like.
    for column, key in (('vad_neg', 'neg'), ('vad_neu', 'neu'),
                        ('vad_pos', 'pos'), ('vad_comp', 'compound')):
        df[column] = [score[key] for score in scores]

    return df

def get_vad_sentiments(df):
    """Average the normalised VADER scores of each publication.

    For every ``publication`` group, each of ``vad_neg``, ``vad_neu``,
    ``vad_pos`` and ``vad_comp`` is min-max scaled within the group through
    ``get_normalized_dist``; entries that scale to exactly 0 are dropped and
    the remaining values are averaged.

    Returns a DataFrame with one row per publication (in ``groupby`` order) and
    the columns ``pub``, ``neg``, ``neu``, ``pos``, ``comp``.
    """
    score_columns = ('vad_neg', 'vad_neu', 'vad_pos', 'vad_comp')

    def _nonzero_mean(scores):
        # Scale within the group, then ignore values that landed exactly on 0.
        scaled = get_normalized_dist(scores)
        return scaled[scaled != 0].mean()

    rows = [
        (pub, *(_nonzero_mean(grp_df[column]) for column in score_columns))
        for pub, grp_df in df.groupby('publication')
    ]

    return pd.DataFrame(rows, columns=['pub', 'neg', 'neu', 'pos', 'comp'])

def get_df_for_plot(df, pub):
    """Reshape one outlet's wide Trump/Biden scores into long form for plotting.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Pub``, ``Trump``, ``Biden`` and ``Sentiment``.
    pub : object
        Value compared against ``df['Pub']``; only matching rows are kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``Pub``, ``score``, ``sentiment``, ``entity``: all Trump rows
        followed by all Biden rows, each keeping its original index label.
        ``df`` itself is not modified.
    """
    pub_rows = df[df['Pub'] == pub]

    def _long_form(entity):
        # ``assign`` returns a new frame instead of writing a column into a
        # slice of ``df``; the old in-place write raised SettingWithCopyWarning.
        return (
            pub_rows[['Pub', entity, 'Sentiment']]
            .assign(entity=entity)
            .rename(columns={entity: 'score', 'Sentiment': 'sentiment'})
        )

    return pd.concat((_long_form('Trump'), _long_form('Biden')), axis=0)

def get_mean_scores(df, pub, entity, col):
    """Mean negative, positive and neutral scores of ``entity`` for one outlet.

    Only rows where ``<entity>_flag`` equals 1 and ``df[col]`` equals ``pub``
    are considered.

    Returns a 3-tuple ordered ``(neg, pos, neu)`` -- positive comes before
    neutral.
    """
    flagged = df[df[entity + '_flag'] == 1]
    pub_rows = flagged[flagged[col] == pub]
    return tuple(pub_rows[entity + suffix].mean() for suffix in ('_neg', '_pos', '_neu'))

def get_TB_senti_dist(df, pub, senti, col):
    """Return the Trump and Biden score Series of one sentiment for one outlet.

    For each entity, rows are kept where ``df[col] == pub`` and the entity's
    ``<entity>_flag`` is true; the ``<entity>_<senti>`` column of those rows is
    returned.

    Returns ``(trump_scores, biden_scores)`` in that order.
    """
    in_pub = df[col] == pub

    def _scores(entity):
        mentioned = df[entity + '_flag'] == True
        return df[(in_pub & mentioned)][entity + '_' + senti]

    return _scores('trump'), _scores('biden')

def welch_ttest(x, y):
    """Welch's unequal-variance t-test for two independent samples.

    Parameters
    ----------
    x, y : array-like
        The two samples (pandas Series, NumPy arrays or plain sequences).

    Returns
    -------
    tuple
        ``(t, p, dof)`` rounded to 4 decimals: the statistic and p-value from
        ``scipy.stats.ttest_ind(..., equal_var=False)`` and the
        Welch-Satterthwaite degrees of freedom computed here.
    """
    # Squared standard error of each sample mean. ``ddof=1`` is explicit on
    # purpose: ``ndarray.var()`` defaults to the population variance (ddof=0)
    # while ``Series.var()`` defaults to ddof=1, so the bare ``.var()`` gave a
    # different dof depending on the container type.
    se2_x = np.var(x, ddof=1) / np.size(x)
    se2_y = np.var(y, ddof=1) / np.size(y)

    ## Welch-Satterthwaite Degrees of Freedom ##
    dof = (se2_x + se2_y) ** 2 / (
        se2_x ** 2 / (np.size(x) - 1) + se2_y ** 2 / (np.size(y) - 1)
    )

    t, p = stats.ttest_ind(x, y, equal_var=False)
    return round(t, 4), round(p, 4), round(dof, 4)

def get_KW_stat(emot1, emot2):
    """Kruskal-Wallis H-test between two samples, with an epsilon-squared effect size.

    Parameters
    ----------
    emot1, emot2 : objects exposing ``.values`` (the notebook passes pandas Series)
        The two samples. They are stacked into one long frame with a ``senti``
        value column and a ``type_flag`` group column (1 for ``emot1``,
        2 for ``emot2``).

    Returns
    -------
    tuple
        ``(H, p, dof, esq)``: the ``H``, ``p-unc`` and ``ddof1`` entries of the
        ``kruskal`` result, and ``esq = H * (n + 1) / (n**2 - 1)`` where ``n``
        is the combined sample size.
    """
    df1 = pd.DataFrame(emot1.values, columns=['senti'])
    df1['type_flag'] = 1

    df2 = pd.DataFrame(emot2.values, columns=['senti'])
    df2['type_flag'] = 2

    temp_df = pd.concat((df1, df2), axis=0)
    kwTest = kruskal(temp_df, dv='senti', between='type_flag')

    # The result is a single row, read here by position. ``column[0]`` is a
    # label lookup; pingouin's documented output labels the row 'Kruskal', so
    # ``[0]`` only worked through pandas' deprecated positional fallback.
    H = kwTest['H'].iloc[0]
    p = kwTest['p-unc'].iloc[0]
    dof = kwTest['ddof1'].iloc[0]

    n = temp_df.shape[0]
    # Epsilon-squared; algebraically the same as H / (n - 1).
    esq = H * (n + 1)/(n**2 - 1)
    return H, p, dof, esq

def get_cohen_d_for_welch_test(x,y):
    """Absolute standardised mean difference between ``x`` and ``y``.

    The denominator is the square root of the average of the two variances,
    each taken with ``np.var`` at its default ``ddof``. The result is always
    non-negative because the mean difference is wrapped in ``np.abs``.
    """
    mean_gap = np.abs(np.mean(x) - np.mean(y))
    avg_variance = (np.var(x) + np.var(y)) / 2
    return mean_gap / np.sqrt(avg_variance)

def mean_normalization(dist, avg_senti):
    """Shift ``dist`` by ``avg_senti`` and divide by the shifted values' ``.std()``.

    Despite the name, the offset is the supplied ``avg_senti``, not the mean of
    ``dist``. The subtraction creates a new object, so the caller's ``dist`` is
    not modified; the division is then applied in place to that new object.
    """
    shifted = dist - avg_senti
    shifted /= shifted.std()
    return shifted

def get_test_scores(df):
    """Compare Biden vs. Trump sentiment scores per sentiment class and outlet.

    Relies on notebook globals: ``publishers`` (outlets to iterate),
    ``vad_avg_df`` (per-outlet baseline with columns ``pub``/``neg``/``neu``/
    ``pos``), ``mannwhitneyu`` and ``pg``, plus the helper functions defined
    alongside this one.

    For each sentiment in ``neg``, ``neu``, ``pos`` and each outlet, the Biden
    and Trump score columns are restricted to flagged rows, stripped of exact
    zeros, min-max scaled, then shifted by the outlet baseline and divided by
    their standard deviation (``mean_normalization``).

    Returns a list with one row (a list) per sentiment/outlet pair, laid out as:
    comparison label, outlet, sentiment, n Biden, n Trump,
    Shapiro statistic and p (Biden), Shapiro statistic and p (Trump),
    Kruskal-Wallis p, Welch p, Mann-Whitney p,
    Kruskal-Wallis H, Kruskal-Wallis dof, Welch t, Welch dof,
    Mann-Whitney statistic, epsilon-squared, Cohen's d (``pg``),
    Cohen's d (local helper), eta-squared, Hedges' g.
    """
    comparison = 'biden-trump'
    rows = []
    for senti in ['neg', 'neu', 'pos']:
        for pub in publishers:
            in_pub = df['publication'] == pub
            biden = df[(in_pub & (df['biden_flag'] == True))]['biden_' + senti]
            trump = df[(in_pub & (df['trump_flag'] == True))]['trump_' + senti]

            # Exact zeros are removed before the min-max scaling.
            biden = get_normalized_dist(biden[biden != 0])
            trump = get_normalized_dist(trump[trump != 0])

            # Outlet-level baseline for this sentiment (first matching row).
            baseline = vad_avg_df[vad_avg_df['pub'] == pub][senti].values[0]
            biden = mean_normalization(biden, baseline)
            trump = mean_normalization(trump, baseline)

            shapiro_biden = stats.shapiro(biden)
            shapiro_trump = stats.shapiro(trump)
            kw_h, kw_p, kw_dof, eps_sq = get_KW_stat(biden, trump)
            mw_stat, mw_p = mannwhitneyu(biden, trump)
            welch_t, welch_p, welch_dof = welch_ttest(biden, trump)

            cohen_d = pg.compute_effsize(biden, trump, paired=False, eftype='cohen')
            eta_sq = pg.compute_effsize(biden, trump, paired=False, eftype='eta-square')
            hedges_g = pg.compute_effsize(biden, trump, paired=False, eftype='hedges')
            cohen_d_local = get_cohen_d_for_welch_test(biden, trump)

            rows.append([
                comparison, pub, senti, biden.shape[0], trump.shape[0],
                shapiro_biden.statistic, shapiro_biden.pvalue,
                shapiro_trump.statistic, shapiro_trump.pvalue,
                kw_p, welch_p, mw_p, kw_h, kw_dof, welch_t, welch_dof,
                mw_stat, eps_sq, cohen_d, cohen_d_local, eta_sq, hedges_g,
            ])

    return rows

In [300]:
from scipy.stats import pearsonr
import pingouin as pg
from scipy.stats import mannwhitneyu, wilcoxon

In [301]:
# Run the Biden-vs-Trump test battery on whichever ``df`` is currently loaded.
# NOTE(review): in file order, ``publishers``, ``df`` and ``vad_avg_df`` are
# only created in later cells, so this cell fails on a fresh top-to-bottom run.
stest = get_test_scores(df)

In [310]:
# Raw (un-normalised) mean entity score per sentiment class and outlet.
# Printed as: sentiment, Trump mean, Biden mean, outlet. Here dist1 is Trump and
# dist2 is Biden -- the opposite of the assignment inside get_test_scores().
for senti in ['neg', 'neu', 'pos']:
    for pub in publishers:
        dist1 = df[((df['publication'] == pub) & (df['trump_flag'] == True))]['trump_' + senti]
        dist2 = df[((df['publication'] == pub) & (df['biden_flag'] == True))]['biden_' + senti]
        print(senti, round(dist1.mean(), 2), round(dist2.mean(), 2), pub)

neg 0.46 0.12 CNN
neg 0.48 0.11 The Washington Post
neg 0.43 0.14 Business Insider
neg 0.31 0.13 USA Today
neg 0.22 0.15 Fox News
neg 0.3 0.29 Breitbart News
neu 0.25 0.39 CNN
neu 0.23 0.48 The Washington Post
neu 0.26 0.42 Business Insider
neu 0.37 0.44 USA Today
neu 0.47 0.52 Fox News
neu 0.33 0.33 Breitbart News
pos 0.09 0.16 CNN
pos 0.08 0.09 The Washington Post
pos 0.08 0.1 Business Insider
pos 0.09 0.12 USA Today
pos 0.09 0.01 Fox News
pos 0.15 0.06 Breitbart News


In [289]:
# Outlets kept in the analysis; other cells and get_test_scores() read this global.
publishers = ['CNN', 'The Washington Post', 'Business Insider', 'USA Today', 'Fox News', 'Breitbart News']
# df = pd.read_csv('results_old/target_senti_entity_based.csv')
df = pd.read_parquet('sampled_dataset/news_headlines.parquet')
# Keep rows that mention Trump or Biden AND come from a tracked outlet. Both
# conditions go into ONE mask: the previous chained form ``df[a][b]`` applied a
# mask built on the unfiltered frame to the filtered one, which makes pandas
# warn "Boolean Series key will be reindexed to match DataFrame index".
# ``.copy()`` makes the result an independent frame, so the column assignments
# that follow do not write into a slice.
df = df[
    ((df['trump_flag'] == True) | (df['biden_flag'] == True))
    & df['publication'].isin(publishers)
].copy()
# The VADER scoring helper reads ``df.text``; headlines keep their text in ``title``.
df['text'] = df['title']
df.shape

(28026, 21)

In [148]:
# df['date'] = pd.to_datetime(df['date'])
# df['day'] = df.date.dt.day
# df['month'] = df.date.dt.month
# df['year'] = df.date.dt.year
# df['timestamp'] = df.date.dt.time

# user_resp_2020 = df[df.year == 2020]
# user_resp_fil_months = user_resp_2020[user_resp_2020.month.isin([5,6,7,8,9,10,11])]
# final_user_resp_df = user_resp_fil_months[((user_resp_fil_months.month.isin([6,7,8,9,10])) | 
#                       ((user_resp_fil_months.month == 5) & (user_resp_fil_months.day >= 3)) | 
#                       ((user_resp_fil_months.month == 11) & (user_resp_fil_months.day < 3)))]

# final_user_resp_df.to_parquet('dataset/news_headlines.parquet')

In [293]:
# df = pd.read_csv('results_old/news_tweets_senti_scores.csv')
df = pd.read_parquet('sampled_dataset/news_tweets.parquet')
# Harmonise the outlet label with the spelling used in ``publishers``.
df['publication'] = df['publication'].str.replace('USA TODAY', 'USA Today')
# Keep rows that mention Trump or Biden AND come from a tracked outlet. Both
# conditions go into ONE mask: the previous chained form ``df[a][b]`` applied a
# mask built on the unfiltered frame to the filtered one, which makes pandas
# warn "Boolean Series key will be reindexed to match DataFrame index".
# ``.copy()`` makes the result an independent frame, so later column
# assignments do not write into a slice.
df = df[
    ((df['trump_flag'] == True) | (df['biden_flag'] == True))
    & df['publication'].isin(publishers)
].copy()
df.shape

(15738, 39)

In [302]:
# df = get_vader_senti_scores(df)
# Score the text with VADER (this writes the vad_* columns onto ``df`` itself),
# then reduce them to one baseline row per publication. get_test_scores() reads
# the resulting ``vad_avg_df`` global.
vad_avg_df = get_vad_sentiments(get_vader_senti_scores(df))

In [303]:
# Display the per-publication baseline means (columns: pub, neg, neu, pos, comp).
vad_avg_df

Unnamed: 0,pub,neg,neu,pos,comp
0,Breitbart News,0.203467,0.805216,0.242541,0.502743
1,Business Insider,0.323748,0.723749,0.308527,0.464062
2,CNN,0.284286,0.733114,0.245035,0.490235
3,Fox News,0.452489,0.710364,0.397922,0.492225
4,The Washington Post,0.320769,0.738222,0.29375,0.467958
5,USA Today,0.263956,0.752015,0.279194,0.499584


In [304]:
# Tabulate the test battery. The column order must match the row layout built
# in get_test_scores().
# NOTE(review): 'shapiro-t-dist12' looks like a typo for 'shapiro-t-dist2', and
# 'Sample size 1' / 'sample size 2' differ in capitalisation. Left unchanged
# here because other cells select columns of this frame by name.
stest = get_test_scores(df)
stats_df_ = pd.DataFrame(stest, columns = ['comparison', 'pub', 'senti', 'Sample size 1', 'sample size 2', 'shapiro-t-dist1', 'shapiro-p-dist1', 'shapiro-t-dist12', 'shapiro-p-dist2', 
                                           'KW-p', 'Wel-p', 'ManW-p', 'KW-H', 'KW-dof', 'Wel-stat', 'Wel-dof', 'ManW-stat', 'ES-epsilon-sqr', 'ES-cohens-d', 'ES-cohens-d2', 'ES-eta-sqr', 'ES-hedges'])
# Rounded for display; values such as p = 0.0 below mean "< 0.005", not exactly zero.
stats_df_ = stats_df_.round(2)
stats_df_

Unnamed: 0,comparison,pub,senti,Sample size 1,sample size 2,shapiro-t-dist1,shapiro-p-dist1,shapiro-t-dist12,shapiro-p-dist2,KW-p,Wel-p,ManW-p,KW-H,KW-dof,Wel-stat,Wel-dof,ManW-stat,ES-epsilon-sqr,ES-cohens-d,ES-cohens-d2,ES-eta-sqr,ES-hedges
0,biden-trump,CNN,neg,177,2127,0.96,0.0,0.8,0.0,0.0,0.0,0.0,165.04,1,-13.5,206.39,78993.0,0.07,-1.06,1.06,0.22,-1.06
1,biden-trump,The Washington Post,neg,147,2190,0.93,0.0,0.82,0.0,0.0,0.0,0.0,142.76,1,-13.04,166.21,66339.0,0.06,-1.11,1.11,0.24,-1.11
2,biden-trump,Business Insider,neg,88,1028,0.96,0.01,0.85,0.0,0.0,0.0,0.0,94.31,1,-10.96,102.47,17051.0,0.08,-1.22,1.22,0.27,-1.22
3,biden-trump,USA Today,neg,98,616,0.94,0.0,0.86,0.0,0.0,0.0,0.0,74.94,1,-9.12,129.8,13766.0,0.11,-0.99,1.0,0.2,-0.99
4,biden-trump,Fox News,neg,34,47,0.96,0.27,0.91,0.0,0.17,0.15,0.18,1.85,1,-1.47,71.26,657.0,0.02,-0.33,0.34,0.03,-0.33
5,biden-trump,Breitbart News,neg,461,582,0.95,0.0,0.83,0.0,0.0,0.0,0.0,114.96,1,-10.56,987.03,82349.0,0.11,-0.66,0.66,0.1,-0.66
6,biden-trump,CNN,neu,584,1327,0.98,0.0,0.95,0.0,0.0,0.0,0.0,34.22,1,-5.84,1114.18,322481.0,0.02,-0.29,0.29,0.02,-0.29
7,biden-trump,The Washington Post,neu,615,1226,0.98,0.0,0.95,0.0,0.0,0.0,0.0,18.11,1,-4.39,1229.44,331207.0,0.01,-0.22,0.22,0.01,-0.22
8,biden-trump,Business Insider,neu,263,747,0.97,0.0,0.95,0.0,0.0,0.0,0.0,27.25,1,-5.3,458.98,76994.0,0.03,-0.38,0.38,0.03,-0.38
9,biden-trump,USA Today,neu,341,790,0.97,0.0,0.95,0.0,0.0,0.0,0.0,63.12,1,-8.17,645.07,94645.0,0.06,-0.53,0.53,0.07,-0.53


In [264]:
# NOTE(review): same code as the In [304] cell; this is an earlier run
# (execution count 264) whose output differs only because the notebook state
# differed at the time. Re-running it overwrites ``stest`` / ``stats_df_`` --
# consider keeping a single copy of this cell.
stest = get_test_scores(df)
stats_df_ = pd.DataFrame(stest, columns = ['comparison', 'pub', 'senti', 'Sample size 1', 'sample size 2', 'shapiro-t-dist1', 'shapiro-p-dist1', 'shapiro-t-dist12', 'shapiro-p-dist2', 
                                           'KW-p', 'Wel-p', 'ManW-p', 'KW-H', 'KW-dof', 'Wel-stat', 'Wel-dof', 'ManW-stat', 'ES-epsilon-sqr', 'ES-cohens-d', 'ES-cohens-d2', 'ES-eta-sqr', 'ES-hedges'])
stats_df_ = stats_df_.round(2)
stats_df_

Unnamed: 0,comparison,pub,senti,Sample size 1,sample size 2,shapiro-t-dist1,shapiro-p-dist1,shapiro-t-dist12,shapiro-p-dist2,KW-p,Wel-p,ManW-p,KW-H,KW-dof,Wel-stat,Wel-dof,ManW-stat,ES-epsilon-sqr,ES-cohens-d,ES-cohens-d2,ES-eta-sqr,ES-hedges
0,biden-trump,CNN,neg,177,2127,0.96,0.0,0.8,0.0,0.0,0.0,0.0,146.72,1,-12.53,203.97,85233.0,0.06,-1.01,1.0,0.2,-1.01
1,biden-trump,The Washington Post,neg,147,2190,0.93,0.0,0.82,0.0,0.0,0.0,0.0,110.88,1,-11.31,162.98,77571.0,0.05,-1.04,1.0,0.21,-1.04
2,biden-trump,Business Insider,neg,88,1028,0.96,0.01,0.85,0.0,0.0,0.0,0.0,70.19,1,-8.76,98.36,20920.0,0.06,-1.1,1.04,0.23,-1.1
3,biden-trump,USA Today,neg,98,616,0.94,0.0,0.86,0.0,0.0,0.0,0.0,48.74,1,-7.13,122.37,16944.0,0.07,-0.85,0.81,0.15,-0.85
4,biden-trump,Fox News,neg,34,47,0.96,0.27,0.91,0.0,0.12,0.11,0.12,2.37,1,-1.63,74.47,638.0,0.03,-0.36,0.37,0.03,-0.36
5,biden-trump,Breitbart News,neg,461,582,0.95,0.0,0.83,0.0,0.0,0.0,0.0,112.89,1,-10.48,985.92,82817.0,0.11,-0.65,0.65,0.1,-0.65
6,biden-trump,CNN,neu,584,1327,0.98,0.0,0.95,0.0,0.0,0.0,0.0,29.56,1,-5.38,1156.95,327065.0,0.02,-0.26,0.27,0.02,-0.26
7,biden-trump,The Washington Post,neu,615,1226,0.98,0.0,0.95,0.0,0.0,0.0,0.0,8.67,1,-2.86,1357.0,345323.0,0.0,-0.14,0.14,0.0,-0.14
8,biden-trump,Business Insider,neu,263,747,0.97,0.0,0.95,0.0,0.0,0.0,0.0,21.42,1,-4.76,485.39,79402.0,0.02,-0.33,0.34,0.03,-0.33
9,biden-trump,USA Today,neu,341,790,0.97,0.0,0.95,0.0,0.0,0.0,0.0,64.15,1,-8.25,640.27,94318.0,0.06,-0.54,0.54,0.07,-0.54


In [250]:
# NOTE(review): same code as the In [304] cell; this is an earlier run
# (execution count 250). Its output shows equal sample sizes for both groups,
# presumably from a run with the commented-out ``.sample(m)`` down-sampling in
# get_test_scores() enabled -- confirm, and consider keeping a single copy.
stest = get_test_scores(df)
stats_df_ = pd.DataFrame(stest, columns = ['comparison', 'pub', 'senti', 'Sample size 1', 'sample size 2', 'shapiro-t-dist1', 'shapiro-p-dist1', 'shapiro-t-dist12', 'shapiro-p-dist2', 
                                           'KW-p', 'Wel-p', 'ManW-p', 'KW-H', 'KW-dof', 'Wel-stat', 'Wel-dof', 'ManW-stat', 'ES-epsilon-sqr', 'ES-cohens-d', 'ES-cohens-d2', 'ES-eta-sqr', 'ES-hedges'])
stats_df_ = stats_df_.round(2)
stats_df_

Unnamed: 0,comparison,pub,senti,Sample size 1,sample size 2,shapiro-t-dist1,shapiro-p-dist1,shapiro-t-dist12,shapiro-p-dist2,KW-p,Wel-p,ManW-p,KW-H,KW-dof,Wel-stat,Wel-dof,ManW-stat,ES-epsilon-sqr,ES-cohens-d,ES-cohens-d2,ES-eta-sqr,ES-hedges
0,biden-trump,CNN,neg,177,177,0.96,0.0,0.77,0.0,0.0,0.0,0.0,93.87,1,-10.44,348.43,6337.0,0.27,-1.11,1.11,0.24,-1.11
1,biden-trump,The Washington Post,neg,147,147,0.93,0.0,0.79,0.0,0.0,0.0,0.0,61.6,1,-8.94,288.85,5084.0,0.21,-1.04,1.05,0.21,-1.04
2,biden-trump,Business Insider,neg,88,88,0.96,0.01,0.81,0.0,0.0,0.0,0.0,45.79,1,-7.58,170.15,1585.0,0.26,-1.14,1.15,0.25,-1.14
3,biden-trump,USA Today,neg,98,98,0.94,0.0,0.86,0.0,0.0,0.0,0.0,28.02,1,-5.69,190.91,2700.0,0.14,-0.81,0.82,0.14,-0.81
4,biden-trump,Fox News,neg,34,34,0.96,0.27,0.88,0.0,0.06,0.05,0.06,3.52,1,-1.97,65.98,425.0,0.05,-0.48,0.48,0.05,-0.47
5,biden-trump,Breitbart News,neg,461,461,0.95,0.0,0.83,0.0,0.0,0.0,0.0,107.14,1,-10.22,919.94,64411.0,0.12,-0.67,0.67,0.1,-0.67
6,biden-trump,CNN,neu,584,584,0.98,0.0,0.96,0.0,0.0,0.0,0.0,18.04,1,-4.26,1165.66,146046.5,0.02,-0.25,0.25,0.02,-0.25
7,biden-trump,The Washington Post,neu,615,615,0.98,0.0,0.94,0.0,0.0,0.0,0.0,9.91,1,-2.99,1209.83,169503.0,0.01,-0.17,0.17,0.01,-0.17
8,biden-trump,Business Insider,neu,263,263,0.97,0.0,0.95,0.0,0.0,0.0,0.0,18.83,1,-4.46,523.07,27021.5,0.04,-0.39,0.39,0.04,-0.39
9,biden-trump,USA Today,neu,341,341,0.97,0.0,0.94,0.0,0.0,0.0,0.0,39.88,1,-6.48,679.93,41894.0,0.06,-0.5,0.5,0.06,-0.5


## Sentiment Analysis of headlines
### Within Media outlets

In [277]:
# Collapse the three per-sentiment effect sizes of each outlet into one number:
# a weighted mean of 'ES-cohens-d2' from ``stats_df_``, weighted by how many
# rows carry a non-zero Biden or Trump score for that sentiment.
# NOTE(review): ``comp`` and the initial ``senti`` are never read in this cell
# (``senti`` is overwritten by the inner loop).
comp = 'biden-trump'
senti = 'pos'
# pub = 'CNN'
effect_sizes = []
for pub in publishers:
    total_es, total_samples = 0, 0
    for senti in ['neg', 'neu', 'pos']:
        # Rows of this outlet mentioning Biden / Trump; these two selections do
        # not depend on ``senti`` and are recomputed on every inner iteration.
        dist1 = df[((df['publication'] == pub) & (df['biden_flag'] == True))]
        dist2 = df[((df['publication'] == pub) & (df['trump_flag'] == True))]
        
        # print(temp_df.drop_duplicates().shape, temp_df.shape)
        # print(temp_df.columns)
        
        # Rows flagged for both entities appear twice after the concat, hence
        # the de-duplication below.
        temp_df = pd.concat((dist1, dist2), axis = 0)
        # print(dist1.shape, dist2.shape, temp_df.shape)
        # NOTE(review): assumes the loaded ``df`` has an 'id' column -- confirm
        # for every dataset this cell is run against.
        temp_df = temp_df.drop_duplicates('id')
        temp_df = temp_df[((temp_df['biden_' + senti] != 0) | (temp_df['trump_' + senti] != 0))]
        
        num_samples = temp_df.shape[0]
        # es = stats_df[((stats_df['pub'] == pub) & (stats_df['senti'] == senti))]['ES'].values[0]
        # First matching row; raises IndexError if ``stats_df_`` has no row for
        # this outlet/sentiment pair.
        es = stats_df_[((stats_df_['pub'] == pub) & (stats_df_['senti'] == senti))]['ES-cohens-d2'].values[0]
        
        total_es += (es * num_samples)
        total_samples += num_samples
        
        # print(total_es)
        # print(total_samples)
        
        # sys.exit()
        
    # Weighted mean over the three sentiments; divides by zero if the outlet
    # has no qualifying rows at all.
    new_es = float(total_es/total_samples)
    effect_sizes.append(new_es)

In [278]:
# Snapshot of the current ``effect_sizes`` list, paired positionally with
# ``publishers``. NOTE(review): the name suggests the tweets dataset was loaded
# when ``effect_sizes`` was computed -- nothing in this cell enforces that.
news_tweets_es_ = pd.DataFrame(list(zip(publishers, effect_sizes)), columns = ['pub', 'ES'])
news_tweets_es_

Unnamed: 0,pub,ES
0,CNN,0.746729
1,The Washington Post,0.742075
2,Business Insider,0.630024
3,USA Today,0.584716
4,Fox News,0.587282
5,Breitbart News,0.712207


In [271]:
# Snapshot of the current ``effect_sizes`` list, paired positionally with
# ``publishers``. NOTE(review): the name suggests the headlines dataset was
# loaded when ``effect_sizes`` was computed -- nothing in this cell enforces that.
news_headlines_es_ = pd.DataFrame(list(zip(publishers, effect_sizes)), columns = ['pub', 'ES'])
news_headlines_es_

Unnamed: 0,pub,ES
0,CNN,0.589174
1,The Washington Post,0.561563
2,Business Insider,0.64192
3,USA Today,0.590133
4,Fox News,0.35453
5,Breitbart News,0.571236


In [74]:
# Earlier snapshot (execution count 74) of ``effect_sizes`` paired with
# ``publishers``. NOTE(review): the displayed values are an order of magnitude
# smaller than in ``news_tweets_es_``; presumably a different effect-size
# column was aggregated at the time (cf. the commented-out ``stats_df[...]['ES']``
# lookup in the aggregation cell) -- confirm before comparing the two.
news_tweets_es = pd.DataFrame(list(zip(publishers, effect_sizes)), columns = ['pub', 'ES'])
news_tweets_es

Unnamed: 0,pub,ES
0,CNN,0.046653
1,The Washington Post,0.041492
2,Business Insider,0.04337
3,USA Today,0.066365
4,Fox News,0.062278
5,Breitbart News,0.093368


In [58]:
# Earlier snapshot (execution count 58) of ``effect_sizes`` paired with
# ``publishers``. NOTE(review): the displayed values are an order of magnitude
# smaller than in ``news_headlines_es_``; presumably a different effect-size
# column was aggregated at the time -- confirm before comparing the two.
news_headlines_es = pd.DataFrame(list(zip(publishers, effect_sizes)), columns = ['pub', 'ES'])
news_headlines_es

Unnamed: 0,pub,ES
0,CNN,0.057619
1,The Washington Post,0.047762
2,Business Insider,0.050513
3,USA Today,0.046071
4,Fox News,0.083135
5,Breitbart News,0.104468


In [6]:
# user_resp = pd.read_parquet('dataset/user_resp_newslant_with_senti.parquet')
# user_resp.columns, user_resp.shape

  df = df[((df['trump_flag'] == True) | (df['biden_flag'] == True))][df['publication'].isin(publishers)]


(29641, 34)

In [14]:
# Preview the first rows. NOTE(review): the displayed frame carries several
# 'Unnamed: 0*' columns, which typically come from CSV files written with their
# index and read back repeatedly -- consider dropping them at load time.
df.head()

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.2,Unnamed: 0.1.1,Unnamed: 0.1.1.1,tweet_id,conversation_id,author_id,created_at,...,neg_senti,trump_neg,trump_neu,trump_pos,biden_neg,biden_neu,biden_pos,trump_flag,biden_flag,proc_text
0,0,0,0,0,0,0,1356021987630587906,1356021987630587906,759251,2021-01-31 23:30:10+00:00,...,0.005534,0.0,0.0,0.0,0.0,0.0,0.798372,0,1,plans deliver substantive foreign policy remar...
1,1,1,1,1,1,1,1355954084486983683,1355954084486983683,759251,2021-01-31 19:00:20+00:00,...,0.01637,0.0,0.0,0.0,0.0,0.860013,0.0,0,1,The administration American Sign Language inte...
2,2,2,2,2,2,2,1355931104788811779,1355931104788811779,759251,2021-01-31 17:29:02+00:00,...,0.004616,0.0,0.0,0.0,0.0,0.0,0.716897,0,1,A group 10 Republican senators Sunday called t...
3,3,3,3,3,3,3,1355909182881456134,1355909182881456134,759251,2021-01-31 16:01:55+00:00,...,0.625036,0.0,0.0,0.0,0.0,0.489667,0.0,0,1,"John Kerry , ’ climate envoy , climate crisis ..."
4,4,4,4,4,4,4,1355904949872422918,1355904949872422918,759251,2021-01-31 15:45:06+00:00,...,0.415048,0.855193,0.0,0.0,0.0,0.0,0.0,1,0,Former 's impeachment defense attorneys left w...


In [15]:
# List the column names to locate the per-entity score columns
# (trump_*/biden_*) and the trump_flag/biden_flag indicators used above.
df.columns

Index(['Unnamed: 0.3', 'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.2',
       'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1', 'tweet_id', 'conversation_id',
       'author_id', 'created_at', 'geo', 'lang', 'like_count', 'quote_count',
       'reply_count', 'retweet_count', 'source', 'text', 'publication',
       'topics', 'topic_labels', 'theme', 'pos_senti', 'neu_senti',
       'neg_senti', 'trump_neg', 'trump_neu', 'trump_pos', 'biden_neg',
       'biden_neu', 'biden_pos', 'trump_flag', 'biden_flag', 'proc_text'],
      dtype='object')