In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [None]:
# Default figure size (width, height) applied to every plot in this notebook.
plt.rcParams.update({'figure.figsize': [10, 5]})

# Relative directory that holds the prediction CSV files.
path = '../predictions1week/'

# Rows whose 'bot' value is strictly greater than this threshold are treated
# as belonging to bot accounts.
bot_prob = 0.9

Add exploration functions by Florian

In [None]:
def explore(df: pd.DataFrame, col: str, continuous: bool = True, index: str = ""):
    """Print summary statistics for ``df[col]`` and draw a log-scaled plot.

    Args:
        df: Frame that contains the column to inspect.
        col: Name of the column to summarise.
        continuous: If True, describe the raw values and draw a 20-bin
            histogram. If False, describe the *value counts* of the column
            and draw a bar plot of the 20 most frequent values.
        index: Optional name of another column of ``df``. When given (and
            ``continuous`` is False) each bar is labelled with the value of
            that column taken from the first row in which ``col`` equals the
            counted value.

    Returns:
        None. Statistics are printed; the figure is drawn on the current
        matplotlib state but not shown explicitly.
    """
    if continuous:
        values = df[col]
        print(values.describe())
        values.hist(log=True, bins=20)
        return

    counts = df[col].value_counts()
    print(counts.describe())

    # Select the 20 most frequent values by position. A plain ``counts[:20]``
    # is ambiguous when the counted values are integers (the index is then
    # integer-labelled); ``.iloc`` is always positional.
    top = counts.iloc[:20].copy()

    if index:
        # Only the plotted bars need a label, so look up 20 values at most
        # instead of scanning ``df`` once per distinct value.
        top.index = [
            df.loc[df[col] == value, index].iloc[0]
            for value in top.index
        ]

    top.plot.bar(log=True)
    
def plot_timestamps(s: pd.Series):
    """Print and plot how many entries of ``s`` fall on each day.

    Each timestamp is reduced to its first whitespace-separated token, which
    is the date part for values formatted like ``"<date> <time>"``
    (NOTE(review): the exact timestamp format is not visible here — confirm).
    The per-day counts are sorted by that token, printed and drawn as a line
    plot.

    Args:
        s: Series of timestamps. Missing values are skipped; non-string
            values are converted with ``str`` before splitting.

    Returns:
        None. The counts are printed and the figure is shown.
    """
    # Vectorised string handling instead of a per-element lambda; dropping
    # missing values first avoids the AttributeError that ``NaN.split()``
    # would raise.
    days = s.dropna().astype(str).str.split().str[0]

    # Sorting by label puts the days in order for sortable date strings.
    per_day = days.value_counts().sort_index()
    print(per_day)

    per_day.plot.line()
    plt.show()
    
def plot_scatter(df: pd.DataFrame, x: str, y: str):
    """Draw column ``y`` against column ``x`` as a scatter plot.

    Both axes are log-scaled and the markers are almost transparent
    (alpha 0.002). The figure is not shown explicitly by this function.

    Args:
        df: Frame holding both columns.
        x: Name of the column plotted on the horizontal axis.
        y: Name of the column plotted on the vertical axis.
    """
    df.plot(kind='scatter', x=x, y=y, logx=True, logy=True, alpha=0.002)

In [None]:
def concat_locations(df_source: pd.DataFrame, df_return: pd.DataFrame, loc_list: "list[str] | None" = None):
    """Append the rows of ``df_source`` that match given locations to ``df_return``.

    A row matches when its ``'location'`` value is exactly equal to one of
    the strings in ``loc_list``. Matches are appended grouped by location, in
    the order the locations are listed. The shape of the result is printed.

    Args:
        df_source: Frame to select rows from; must have a ``'location'`` column.
        df_return: Frame the selected rows are appended to (not modified).
        loc_list: Location strings to match exactly. ``None`` (the default)
            or an empty list selects nothing.

    Returns:
        A new frame made of ``df_return`` followed by the selected rows, or
        ``df_return`` itself when nothing was requested.
    """
    # The original signature used ``loc_list = list[str]``, which made the
    # *type* the default value instead of annotating the parameter.
    wanted = [] if loc_list is None else list(loc_list)

    # Collect all selections first and concatenate once, rather than
    # re-copying the growing frame on every iteration.
    matches = [df_source[df_source['location'] == loc] for loc in wanted]
    if matches:
        df_return = pd.concat([df_return, *matches])

    print(df_return.shape)
    return df_return

## Load dataset

In [None]:
# Read every CSV file located directly in `path` and stack them into one frame.
frames = []

# `os.listdir` returns entries in arbitrary order; sorting keeps the row order
# of the combined frame reproducible between runs.
for filename in sorted(os.listdir(path)):
    if filename.endswith('.csv'):
        print(filename)
        frames.append(
            pd.read_csv(os.path.join(path, filename), lineterminator='\n')
        )

# Concatenate once: growing a frame inside the loop copies all previously
# loaded rows again on every iteration.
df = pd.concat(frames) if frames else pd.DataFrame()

print(df.shape)
df.head()

## Distribution of bot prediction labels for accounts

In [None]:
# Distribution of the 'bot' value with one row per account (first row of each userid).
explore(df.drop_duplicates(subset='userid'), col='bot')

Let's check out the bot accounts 

In [None]:
# All tweets whose 'bot' value is strictly above the configured threshold.
is_bot = df['bot'] > bot_prob
df_bottweets = df.loc[is_bot]
print(df_bottweets.shape)

# One row per bot account: the first of its tweets in `df_bottweets`.
df_botaccounts = df_bottweets.drop_duplicates(subset='userid')
print(df_botaccounts.shape)
df_botaccounts.head()

### Basic plots for bot accounts

Followers/following: 

In [None]:
# Bar plot of the 20 most common follower counts among bot accounts.
explore(df_botaccounts, col='followers', continuous=False)

In [None]:
# Bar plot of the 20 most common following counts among bot accounts.
explore(df_botaccounts, col='following', continuous=False)

Tweets per bot account

In [None]:
# Histogram of the number of tweets per bot account (log-scaled frequency axis).
tweets_per_bot = df_bottweets.groupby(['userid'])['tweetid'].count()
ax = tweets_per_bot.plot(kind='hist', logy=True)

# `Series.plot` silently ignores `x=`, so the intended axis label has to be
# set on the returned Axes.
ax.set_xlabel('number of tweets')
plt.show()

For comparison: tweets per account across all accounts

In [None]:
# Histogram of the number of tweets per account over the whole dataset
# (log-scaled frequency axis).
tweets_per_account = df.groupby(['userid'])['tweetid'].count()
ax = tweets_per_account.plot(kind='hist', logy=True)

# `Series.plot` silently ignores `x=`, so the intended axis label has to be
# set on the returned Axes.
ax.set_xlabel('number of tweets')
plt.show()

Drop tweets with same text

In [None]:
# Number of non-missing tweet texts per bot account, most active first.
# NOTE(review): the heading above says "Drop tweets with same text", but this
# cell only counts tweets and drops nothing — confirm the intended step.
(
    df_bottweets
    .groupby(['userid'])['text']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Most active bot (it just spams the same tweet with minimal changes).
# NOTE(review): the user id is hard-coded; presumably it was read off the
# per-user tweet counts — re-check it whenever the input data changes.
most_active_bot_id = 1387324144929419265

is_most_active = df_bottweets['userid'] == most_active_bot_id
df_most_active_bot = df_bottweets.loc[is_most_active]
print(df_most_active_bot.shape)
df_most_active_bot.head()

Plot creation dates of bot accounts

In [None]:
# Account creation timestamps of the bot accounts, counted per day.
plot_timestamps(s=df_botaccounts['usercreatedts'])

In [None]:
# Same plot over every row of the full dataset (one entry per tweet, not per account).
plot_timestamps(s=df['usercreatedts'])

## Locations of bot accounts

In [None]:
# Number of bot accounts per location string, most common first.
(
    df_botaccounts
    .groupby(['location'])['userid']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Disabled bar chart of the location counts, kept for reference:
# df_botaccounts['location'].value_counts().plot(x = 'location', kind = 'bar')
# plt.rcParams['figure.figsize'] = [100, 10]
# plt.show()

# Every distinct location string used by a bot account, in order of first appearance.
distinct_locations = df_botaccounts['location'].drop_duplicates()
print(distinct_locations.tolist())

In [None]:
# Extract bot accounts from Ukraine, Russia and USA - not complete!
# Locations are free-text and matched exactly, so every spelling variant
# (including trailing spaces) has to be listed explicitly.
ukraine_locations = [
    'Kyiv', 'Ukraine', 'Kiev', 'Україна', 'Украина', 'Kharkiv. Ukraine',
    'kyiv, ukraine ', 'Kyiv, Ukraine', 'Україна, Київ', 'Ukrania', 'Україна, Ірпінь',
]

# Actually seem to be Ukrainian accounts judging from the content
russia_locations = ['Russia, the DADDYland', 'Russia']

usa_locations = [
    'United states ', 'United States 🇺🇸 ', 'United States',
    'United State', 'PALM SPRING CALIFORNIA^ARCORE^', 'Los Angeles, CA',
    'North Carolina, USA', 'Oregon, USA',
    'Texas, United States.', 'Texas, USA', 'Texas, Austin',
    'Murica', 'Texas',
]

# Each subset starts from an empty frame and collects the matching bot tweets.
df_ukrainebots = concat_locations(df_bottweets, pd.DataFrame(), ukraine_locations)
df_russiabots = concat_locations(df_bottweets, pd.DataFrame(), russia_locations)
df_usabots = concat_locations(df_bottweets, pd.DataFrame(), usa_locations)

df_ukrainebots.shape

## Download subsets for exploration of content

In [None]:
# Sort each subset in descending order so the exported files are easier to
# read: accounts by location, everything else by user id.
df_bottweets = df_bottweets.sort_values(by='userid', ascending=False)
df_botaccounts = df_botaccounts.sort_values(by='location', ascending=False)
df_ukrainebots = df_ukrainebots.sort_values(by='userid', ascending=False)
df_russiabots = df_russiabots.sort_values(by='userid', ascending=False)
df_usabots = df_usabots.sort_values(by='userid', ascending=False)

In [None]:
# Write every subset to <path>/subsets/ as CSV for manual inspection.
# Building the path with os.path.join avoids the doubled slash that plain
# concatenation produced (`path` already ends in '/').
subset_dir = os.path.join(path, 'subsets')

# `to_csv` does not create missing directories, so make sure the target exists.
os.makedirs(subset_dir, exist_ok=True)

subsets = {
    'botaccounts': df_botaccounts,
    'bottweets': df_bottweets,
    'botsukraine': df_ukrainebots,
    'most_active_bot': df_most_active_bot,
    'russiabots': df_russiabots,
    'usabots': df_usabots,
}

for suffix, frame in subsets.items():
    frame.to_csv(os.path.join(subset_dir, f'predictions262728_{suffix}.csv'))