In [110]:
# Utils
import os
import re

# DS toolkit
import pandas as pd
!pip install duckdb
import duckdb

# Visualisation
import folium
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim



In [2]:
# Unzip all data to /content/data folder 
!unzip /content/drive/MyDrive/DSTSES/Oliver_preds_1_week/predictions1week.zip
!unzip /content/predictions293031.zip
!unzip /content/presdictions_262728.zip
!mkdir /content/data
!mv /content/*.csv /content/data
PATH = "/content/data/"

Archive:  /content/drive/MyDrive/DSTSES/Oliver_preds_1_week/predictions1week.zip
  inflating: predictions262728.csv   
 extracting: predictions293031.zip   
 extracting: presdictions_262728.zip  
Archive:  /content/predictions293031.zip
  inflating: predictions293031.csv   
Archive:  /content/presdictions_262728.zip
replace predictions262728.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: predictions262728.csv   


In [62]:
def load_data_from_folder(folder_path):
  """Read every ``*.csv`` file in a folder into one DataFrame.

  Files are read in sorted filename order so the row order of the result is
  reproducible between runs (``os.listdir`` gives no ordering guarantee).

  Args:
    folder_path: Directory containing the CSV files; a trailing path
      separator is optional.

  Returns:
    The row-wise concatenation of all CSV files, each keeping its own
    per-file index. An empty DataFrame if the folder holds no CSV files.
  """
  csv_names = sorted(
      name for name in os.listdir(folder_path) if name.endswith('.csv')
  )

  # Collect the frames first and concatenate once: calling pd.concat inside
  # the loop re-copies all previously loaded rows on every iteration.
  frames = []
  for name in csv_names:
    csv_path = os.path.join(folder_path, name)
    print("Reading", csv_path)
    frames.append(pd.read_csv(csv_path, lineterminator="\n"))

  df = pd.concat(frames) if frames else pd.DataFrame()
  print("Dataframe shape:", df.shape)
  return df

# Load all prediction CSVs, then keep the first row seen for each user so that
# per-user attributes can be looked up without duplicate rows.
df = load_data_from_folder(folder_path=PATH)
df_unique_users = df.drop_duplicates(subset='userid')
df.head(5)

Reading /content/data/predictions262728.csv
Reading /content/data/predictions293031.csv
Dataframe shape: (377534, 18)


# Exploring the predictions - Locations

In [108]:
def user_profile(df, userid):
  """Return every row of `df` whose 'userid' column equals `userid`."""
  belongs_to_user = df['userid'].eq(userid)
  return df.loc[belongs_to_user]

bot_th = 0.8
min_tweets = 3

# Select users with average bot confidence > bot_th, more than min_tweets tweets
# and sort by number of tweets for juicy results at the top.
# The aggregates are aliased explicitly so that the column names q2 relies on
# do not depend on DuckDB's automatic naming of unaliased expressions.
# `df` is resolved by DuckDB from the notebook's Python variables.
q1 = """SELECT userid,
               COUNT(tweetid) AS ntweets,
               AVG(bot) AS "avg(bot)",
               AVG(no_bot) AS "avg(no_bot)"
        FROM df
        GROUP BY userid
        HAVING AVG(bot) > {} AND COUNT(tweetid) > {}
        ORDER BY ntweets DESC, AVG(bot) DESC""".format(bot_th, min_tweets)

df_users_avgconf = duckdb.query(q1).df()

# Find the user profiles by joining the unique_user table with the above findings.
# The join key is spelled out (USING) instead of NATURAL JOIN so that an
# accidental second shared column name can never silently change the join.
# Also filter out users with no location tag: missing locations are excluded
# explicitly via IS NOT NULL; the original 'NaN' text filter is kept as well.
q2 = """SELECT userid, username, location, "avg(bot)", "avg(no_bot)", "ntweets"
        FROM df_unique_users
        JOIN df_users_avgconf USING (userid)
        WHERE location IS NOT NULL AND location NOT LIKE 'NaN'
        ORDER BY "avg(bot)" DESC"""

df_bots_high_conf = duckdb.query(q2).df()
df_bots_high_conf.head()

Unnamed: 0,userid,username,location,avg(bot),avg(no_bot),ntweets
0,1057944726870081537,SumyElektrik,sumy,0.963296,0.036704,7
1,2768798843,AnnaTeterina_,Ukraine,0.96219,0.03781,4
2,1066305159184244736,Mr_GamesHater,Украина,0.956695,0.043305,20
3,1507002607704805386,Aleksan10288763,Kyiv,0.933305,0.066695,9
4,1507828648392331268,MudRogue,United State,0.925665,0.074335,16


In [None]:
# Pull all rows for one account from the high-confidence table
# (this userid is listed there as Aleksan10288763) and preview the first ten.
bot_example = user_profile(df=df, userid=1507002607704805386)
bot_example.head(n=10)

In [126]:
def put_markers(map, df):
    """Geocode each row's free-text location and add a marker to the map.

    Args:
        map: folium map object; markers are added to it in place.
        df: DataFrame with 'location' and 'username' columns.

    Returns:
        None. Rows whose location is missing, empty after cleaning, cannot be
        resolved by the geocoder, or whose lookup fails are skipped.
    """
    # Local import keeps this cell self-contained; GeocoderServiceError is the
    # parent class of GeocoderTimedOut, so timeouts are still handled and other
    # service-side failures no longer abort the whole loop.
    from geopy.exc import GeocoderServiceError

    # NOTE(review): requests are sent back-to-back; consider geopy's RateLimiter
    # if the geocoding service enforces a request-rate limit.
    geo_locator = Nominatim(user_agent="NotTheDefaultUserAgent")

    # Many rows share the same location text; remember each lookup so every
    # distinct query is sent to the geocoder only once.
    geocoded = {}

    for _, row in df.iterrows():
        raw_location = row['location']
        # Missing values arrive as NaN/None (not str) and would break re.sub.
        if not isinstance(raw_location, str):
            continue

        # Collapse punctuation/symbol runs to single spaces; strip so that a
        # location consisting only of symbols is treated as empty.
        query = re.sub(r"\W+", " ", raw_location).strip()
        if not query:
            continue

        if query not in geocoded:
            try:
                geocoded[query] = geo_locator.geocode(query)
            except GeocoderServiceError:
                # Not cached, so a later row with the same text retries.
                continue

        place = geocoded[query]
        if place:
            popup_string = "🌐: {}\n👤: {}".format(raw_location, row['username'])
            folium.Marker([place.latitude, place.longitude], popup = popup_string).add_to(map)

In [125]:
# World-level base map centred on (0, 0); put_markers adds markers in place.
map = folium.Map(zoom_start=2, location=[0, 0])
put_markers(map, df=df_bots_high_conf)
# Persist the interactive map as a standalone HTML page.
map.save("bots.html")