# Installs and Imports

In [None]:
! pip install requests
! pip install requests-oauthlib
! pip install wordcloud

## Mounting GDrive

In [None]:
# Mount Google Drive into the Colab filesystem; the data paths used further down live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

## Imports

In [None]:
import ast
import json
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

# API token environment variables definition

This notebook uses the Twitter API V2 for developers. To use it you must have a developer account.
To open a developer account, and see more details, visit the following URL:
https://developer.twitter.com/en/docs/twitter-api


In [None]:
# `! export ...` runs in a throw-away subshell (and does not allow spaces around `=`),
# so the variables are set on the kernel process itself instead.
os.environ['API_KEY'] = 'YOUR_API_KEY' # replace with your API key
os.environ['API_KEY_SECRET'] = 'YOUR_API_SECRET' # replace with your API Secret
os.environ['BEARER_TOKEN'] = 'YOUR_BEARER_TOKEN' # replace with your Bearer token

# Querying our non-trolls users' descriptions

## getting the non-trolls users' ids from IRA dataset

In [None]:
path_to_preprocessed_data_dir = '/content/drive/MyDrive/NLP_And_Social_Dynamics/Data/preprocessed_data' #replace with the path to your joined file directory
user_ids_fname = 'joined_users_dataset.csv' # replace with your joined filename (if necessary)
path_to_users_ids_df = os.path.join(path_to_preprocessed_data_dir, user_ids_fname)

# Load the joined users table that the following cells filter.
# `target_id` is read as text because the ids are later joined with ',' into the API query string.
users_ids_df = pd.read_csv(path_to_users_ids_df, dtype={'target_id': str})

In [None]:
# keep only the non-troll users, i.e. the rows whose `target_type` is missing
has_no_target_type = users_ids_df['target_type'].isna()
non_trolls_df = users_ids_df.loc[has_no_target_type]
non_trolls_user_ids = non_trolls_df['target_id']
non_trolls_df.head()

In [None]:
# distinct ids of the non-troll users (np.unique also returns them sorted)
non_troll_id_values = non_trolls_user_ids.values
non_trolls_unique_user_ids = np.unique(non_troll_id_values)

## API queries functions:

### Query single user description

In [None]:

# Read the token from the BEARER_TOKEN environment variable; the placeholder is used only when
# that variable is not set (replace it with your Bearer token in that case).
bearer_token = os.environ.get('BEARER_TOKEN', 'YOUR_BEARER_TOKEN')

def create_url(**kwargs):
    """Build the lookup URL and the field selection for a single id.

    Keyword Args:
        users_ids_to_retrieve: the id appended to the URL path (required).
        list.fields: comma-separated field names; defaults to
            'created_at,follower_count'. Because of the dot it can only be
            passed through dict unpacking, e.g. ``**{'list.fields': '...'}``.

    Returns:
        tuple: ``(url, list_fields)`` where ``list_fields`` is the
        ``list.fields=...`` query-string fragment.

    Raises:
        ValueError: if no id was supplied (previously this silently produced
            a URL ending in ``/lists/[]``).
    """
    # NOTE(review): this targets the Lists endpoint (/2/lists/{id}) although the
    # value passed in by the notebook is a user id - confirm the intended endpoint.

    # List fields are adjustable, options include:
    # created_at, description, owner_id,
    # private, follower_count, member_count,
    list_fields = f"list.fields={kwargs.get('list.fields', 'created_at,follower_count')}"

    lookup_id = kwargs.get('users_ids_to_retrieve', None)
    is_empty_container = isinstance(lookup_id, (str, list, tuple)) and len(lookup_id) == 0
    if lookup_id is None or is_empty_container:
        raise ValueError("create_url requires a non-empty 'users_ids_to_retrieve' value")

    url = "https://api.twitter.com/2/lists/{}".format(lookup_id)
    return url, list_fields


def bearer_oauth(r):
    """
    Auth hook for bearer-token requests: stamps the Authorization and
    User-Agent headers on the outgoing request and hands it back.
    """
    auth_headers = {
        "Authorization": f"Bearer {bearer_token}",
        "User-Agent": "v2ListLookupPython",
    }
    for header_name, header_value in auth_headers.items():
        r.headers[header_name] = header_value
    return r


def connect_to_endpoint(url, list_fields, ids=[]):
    """
    Send an authenticated GET request and return the decoded JSON body.

    `list_fields` is passed as the request's query parameters; `ids` is
    accepted but not used. Any status other than 200 raises an Exception
    carrying the status code, the response text and the response object.
    """
    response = requests.request("GET", url, auth=bearer_oauth, params=list_fields)
    if response.status_code == 200:
        return response.json()

    error_message = "Request returned an error: {} {}\n{}".format(
        response.status_code, response.text, response
    )
    raise Exception(error_message)


def get_users_attributes_by_ids(**kwargs):
    """
    Build the request from the keyword arguments (see `create_url`),
    send it, and return the JSON response.
    """
    request_url, request_fields = create_url(**kwargs)
    return connect_to_endpoint(request_url, request_fields)

#### usage example

In [None]:
# one row per checked user: its id, whether it is considered valid, and the raw API errors
non_trolls_validity_df = pd.DataFrame(columns=['uid', 'isValid', 'errors'])

for idx, non_troll_uid in enumerate(non_trolls_unique_user_ids):
  already_checked = non_troll_uid in non_trolls_validity_df['uid'].values
  if not already_checked:
    u_api_response = get_users_attributes_by_ids(users_ids_to_retrieve=non_troll_uid)
    errors = u_api_response.get('errors', None)
    if errors is None:
      # no error section in the response: the id is considered valid
      errors, is_valid = '', True
    else:
      # only a 'Not Found Error' title on the first error marks the id as invalid
      is_valid = u_api_response['errors'][0].get('title', '') != 'Not Found Error'

    non_trolls_validity_df.loc[idx, :] = [non_troll_uid, is_valid, errors]

### Query description for users in batches:

In [None]:
import requests
import os
import json

# Read the token from the BEARER_TOKEN environment variable; the placeholder is used only when
# that variable is not set (replace it with your Bearer token in that case).
bearer_token = os.environ.get('BEARER_TOKEN', 'YOUR_BEARER_TOKEN')


def create_url(users_ids: np.ndarray = []):
    """Build the batch user-lookup URL for the given identifiers.

    Args:
        users_ids: identifiers to look up (the API accepts up to 100
            comma-separated values). Elements may be strings or numbers;
            each one is converted with ``str`` before joining, so integer
            ids no longer make ``",".join`` raise a TypeError.

    Returns:
        str: the full request URL, asking for the ``description`` and
        ``created_at`` user fields.
    """
    # NOTE(review): the identifiers are sent through the `usernames` parameter;
    # if they are numeric account ids rather than handles, confirm that this is
    # the intended lookup.
    usernames = "usernames=" + ",".join(str(user_id) for user_id in users_ids)
    # User fields are adjustable, options include:
    # created_at, description, entities, id, location, name,
    # pinned_tweet_id, profile_image_url, protected,
    # public_metrics, url, username, verified, and withheld
    user_fields = "user.fields=description,created_at"
    url = "https://api.twitter.com/2/users/by?{}&{}".format(usernames, user_fields)
    return url


def bearer_oauth(r):
    """
    Auth hook for bearer-token requests: stamps the Authorization and
    User-Agent headers on the outgoing request and hands it back.
    """
    auth_headers = {
        "Authorization": f"Bearer {bearer_token}",
        "User-Agent": "v2UserLookupPython",
    }
    for header_name, header_value in auth_headers.items():
        r.headers[header_name] = header_value
    return r


def connect_to_endpoint(url):
    """
    Send an authenticated GET request to `url`.

    Returns a `(status_code, json_body)` tuple; the status code is not
    checked here, that is left to the caller.
    """
    response = requests.request("GET", url, auth=bearer_oauth)
    status_code = response.status_code
    json_body = response.json()
    return status_code, json_body


def batch_users_lookup(**kwargs):
    """
    Look up a batch of users: the keyword arguments are forwarded to
    `create_url`, and the `(status_code, json_body)` pair of the request
    is returned.
    """
    status_code, json_body = connect_to_endpoint(create_url(**kwargs))
    return status_code, json_body



#### Usage example
Due to the limits on the number of queries per time period, the code first reads an existing dataframe from previous runs (if one exists; otherwise it creates an empty one). 
Then, in batches of 100 users (with a time delay between batches), users whose ids are not yet in the existing dataframe have their descriptions queried from the Twitter API, and the results are concatenated to that dataframe. The dataframe is saved to GDrive when the loop stops - either because the query limit was reached or because all batches were processed - so the aggregated information is kept between runs. 
If the Twitter API returns an error indicating that the developer reached the temporary query limit, an appropriate message is printed to the console.

In [None]:
path_to_non_trolls_validity_df = '/content/drive/MyDrive/NLP_And_Social_Dynamics/Data/non_trolls_u_description.csv'
if os.path.isfile(path_to_non_trolls_validity_df):
  # resume from the results saved by a previous run
  non_trolls_validity_df = pd.read_csv(path_to_non_trolls_validity_df)
else:
  # first run: start empty. The column is spelled 'Response' (it used to be 'Repsonse'),
  # matching the name under which the batches are stored and later parsed.
  non_trolls_validity_df = pd.DataFrame(columns=['uid', 'isValid', 'Response'])

# users are identified by `uid`; verify_integrity rejects a file containing duplicated ids
non_trolls_validity_df = non_trolls_validity_df.set_index('uid', verify_integrity=True)

In [None]:
batch_size = 100

# The frame is keyed by `uid`. If it still carries `uid` as a column, index it first.
if 'uid' in non_trolls_validity_df.columns:
  non_trolls_validity_df = non_trolls_validity_df.set_index('uid', verify_integrity=True)
# Ids are compared as strings: ids reloaded from the CSV may be parsed as numbers,
# while the values reported back by the API are strings.
non_trolls_validity_df.index = non_trolls_validity_df.index.astype(str)
known_uids = set(non_trolls_validity_df.index)

for batch_idx in range(0, len(non_trolls_unique_user_ids), batch_size):
  batch_uids = [str(uid) for uid in non_trolls_unique_user_ids[batch_idx: batch_idx+batch_size]]
  # only users that are not stored yet are sent to the API
  non_troll_uids = [uid for uid in batch_uids if uid not in known_uids]
  if not non_troll_uids:
    continue

  res_status_code, u_api_response = batch_users_lookup(users_ids=non_troll_uids)
  if res_status_code == 429:
    # query limit reached: stop here, everything collected so far is saved below
    print(f'ran out of requests on UID idx: {batch_idx}')
    break

  # Gather every id reported as not found. All error entries are inspected, and a
  # value holding several comma-separated ids is split as well.
  uids_not_found = set()
  for er in (u_api_response.get('errors', None) or []):
    er_value = er.get('value', None)
    if er_value is not None and 'could not find user' in er.get('detail', '').lower():
      uids_not_found.update(part.strip() for part in str(er_value).split(','))

  # Built for every batch, so a batch whose errors contain no "not found" entry
  # can no longer re-append the rows of the previous batch.
  are_valid = pd.DataFrame({'uid': non_troll_uids,
                            'isValid': [uid not in uids_not_found for uid in non_troll_uids],
                            'Response': [u_api_response for _ in non_troll_uids]
                            })
  are_valid = are_valid.set_index('uid', verify_integrity=True)

  # both frames are indexed by `uid`, so the concatenation keeps that index
  non_trolls_validity_df = pd.concat([non_trolls_validity_df, are_valid], axis=0)
  known_uids.update(non_troll_uids)
  time.sleep(1)

non_trolls_validity_df = non_trolls_validity_df[~non_trolls_validity_df.index.duplicated(keep='first')]
non_trolls_validity_df.to_csv(path_to_non_trolls_validity_df)

print(f"Reached {len(non_trolls_validity_df)}/{len(non_trolls_unique_user_ids)} users")
if len(non_trolls_validity_df) == len(non_trolls_unique_user_ids):
  print("DONE!!")

## Transforming query results to parsed description

In [None]:
df = non_trolls_validity_df.copy()

descriptions_by_ids = {}
created_at_by_ids = {}

# Every row of a batch stores the same full API response, so each distinct
# response is parsed only once.
parsed_responses = set()

for row_idx, row in df.iterrows():
    raw_response = row['Response']
    # Rows created in this session hold the response as a dict, rows reloaded from
    # the CSV hold its text form; anything else (e.g. NaN) has nothing to parse.
    if not isinstance(raw_response, (dict, str)):
        continue

    response_key = str(raw_response)
    if response_key in parsed_responses:
        continue
    parsed_responses.add(response_key)

    response_dict = raw_response if isinstance(raw_response, dict) else ast.literal_eval(raw_response)
    u_data = response_dict.get('data', None)
    if u_data is None:
        continue

    for data in u_data:
        # NOTE(review): the user key is taken from `username` and must be numeric
        # (int() raises otherwise) - confirm this is the intended identifier.
        d_uid = int(data['username'])
        descriptions_by_ids[d_uid] = data.get('description') or ''
        created_at_by_ids[d_uid] = data['created_at']

# `uid` is stored as text, as it was when the table was assembled through a single string array.
non_trolls_with_description_df = pd.DataFrame(
    {
        'uid': [str(uid) for uid in descriptions_by_ids],
        'description': list(descriptions_by_ids.values()),
        'created_at': [created_at_by_ids[uid] for uid in descriptions_by_ids],
    },
    columns=['uid', 'description', 'created_at'],
)


### Calculating users with descriptions fraction and basic statistics

In [None]:
n_total_users = len(non_trolls_validity_df)

df = non_trolls_with_description_df

# A description is empty when it is missing (NaN/None) or holds no visible text.
# Counting only NaN values missed the empty strings, which is how an absent
# description shows up in this table.
has_no_description = df['description'].isna() | (df['description'].astype(str).str.strip() == '')
u_with_empty_description = int(has_no_description.sum())
u_with_non_empty_description = int((~has_no_description).sum())

fig, axis = plt.subplots(figsize=(30,10))
bars = axis.barh([0, 1], [u_with_empty_description, u_with_non_empty_description])
axis.set_yticks([0, 1])
axis.set_yticklabels(['#Users with empty description', '#Users with description'], rotation=45, fontsize=15)
for bar in bars:
    # fraction out of all queried users; guarded so an empty table does not divide by zero
    label = bar.get_width() / n_total_users if n_total_users else 0.0
    axis.text(bar.get_width()+1,
              bar.get_y()+ bar.get_height()/2.,
              f"{label:.5f}", ha='center',
              fontsize=20,
              rotation=270)

axis.set_title(f'Non Troll Users with description statistics\nFraction from all Non Troll (#{n_total_users}) users (top of bar)', fontsize=30)

plt.savefig('NonTrollsUsersWithDescriptionsCounts.jpg', dpi=200)
plt.show()

## WordCloud generation for aggregated descriptions:

In [None]:
from wordcloud import WordCloud, STOPWORDS
from typing import *
 
def show_wordcloud_for_text(text_raw_tokens: List[str], keep_stop_words:bool = False):
  """Draw a word cloud from a collection of texts.

  Args:
    text_raw_tokens: the texts to aggregate. Each entry is converted with
      ``str``, lower-cased and split on whitespace; 'nan' tokens are dropped.
    keep_stop_words: when True, stop words stay in the cloud; when False the
      wordcloud STOPWORDS set is removed.
  """
  # Use the texts that were passed in (the function used to ignore its argument
  # and read a global dataframe column instead).
  cleaned_texts = []
  for val in text_raw_tokens:
    tokens = [token.lower() for token in str(val).split() if token.lower() != 'nan']
    cleaned_texts.append(" ".join(tokens))
  comment_words = " ".join(cleaned_texts)

  # WordCloud treats stopwords=None as "use the built-in list", so an empty set
  # is what actually keeps the stop words.
  stopwords = set() if keep_stop_words else set(STOPWORDS)

  wordcloud = WordCloud(width = 800, height = 800,
                  background_color ='white',
                  stopwords = stopwords,
                  min_font_size = 10).generate(comment_words)

  # plot the WordCloud image
  plt.figure(figsize = (8, 8), facecolor = None)
  plt.imshow(wordcloud)
  plt.axis("off")
  plt.tight_layout(pad = 0)

# one cloud with the stopwords included (True) and one with them excluded (False)
for keep_stop_words_flag in (True, False):
  show_wordcloud_for_text(df['description'].values.tolist(), keep_stop_words=keep_stop_words_flag)
