<h1>Ditchley S2DS project August 2020 - Code Pipeline</h1>
    <h2>Team: Adam Hawken, Luca Lamoni, Elizabeth Nicholson, Robert Webster</h2>

In [14]:
#![]() #graphical representation of the pipeline here

<h3>Section 0: Working directory and graph DB setup</h3>
    <h4>0.1: Modules and working directory setup</h4>

In [1]:
# Import modules and set up working directory
import sys
import os
import time
import logging
import json
import csv
import threading
import queue
import asyncio 
import nest_asyncio
nest_asyncio.apply()
import twint
import pandas as pd


# Set up working directory
# The working directory should reflect the structure of the Github repository https://github.com/S2DSLondon/Aug20_Ditchley
# The repository root is declared once so the import path and the data directory cannot drift apart.
# Set the DITCHLEY_REPO_ROOT environment variable to run on another machine; the default keeps the original location.
REPO_ROOT = os.environ.get('DITCHLEY_REPO_ROOT', 'C:/Users/Luca/Aug20_Ditchley/')
# Guard the insert so that re-running this cell does not keep adding duplicate entries to sys.path
if REPO_ROOT not in sys.path:
    sys.path.insert(1, REPO_ROOT)
from src.data import pipeline_setup
# Build the data directory tree under the repository root (prints a message and skips if it already exists)
pipeline_setup.build_data_dir(REPO_ROOT)

Data directory & sub-directories already exist, skipping.


<h4>0.2: Initialize graph database</h4>

In [2]:
# import standard libraries
import numpy as np
import pandas as pd
from py2neo import Graph
from py2neo.data import Node, Relationship
from src.data import graphdb as gdb

# Load / declare the database (presumably a py2neo Graph handle returned by the project helper — TODO confirm)
graph = gdb.get_graph(new_graph = True)
# Start with an empty graph.
# WARNING: delete_all() is destructive - everything currently stored in the database is removed.
graph.delete_all()

<h3>Section 1: Getting journalist twitter handles according to a keyword</h3>
    <h4>The journalist scraping is performed at the web address https://www.journalism.co.uk/prof/?chunk=0&cmd=default</h4>

In [3]:
# Choose the search keyword and run the scraping function.
# `keyword` is reused by later cells to build the names of the raw and processed csv files.
from src.data import journalists as journos
keyword = 'cybersecurity'
# Input: string (the keyword) / Output: list (journalist twitter handles)
journo_handles = journos.get_handles_by_keyword(keyword)
# Sanity checks: how many handles were found, and the type of the returned container
print(len(journo_handles))
type(journo_handles)

3


list

<h3>Section 2. Scrape user information and friend lists for each journalist in the list</h3>
    <h4>2.1: Scrape user information using the Twitter API</h4>

In [4]:
# Load the Twitter API credentials and return a tweepy API instance
import json
import tweepy
from src.data import api_tweepy as api

# Input: path of json file with credentials / Output: tweepy.api.API
# NOTE: the path is relative, so it is resolved against the notebook's current working directory
tw_api = api.connect_API('../src/data/twitter_credentials.json')

In [5]:
# Scrape user information using the API
from src.data import api_user_tools as api_tools
from src.data import data_cleanup as dc

# Request the profile information of all journalist handles
# Input: tweepy.api.API, list of handles / Output: list
api_users = api_tools.batch_request_user_info(tw_api,journo_handles)
# Convert the API results into a table
# Input: list / Output: DataFrame
df_api = dc.populate_user_df(api_users)
# Check: preview the first rows
df_api.head()

Unnamed: 0,user_id,screen_name,name,location,user_description,user_friends_n,user_followers_n,prof_created_at,favourites_count,verified,statuses_count
0,335773502,_lucyingham,Lucy Ingham,London,Editor of and digital magazines Verdict Magazi...,513,646,2011-07-15 06:29:08,2150,False,452
1,964233746865119233,jesscahaworth,Jessica Haworth,,Cybersecurity journalist at Music buff and ski...,970,670,2018-02-15 20:23:34,453,False,576
2,1186245031507693574,ad_nauseum74,Adam Bannister,,Journalist The Daily Swig Cybersecurity,366,133,2019-10-21 11:38:12,112,False,277


In [6]:
# Write the user profiles to the processed-data folder; the file name is prefixed with the keyword
df_api.to_csv(f'../data/processed/{keyword}_user_profiles.csv', index=False)

<h4>2.2: Load user info into graph DB</h4>

In [7]:
# Neo4j import files need to be in a specific folder, however, the csv files saved above are in a different folder.
# To work around this problem on Windows machines it is possible to create a shortcut between the two folders.

# Load in user information.
# The file name is built from `keyword` so it always matches the csv saved in the previous step
# (it used to be hard-coded to 'cybersecurity', which silently loaded a stale file for any other keyword).
print('Loading in user information and drawing (Person) nodes')
fn_users = keyword + '_user_profiles.csv'
gdb.load_users(fn_users ,graph)

Loading in user information and drawing (Person) nodes


<h4>2.3: Scrape user friend list using Twint</h4>

In [8]:
# Scrape the friend list of every journalist with Twint, using several threads
from src.data import twint_tools as tt

# define keyword arguments / 'n_retries' = max number of scrape attempts, 'suppress' = hide critical Twint warnings
kwargs = {'n_retries':5,
         'suppress':False}
# Multi-threading function. Input: the _get_friends function, number of threads over which the queue is distributed,
# the list of handles, args (here the output path prefix '../data/raw/<keyword>_') and kwargs
tt.twint_in_queue(tt._get_friends, 3, journo_handles, args=('../data/raw/'+keyword+'_',), kwargs=kwargs)

Attempt #1 to get friends of @_lucyinghamAttempt #1 to get friends of @JesscaHaworth
Attempt #1 to get friends of @Ad_Nauseum74



CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.feed:Follow:IndexError


Attempt #2 to get friends of @_lucyingham


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.feed:Follow:IndexError


Results for @Ad_Nauseum74 saved to: ../data/raw/cybersecurity_friends_Ad_Nauseum74.csv


CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.feed:Follow:IndexError


Results for @_lucyingham saved to: ../data/raw/cybersecurity_friends__lucyingham.csv


CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.feed:Follow:IndexError


Results for @JesscaHaworth saved to: ../data/raw/cybersecurity_friends_JesscaHaworth.csv


In [9]:
# Concatenate all the individual friend lists into one dataframe holding each journalist and its friends
# KNOWN BUG: join_friends_csv has a bug, the first friend name is 'username'
# (presumably a csv header row read as data — TODO confirm and fix in twint_tools)
friends_csv = tt.join_friends_csv(journo_handles,keyword)

@_lucyingham follows 513 users.
@JesscaHaworth follows 970 users.
@Ad_Nauseum74 follows 366 users.

Total number of handles pulled: 1849
Number of unique twitter handles: 1709

Zero following in list for users: []


In [12]:
# Write the journalist/friend table to the processed-data folder; the file name is prefixed with the keyword
friends_csv.to_csv(f'../data/processed/{keyword}_journalist_friends.csv', index=False)

<h4>2.4: Load friend information into DB</h4>

In [13]:
# Load in friend information.
# The file name is built from `keyword` so it always matches the csv saved in the previous step
# (it used to be hard-coded to 'cybersecurity').
print('Loading in friends info and drawing [FOLLOWS] edges')
fn_friends = keyword + '_journalist_friends.csv'
gdb.load_friends(fn_friends,graph)

Loading in friends info and drawing [FOLLOWS] edges


<h3>Section 3. Loop over selected journalists handles and scrape their tweets (3.1) and mentions (3.2) using Twint</h3>
    <h4>Section 3.1: Scrape tweets using Twint</h4>

In [16]:
from src.data import twint_tools as tt
# define keyword arguments / 'date_range' = (start, end) of the scrape window
# (None as end presumably means "no upper bound" — TODO confirm in twint_tools),
# 'n_retries' = max number of scrape attempts, 'suppress' = hide critical Twint warnings
kwargs = {'date_range':('2020-08-01 00:00:00', None),
         'n_retries':5,
         'suppress':False}
# multi threading: same queue helper as for the friend lists, here running _search_tweets_by_user with 3 threads
tt.twint_in_queue(tt._search_tweets_by_user, 3, journo_handles, args=('../data/raw/'+keyword+'_',), kwargs=kwargs)

Attempt #1 to get tweets of @_lucyingham
Attempt #1 to get tweets of @JesscaHaworthAttempt #1 to get tweets of @Ad_Nauseum74



CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Results for @Ad_Nauseum74 saved to: ../data/raw/cybersecurity_tweets_Ad_Nauseum74.csv
Results for @JesscaHaworth saved to: ../data/raw/cybersecurity_tweets_JesscaHaworth.csv
Results for @_lucyingham saved to: ../data/raw/cybersecurity_tweets__lucyingham.csv


In [17]:
# Join all the individual tweet csv files into one dataframe
cyber_test = tt.join_tweet_csv(journo_handles, keyword)
# Check: preview the first rows
cyber_test.head()

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1298141120128524288,1298131556251262976,1598335935000,2020-08-25,08:12:15,Romance Daylight Time,335773502,_lucyingham,Lucy Ingham,,...,,,,,,"[{'user_id': '335773502', 'username': '_lucyin...",,,,
1,1298136447258697728,1298131556251262976,1598334821000,2020-08-25,07:53:41,Romance Daylight Time,335773502,_lucyingham,Lucy Ingham,,...,,,,,,"[{'user_id': '335773502', 'username': '_lucyin...",,,,
2,1298015000519487490,1298001949468524549,1598305866000,2020-08-24,23:51:06,Romance Daylight Time,335773502,_lucyingham,Lucy Ingham,,...,,,,,,"[{'user_id': '335773502', 'username': '_lucyin...",,,,
3,1298012906626392067,1298001949468524549,1598305367000,2020-08-24,23:42:47,Romance Daylight Time,335773502,_lucyingham,Lucy Ingham,,...,,,,,,"[{'user_id': '335773502', 'username': '_lucyin...",,,,
4,1296054092234719233,1296054092234719233,1597838349000,2020-08-19,13:59:09,Romance Daylight Time,335773502,_lucyingham,Lucy Ingham,,...,,,,,,"[{'user_id': '335773502', 'username': '_lucyin...",,,,


In [18]:
# Write the joined tweets to the processed-data folder; the file name is prefixed with the keyword
cyber_test.to_csv(f'../data/processed/{keyword}_journalist_tweets.csv', index=False)

<h4>Section 3.2: Extract mentions from Twint dataset</h4>

In [19]:
from src.data import data_cleanup as dc
# From the twint dataset, extract the mentions keyed by tweet id into a separate dataframe
# (it is saved as csv in the next cell)
mentions_twint  = dc.twint_mentions_to_df(cyber_test)
# Check: preview the first rows
mentions_twint.head()

Unnamed: 0,tweet_id,mentions
0,1298141120128524288,delafina777
1,1298136447258697728,delafina777
2,1298015000519487490,berenicejbaker
3,1298012906626392067,berenicejbaker
4,1295467814304849920,journalists4bel


In [20]:
# Write the mentions table to the processed-data folder; the file name is prefixed with the keyword
mentions_twint.to_csv(f'../data/processed/{keyword}_mentions_twint.csv', index=False)

<h3>Section 4. Loop over selected journalists handles and scrape their tweets (4.1) and mentions (4.2) using Twitter API</h3>
    <h4>Section 4.1: Scrape tweets using the Twitter API (TODO: waiting for Rob's function here)</h4>

In [None]:
import json
import tweepy
from src.data import api_tweepy as api
# Load the Twitter API credentials and return a tweepy API instance
# NOTE(review): this repeats the connection made in Section 2.1 and rebinds `tw_api`
tw_api = api.connect_API('../src/data/twitter_credentials.json')

<h4>Section 4.2: Extract mentions from API tweets</h4>


<h3>Section 5. Data cleaning and standardization/LDA</h3>
     <h4>Section 5.1: Clean and standardize Twint dataset</h4>

In [None]:
# Standardise the twint output
from src.data import data_cleanup as dc
# Create the standardized template
test_twint = dc.standard_tweet_dataset_setup()
# Fill the template with the tweets scraped by Twint (the joined dataframe from Section 3.1)
standard_tweet_twint = dc.fill_standard_tweet_dataset_with_twint(test_twint, cyber_test)
# Check: preview the first rows
standard_tweet_twint.head()

In [None]:
# Write the standardized tweets to the processed-data folder; the file name is prefixed with the keyword
standard_tweet_twint.to_csv(f'../data/processed/{keyword}_standard_tweets_twint.csv', index=False)

<h4>Section 5.2: Clean and standardize API dataset</h4>

In [None]:
# add here new cleaning function Rob is working on

<h3>Section 6. Create graph database and import twitter data into it</h3>
    <h4>Section 6.1: Import modules and load graph database</h4>

In [None]:
# import standard libraries
import numpy as np
import pandas as pd
from py2neo import Graph
from py2neo.data import Node, Relationship
from src.data import graphdb as gdb

# load / declare the database
# NOTE(review): this repeats the graph set-up of Section 0.2 (again with new_graph = True) and rebinds `graph`;
# confirm that doing so does not discard the nodes and edges loaded in Section 2
graph = gdb.get_graph(new_graph = True)
# last expression of the cell, so the graph object is shown as the cell output
graph

<h4>Section 6.2: Load user info into graph DB</h4>

In [None]:
# Neo4j import files need to be in a specific folder, however, the csv files saved above are in a different folder.
# To work around this problem on Windows machines it is possible to create a shortcut between the two folders.

# Load in user information.
# The file name is built from `keyword` so it always matches the csv saved in Section 2.1
# (it used to be hard-coded to 'cybersecurity').
print('Loading in user information and drawing (Person) nodes')
fn_users = keyword + '_user_profiles.csv'
gdb.load_users(fn_users ,graph)

<h4>Section 6.2: Load friend information into DB</h4>

In [None]:
# Load in friend information.
# The file name is built from `keyword` so it always matches the csv saved in Section 2.3
# (it used to be hard-coded to 'cybersecurity').
print('Loading in friends info and drawing [FOLLOWS] edges')
fn_friends = keyword + '_journalist_friends.csv'
gdb.load_friends(fn_friends,graph)

<h4>Section 6.3: Load tweet data into DB</h4>

In [None]:
# Load in tweet information.
# The file name is built from `keyword` so it always matches the csv saved in Section 5.1
# (it used to be hard-coded to 'cybersecurity').
# NOTE(review): unlike the other load_* calls, which receive a bare file name, this one is given a
# '/data/processed/' prefix - confirm that gdb.load_tweets really expects this form.
print('Loading in tweets and drawing (Tweet) nodes')
fn_tweets = '/data/processed/' + keyword + '_standard_tweets_twint.csv'
gdb.load_tweets(fn_tweets ,graph)

<h4>Section 6.4: Draw edges between users and their tweets</h4>

In [None]:
# Draw edges between users and their tweets
# (requires the user and tweet data to have been loaded into the graph by the previous cells)
print('Drawing [POSTS] edges')
gdb.get_posts(graph)


<h4>Section 6.5: Load tweets' mentions</h4>

In [None]:
# Load in mentions information.
# The file name is built from `keyword` so it always matches the csv saved in Section 3.2
# (it used to be hard-coded to 'cybersecurity').
print('Loading in mentions and drawing [MENTIONS] edges')
fn_mentions = keyword + '_mentions_twint.csv'
gdb.load_mentions(fn_mentions,graph)

<h4>Section 6.6: Run page rank algorithm using [FOLLOWS] [MENTIONS] edges</h4>

In [None]:
# Run page rank using follower and mention edges
print('running page rank')
# Node labels and edge types passed to the page rank helper
nodelist = ['Person','Tweet']
edgelist = ['FOLLOWS','MENTIONS']
# The result is kept for the weighted sampling step in the next section
page_rank_friends_mentions = gdb.run_pagerank(nodelist,edgelist,graph)

<h4>Section 6.7: Get a weighted random sample from the journalists friends</h4>

In [None]:
# Get a weighted random sample of users from the page rank results
n_sample = 20      # number of users to draw
fields = ['rank']  # field(s) of the page rank output used for weighting
exponents = [2]    # presumably the power applied to each field when computing the weights — TODO confirm in graphdb
# NOTE(review): no random seed is visible here, so the sample may differ between runs
sample = gdb.get_multiple_weighted_sample(page_rank_friends_mentions,n_sample,fields,exponents)