# Setup

In [1]:
%%capture
import sys
!{sys.executable} -m pip install py2neo;
import py2neo
import pandas as pd
from scipy import spatial
from collections import defaultdict
pd.set_option('display.max_columns', 500)

In [2]:
# -- Connect to Neo4j at http://neo4j:7474.
# -- `graph` is the handle every later cell queries through `graph.run(...)`.
graph = py2neo.Graph('http://neo4j:7474')

In [3]:
# -- Func to yield cursor results.
def yield_record(cursor):
    """Yield every remaining record of ``cursor`` as a plain dict.

    Parameters
    ----------
    cursor
        Object exposing ``next()``; each call returns a record that has a
        ``data()`` method. The cursor is expected to signal exhaustion by
        raising ``StopIteration`` from ``next()`` (the behaviour of the
        cursors returned by ``graph.run(...)`` -- NOTE(review): confirm
        against the installed py2neo version).

    Yields
    ------
    dict
        ``record.data()`` for each record, in cursor order.

    Notes
    -----
    Only ``StopIteration`` is treated as "no more rows". Any other exception
    (connection failure, malformed record, ...) propagates to the caller
    instead of being silently turned into an empty/truncated result.
    """
    while True:
        try:
            # Read from the argument itself, never from a same-named global.
            record = cursor.next()
        except StopIteration:
            # A StopIteration must not escape a generator body (it would be
            # re-raised as RuntimeError), so end the generator explicitly.
            return
        yield record.data()

## Data Summary

In [4]:
# -- Total number of COMMENTED relationships in the graph.
comment_count_row = graph.run("""MATCH ()-[r:COMMENTED]->() RETURN count(*)""").next().data()
Ncomments = comment_count_row['count(*)']
# -- Per-label node count plus degree statistics (mean, std. dev., median, min, max).
cur = graph.run("""MATCH (n)
RETURN
DISTINCT labels(n),
count(*) AS nNodes,
avg(size( (n)-[]-() ) ) as avgDeg,
stdev(size( (n)-[]-() ) ) as stdDeg,
percentileDisc(size( (n)-[]-() ), 0.5) as medDeg, 
min(size( (n)-[]-() ) ) as minDeg,
max(size( (n)-[]-() ) ) as maxDeg""")
# -- One row per node label, in the column order the query returns them.
summary_rows = [list(record.values()) for record in yield_record(cur)]
# -- The relationship count is appended as a short row; the degree columns stay empty for it.
summary_rows.append([['COMMENTED'], Ncomments])
summary_columns = [
    'Label',
    'N',
    'Mean Degree',
    'Std. Dev. Degree',
    'Median Degree',
    'Min. Degree',
    'Max. Degree',
]
# -- Load into pandas for display.
summary = pd.DataFrame(summary_rows, columns=summary_columns)
summary

Unnamed: 0,Label,N,Mean Degree,Std. Dev. Degree,Median Degree,Min. Degree,Max. Degree
0,[ARTICLE],9298,185.409228,315.736664,57.0,1.0,4996.0
1,[USER],301682,5.714411,21.186375,1.0,0.0,1571.0
2,[COMMENTED],1723935,,,,,


Most users wrote only a handful of comments, but there are some super users who wrote 1K+ comments in the same timespan. Note that there are USER nodes with a degree of 0 (i.e., they are not connected to any articles).
### What articles have the highest degree (i.e., most comments)?

In [5]:
# -- Sort articles by the most incoming comment relationships; return top 5.
# -- The first MATCH/WITH ranks articles by incoming COMMENTED count and keeps 5;
# -- the second MATCH re-counts those relationships so the count is returned with each node.
cur = graph.run("""MATCH (a:ARTICLE)
WITH a, SIZE(()-[:COMMENTED]->(a)) as cmtCnt
ORDER BY cmtCnt DESC LIMIT 5
MATCH p=()-[:COMMENTED]->(a)
RETURN a, count(p)""")
# -- Load into pandas for display: one row per article (its node properties plus a
# -- 'Comments' column), re-sorted because the final RETURN carries no ORDER BY.
# -- webURL is rendered as a clickable link; the anchor tag has exactly one closing
# -- quote after the href value so the emitted HTML is well-formed.
pd.DataFrame([{**res['a'], 'Comments': res['count(p)']} for res in yield_record(cur)]) \
    .sort_values('Comments', ascending=False) \
    .set_index('articleID') \
    .style.format({'webURL': lambda x: f'<a target="_blank" href="{x}">Link</a>'})

Unnamed: 0_level_0,Comments,byline,connComponent,headline,newDesk,pagerank,pubDate,webURL
articleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
58b0894195d0e0247463875e,4996,By JULIE HIRSCHFELD DAVIS and MICHAEL M. GRYNBAUM,159386,Trump Intensifies Criticism of F.B.I. and Journalists,National,195.696,2017-02-24 19:27:53,Link
5912391b7c459f24986de9ab,4184,By MICHAEL D. SHEAR and MATT APUZZO,167890,Trump Fires Comey Amid Russia Inquiry,National,157.107,2017-05-09 21:48:03,Link
58ebb1437c459f24986d96ed,4014,By DANIEL VICTOR and MATT STEVENS,167890,"Man Is Dragged From a Full Jet, Stirring a Furor",Business,239.803,2017-04-10 16:22:22,Link
591a524d7c459f24986dfc28,3791,By DAVID BROOKS,167890,When a Child Is Leading The World,OpEd,165.153,2017-05-16 01:13:44,Link
5930616f7c459f24986e2e41,3709,By MICHAEL D. SHEAR,167890,Trump Abandoning Global Climate Accord,Foreign,171.479,2017-06-01 18:48:08,Link


### What users have the highest degree (i.e., most comments)?

In [6]:
# -- Rank users by their number of outgoing COMMENTED relationships and keep the top 5.
cur = graph.run("""MATCH (u:USER)
WITH u, SIZE((u)-[:COMMENTED]->()) as cmtCnt
ORDER BY cmtCnt DESC LIMIT 5
MATCH p=(u)-[:COMMENTED]->()
RETURN u, count(p)""")
# -- Flatten each result into one row: the user's properties plus its comment count.
user_rows = []
for res in yield_record(cur):
    user_row = {**res['u']}
    user_row['Comments'] = res['count(p)']
    user_rows.append(user_row)
# -- Load into pandas for display (graph-metric columns are not shown here).
(
    pd.DataFrame(user_rows)
    .drop(columns=['pagerank', 'connComponent'])
    .sort_values('Comments', ascending=False)
    .set_index('userID')
)

Unnamed: 0_level_0,Comments,userDisplayName,userLocation
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
61986282.0,1571,Phyliss Dalmatian,"Wichita, Kansas"
17374907.0,1235,Blackmamba,Il
47123844.0,1232,Richard Luettgen,New Jersey
47112177.0,1174,manfred m,Bolivia
37475504.0,1157,John Doe,Johnstown


Here are some real power users. Most are using pseudonyms. However, Richard Luettgen was a noted NYT commenter: see [A Tribute to a Prolific Times Commenter](https://www.nytimes.com/2018/12/21/reader-center/a-tribute-to-a-prolific-times-commenter.html).
## How many subgraphs are there?

In [7]:
# -- One row per distinct `connComponent` value with its node count, largest first.
cur = graph.run("""MATCH (n) 
RETURN distinct(n.connComponent) as partition, count(*) as nNodes 
ORDER by nNodes DESC""")
records = [rec for rec in yield_record(cur)]
# -- Each record is one partition, so the record count is the subgraph count.
n_subgraphs = len(records)
print('Subgraphs: {}'.format(n_subgraphs))
# -- Load into pandas, keyed by partition id, and preview the largest partitions.
df = pd.DataFrame(records)
df = df.set_index('partition')
df.head()

Subgraphs: 2369


Unnamed: 0_level_0,nNodes
partition,Unnamed: 1_level_1
159386,208774
167890,99666
150435,13
190309,6
217346,6


There are 2 main subgraphs that comprise most of our data. The remaining subgraphs are tiny (13 nodes or fewer each).
## How similar are the two primary subgraphs?
### What's the cosine similarity in the relative count of comments to each news desk?

In [8]:
# -- Comment counts per news desk, keyed by partition id.
partitions = {}
# -- Only partitions with more than 15 nodes are compared (the primary subgraphs).
large_partition_ids = df[df.nNodes > 15].index
for idx in large_partition_ids:
    # -- Count the comments attached to this partition's articles, grouped by news desk.
    op = """MATCH ()-[r:COMMENTED]-(n:ARTICLE) 
    WHERE n.connComponent = {} 
    RETURN n.newDesk, count(r)""".format(idx)
    cur = graph.run(op)
    records = list(yield_record(cur))
    # -- Map news desk -> comment count for this partition.
    desk_counts = {}
    for rec in records:
        desk_counts[rec['n.newDesk']] = rec['count(r)']
    partitions[idx] = desk_counts
# -- Rows are partitions, columns are news desks; a desk missing from a partition counts as 0.
df1 = pd.DataFrame(partitions).fillna(0).T
# -- Divide every row by its own total so each partition's shares sum to 1.
row_totals = df1.sum(axis=1).values[:, None]
df2 = pd.DataFrame(df1.values / row_totals, columns=df1.columns, index=df1.index)
df2

Unnamed: 0,Arts&Leisure,Automobiles,BookReview,Business,Climate,Culture,Dining,EdLife,Editorial,Express,Foreign,Games,Insider,Investigative,Learning,Letters,Magazine,Metro,Metropolitan,NYTNow,National,NewsDesk,Obits,OpEd,Photo,Podcasts,Politics,RealEstate,Science,Smarter Living,Society,SpecialSections,Sports,Styles,Summary,SundayBusiness,TStyle,Travel,Unknown,Upshot,Video,Washington,Weekend,Well
159386,0.003127,0.0,0.001513,0.045408,0.004445,0.012598,0.008019,0.000552,0.068718,0.002682,0.037567,0.008922,0.002965,0.006598,0.018577,0.000778,0.018825,0.012989,0.003275,3.3e-05,0.163603,0.001659,0.001789,0.37119,0.000192,7.3e-05,0.006393,0.003972,0.013778,0.000504,8.6e-05,0.001013,0.012226,0.005383,2.3e-05,0.003154,0.000145,0.003088,0.00292,0.015905,0.000244,0.116943,0.002504,0.015621
167890,0.003314,1.7e-05,0.000837,0.051481,0.0,0.013302,0.011247,0.002019,0.114654,0.0,0.117478,0.006877,0.003416,0.0,0.015518,5.5e-05,0.025351,0.013359,0.000954,0.0,0.183552,0.0,0.0,0.365244,0.0,0.0,0.0,0.002816,0.011994,0.0,0.0,0.0,0.010396,0.001863,2e-05,0.003862,0.0,0.003815,0.000657,0.021299,0.0,0.0,0.003366,0.011237


In [9]:
# -- Cosine similarity between the news-desk comment counts of the first two partitions.
# -- scipy returns a cosine *distance*, so similarity is 1 minus that value.
first_partition_counts = df1.iloc[0].fillna(0)
second_partition_counts = df1.iloc[1].fillna(0)
1 - spatial.distance.cosine(first_partition_counts, second_partition_counts)

0.94123504075479825

The two graphs have a very high cosine similarity based on the news desks that their users are commenting on.
## What are the most influential articles (i.e., what articles have the highest PageRank)?

In [10]:
# -- Collect top 5 articles by PageRank (the `pagerank` property stored on each ARTICLE node).
cur = graph.run("""MATCH (n:ARTICLE)
RETURN n
ORDER by n.pagerank
DESC LIMIT 5""")
# -- Load into pandas for display: one row per article node, highest pagerank first,
# -- without the connComponent column.
# -- webURL is rendered as a clickable link; the anchor tag has exactly one closing
# -- quote after the href value so the emitted HTML is well-formed.
pd.DataFrame([rec['n'] for rec in yield_record(cur)]) \
    .sort_values('pagerank', ascending=False) \
    .set_index('articleID') \
    .drop('connComponent', axis=1) \
    .style.format({'webURL': lambda x: f'<a target="_blank" href="{x}">Link</a>'})

Unnamed: 0_level_0,byline,headline,newDesk,pagerank,pubDate,webURL
articleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
58e4d28e7c459f24986d87c9,By KATHERINE SCHULTEN,Our Eighth Annual Found Poem Student Contest,Learning,272.599,2017-04-05 11:18:34,Link
58ebb1437c459f24986d96ed,By DANIEL VICTOR and MATT STEVENS,"Man Is Dragged From a Full Jet, Stirring a Furor",Business,239.803,2017-04-10 16:22:22,Link
58b0894195d0e0247463875e,By JULIE HIRSCHFELD DAVIS and MICHAEL M. GRYNBAUM,Trump Intensifies Criticism of F.B.I. and Journalists,National,195.696,2017-02-24 19:27:53,Link
5930616f7c459f24986e2e41,By MICHAEL D. SHEAR,Trump Abandoning Global Climate Accord,Foreign,171.479,2017-06-01 18:48:08,Link
591a524d7c459f24986dfc28,By DAVID BROOKS,When a Child Is Leading The World,OpEd,165.153,2017-05-16 01:13:44,Link


Interesting that the top two articles by page rank are apolitical. Also, it's wholesome to see that a poem contest is the most central article (art bringing people together).
## Simple collaborative filtering (i.e., commenters of this article also commented on)

In [13]:
# -- Get a random article to give recommendations for.
# -- Every ARTICLE is tagged with rand() and the first one after sorting is kept,
# -- so the selected article differs from run to run.
cur = graph.run("MATCH (n:ARTICLE) WITH n, rand() as r ORDER BY r RETURN n LIMIT 1")
# -- `article` holds the selected node; the next cell reads its articleID and headline.
article = cur.next().data()['n']
# -- Load into pandas for display.
# -- webURL is rendered as a clickable link; the anchor tag has exactly one closing
# -- quote after the href value so the emitted HTML is well-formed.
pd.DataFrame([article]) \
    .set_index('articleID') \
    .style.format({'webURL': lambda x: f'<a target="_blank" href="{x}">Link</a>'})

Unnamed: 0_level_0,byline,connComponent,headline,newDesk,pagerank,pubDate,webURL
articleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5ab8b69447de81a901217296,By SHANNON DOYNE,159386,Do Other People Care Too Much About Your Post-High School Plans?,Learning,3.7115,2018-03-26 09:00:00,Link


In [14]:
# -- Using the random article above,
# -- collect other articles that are 2 hops away
# -- (i.e., (given_article)<-[:COMMENTED]-(:USER)-[:COMMENTED]->(articles_of_interest)).
# -- The article id and headline are sent as Cypher parameters instead of being
# -- formatted into the query text, so a headline containing a quote character
# -- cannot break (or alter) the query.
cur = graph.run(
    """MATCH (n:ARTICLE {articleID: $articleID})-[]-(:USER)-[]-(m:ARTICLE)
WHERE NOT m.headline = $headline
RETURN m""",
    {'articleID': article['articleID'], 'headline': article['headline']},
)
# -- Create a dataframe (one row per 2-hop path, so an article repeats once per
# -- shared commenter path) and drop articles with 'Unknown' headlines.
df = pd.DataFrame([rec['m'] for rec in yield_record(cur)])
df = df[df.headline != 'Unknown']
# -- Collect a count of potential recommendations: N = number of rows per articleID.
N = pd.DataFrame(df.groupby('articleID').size().rename('N'))
# -- Merge count of recommendations into 2 hop articles for display,
# -- collapsing the repeated rows to one per article.
df1 = df.merge(N, left_on='articleID', right_on='articleID').drop_duplicates().set_index('articleID')
# -- Show top 5 articles by 2 hop connections and break ties with pagerank.
# -- webURL is rendered as a clickable link; the anchor tag has exactly one closing
# -- quote after the href value so the emitted HTML is well-formed.
df1.sort_values(['N', 'pagerank'], ascending=False) \
    .head(5) \
    .style.format({'webURL': lambda x: f'<a target="_blank" href="{x}">Link</a>'})

Unnamed: 0_level_0,byline,connComponent,headline,newDesk,pagerank,pubDate,webURL,N
articleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5a9fb845410cf7000162f27d,By CAROLINE CROSSON GILPIN,159386,Should the Voting Age Be Lowered to 16?,Learning,8.01729,2018-03-07 10:00:31,Link,17
5a8fe63410f40f00018c2971,By NATALIE PROULX,159386,Should Teachers Be Armed With Guns?,Learning,36.4021,2018-02-23 10:00:01,Link,13
5a9920cb410cf7000162ed5b,By NATALIE PROULX,159386,Is It Harder to Grow Up in the 21st Century Than It Was in the Past?,Learning,9.41816,2018-03-02 10:00:01,Link,11
5ab3709647de81a901215188,By NATALIE PROULX,159386,What Makes Someone a Great Leader?,Learning,2.72779,2018-03-22 09:00:01,Link,11
5ab0cd9647de81a9012130e2,By NATALIE PROULX,159386,Are We Being Bad Citizens If We Don’t Keep Up With the News?,Learning,5.80189,2018-03-20 09:00:01,Link,8
