In [131]:
import pandas as pd
from geopy.distance import geodesic

In [2]:
#first we just want to get the topics assigned to each article in our data 

IN_PATH = "/shared/3/projects/newsDiffusion/data/processed/fullDataWithClustNums.tsv"


#we are bringing in the data with demographic information and cluster ids 
#hence, merged
#MVP path merged = pd.read_csv("/shared/3/projects/benlitterer/localNews/data/interim/SingleNE_85_clustered.tsv", sep="\t")
merged = pd.read_csv(IN_PATH, sep="\t")
merged = merged.dropna(subset=["clustNum"])

#look at which outlet first started a story 

#first make sure we have the date of an outlet correctly specified 
merged["date"] = pd.to_datetime(merged["date"])


  exec(code_obj, self.user_global_ns, self.user_ns)


In [157]:
#get our topics in the mix 
docTopics = pd.read_csv("/shared/3/projects/newsDiffusion/data/interim/topicModelling/doc_topics.txt", sep="\t")

In [18]:
#get pairwise information
#by sorting in this way then calling drop_duplicates, we only consider the first mention of a story by 
#an outlet. Simplifies the analysis but introduces an assumption 
#example: merged.head(300)[["clustNum", "source", "date"]].sort_values(["clustNum", "source", "date"]).head(20)
pairwise = merged.sort_values(["clustNum", "source", "date"]).drop_duplicates(subset=["clustNum", "source"])

#group into clusters
demVars = ["clustNum", "source"]
pairwise = pairwise.loc[pairwise["national"] == False, ["key"] + demVars]
pairwise = pairwise.groupby(by="clustNum").agg(list)
pairwise["clustSize"] = pairwise["source"].apply(len)

#NOTE: we remove very large clusters? because they slow down getting pairwise information 
pairwise = pairwise[pairwise["clustSize"] < 2000].drop(columns=["clustSize"])

In [19]:
pairwise.head()

Unnamed: 0_level_0,key,source
clustNum,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,"[45426, 44274, 43496, 43909, 12619, 43933, 465...","[albuquerquejournal, aurorasentinel, dailyhera..."
1.0,"[246286, 311389, 311867, 254639, 244441, 25523...","[albuquerquejournal, aurorasentinel, boonville..."
7.0,"[291888, 197081, 7321, 30836, 1771, 2373, 2, 1...","[addisoncountyindependent, advertiserdemocrat,..."
8.0,"[203480, 204268, 203832, 215372]","[lasvegasreviewjournal, thekennebecjournal, th..."
14.0,"[428234, 429463, 430009, 428700]","[albuquerquejournal, aurorasentinel, thelowell..."


In [20]:
def getPairwise(inList):
    outList = []
    for i in range(0, len(inList)-1): 
        for j in range(i + 1, len(inList)): 
            outList.append((inList[i], inList[j]))
    return outList

#applied across an entire series 
def getPairwiseSeries(inSeries): 
    return inSeries.apply(getPairwise)

pairwise["source"] = pairwise["source"].apply(getPairwise)
pairwise["key"] = pairwise["key"].apply(getPairwise)
pairwise = pairwise.explode(["key", "source"])
#pairwise[["date1", "date2"]] = pd.DataFrame(pairwise["date"].tolist(), index=pairwise.index)

In [21]:
pairwise.head()

Unnamed: 0_level_0,key,source
clustNum,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,"(45426, 44274)","(albuquerquejournal, aurorasentinel)"
0.0,"(45426, 43496)","(albuquerquejournal, dailyherald)"
0.0,"(45426, 43909)","(albuquerquejournal, murrysvillestar)"
0.0,"(45426, 12619)","(albuquerquejournal, newyorkobserver)"
0.0,"(45426, 43933)","(albuquerquejournal, pittsburghtribunereview)"


In [12]:
pairwise = pairwise.dropna()
pairwise = pairwise.drop(columns=["key"])

In [23]:
pairwise[["key1", "key2"]] = pd.DataFrame(pairwise["key"].tolist(), index=pairwise.index)

In [25]:
pairwise[["source1", "source2"]] = pd.DataFrame(pairwise["source"].tolist(), index=pairwise.index)

In [30]:
pairwise = pairwise.drop(columns=["key", "source"])
pairwise = pairwise.reset_index()

In [31]:
#now we want to add examples of pairs that didn't occur!
pairwise.head()

Unnamed: 0,clustNum,key1,key2,source1,source2
0,0.0,45426.0,44274.0,albuquerquejournal,aurorasentinel
1,0.0,45426.0,43496.0,albuquerquejournal,dailyherald
2,0.0,45426.0,43909.0,albuquerquejournal,murrysvillestar
3,0.0,45426.0,12619.0,albuquerquejournal,newyorkobserver
4,0.0,45426.0,43933.0,albuquerquejournal,pittsburghtribunereview


In [62]:
#linkDict = dict(zip(pairwise["key1"], pairwise["key2"]))

In [68]:
linkDict = pairwise[["key1", "key2"]].groupby("key1").agg(list).to_dict(orient="index")

In [46]:
allKeys = merged["key"]

In [77]:
N_SAMPLES

646949

In [86]:
import numpy as np
pairwise = pairwise.replace("None", np.nan).dropna()

In [92]:
outletGraph = nx.from_pandas_edgelist(pairwise[["key1", "key2"]], source="key1", target="key2", create_using=nx.Graph)

In [98]:
#we want to add edges every __ number of edges that we come across 
(len(merged["key"]) ** 2 - len(outletGraph.edges())) / len(pairwise)

2626079.9041486597

In [101]:
nonEdges = nx.non_edges(outletGraph)
edgeList = []
for i, nonEdge in enumerate(nonEdges): 
    if i % 2000000:  
        edgeList.append(nonEdge)
    if len(edgeList) == len(pairwise): 
        break

In [112]:
nonMatches = pd.DataFrame(edgeList, columns=["key1", "key2"])
nonMatches["linked"] = 0
nonMatches = pd.merge(nonMatches, merged[["key", "source"]], left_on=["key1"], right_on=["key"])
nonMatches = pd.merge(nonMatches, merged[["key", "source"]], left_on=["key2"], right_on=["key"]).drop(columns=["key_x", "key_y"]).rename(columns={"source_x":"source1", "source_y":"source2"})

In [115]:
pairwise["linked"] = 1

In [137]:
allPairs = pd.concat([pairwise.drop(columns=["clustNum"]), nonMatches])

In [138]:
#TODO: extend to have everything we're interested in 
#we use this to merge demographic variables onto our configuration model network 
outletDf = merged[["source", "white_pct", "lesscollege_pct", "median_hh_inc", "rural_pct", "age65andolder_pct", "total_population", "lat", "lon", "national", "state"]].drop_duplicates(["source"]).drop_duplicates()

outletDf = outletDf[outletDf["national"] == False].drop(columns=["national"])


In [139]:
allPairs.head()

Unnamed: 0,key1,key2,source1,source2,linked
0,45426.0,44274.0,albuquerquejournal,aurorasentinel,1
1,45426.0,43496.0,albuquerquejournal,dailyherald,1
2,45426.0,43909.0,albuquerquejournal,murrysvillestar,1
3,45426.0,12619.0,albuquerquejournal,newyorkobserver,1
4,45426.0,43933.0,albuquerquejournal,pittsburghtribunereview,1


In [140]:
#now try a quick regression accounting for distance as well 
def renameLeft(inStr): 
    if "source" not in inStr and "sharedStories" not in inStr and "key" not in "str": 
        return inStr + "Left"
    return inStr

def renameRight(inStr): 
    if "source" not in inStr and "sharedStories" not in inStr and "Left" not in inStr and "key" not in "str": 
        return inStr + "Right"
    return inStr

#merge in outlet level data 
allPairs = pd.merge(allPairs, outletDf, how="left", left_on="source1", right_on="source").drop(columns=["source"]).rename(renameLeft, axis=1)
allPairs = pd.merge(allPairs, outletDf, how="left", left_on="source2", right_on="source").drop(columns=["source"]).rename(renameRight, axis=1)
allPairs = allPairs.replace("None", np.nan).dropna()

In [141]:
allPairs.head()

Unnamed: 0,key1Left,key2Left,source1,source2,linkedLeft,white_pctLeft,lesscollege_pctLeft,median_hh_incLeft,rural_pctLeft,age65andolder_pctLeft,...,stateLeft,white_pctRight,lesscollege_pctRight,median_hh_incRight,rural_pctRight,age65andolder_pctRight,total_populationRight,latRight,lonRight,stateRight
0,45426.0,44274.0,albuquerquejournal,aurorasentinel,1,39.9337855321091,67.2165170335752,48994,4.19551922531257,14.1614192540647,...,New Mexico,61.9199958553786,59.2703284151326,66288,1.57534138806964,11.8152470259104,617668,39.729432,-104.831919,Colorado
1,45426.0,43496.0,albuquerquejournal,dailyherald,1,39.9337855321091,67.2165170335752,48994,4.19551922531257,14.1614192540647,...,New Mexico,42.8688445407287,63.5288471949041,56902,0.0452963852406551,12.9631616954324,5227575,42.08836,-87.980626,Illinois
2,45426.0,43909.0,albuquerquejournal,murrysvillestar,1,39.9337855321091,67.2165170335752,48994,4.19551922531257,14.1614192540647,...,New Mexico,94.231684275844,73.0941908603275,54142,25.3635987720754,20.6693806225774,359377,40.428401,-79.697545,Pennsylvania
3,45426.0,12619.0,albuquerquejournal,newyorkobserver,1,39.9337855321091,67.2165170335752,48994,4.19551922531257,14.1614192540647,...,New Mexico,47.0705307497482,39.565360372262,75513,0.0,14.3837053337974,1634989,40.712775,-74.005973,New York
4,45426.0,43933.0,albuquerquejournal,pittsburghtribunereview,1,39.9337855321091,67.2165170335752,48994,4.19551922531257,14.1614192540647,...,New Mexico,79.3682336877012,60.9304185701967,54357,2.4862917174835,17.4036867258363,1230360,40.440625,-79.995886,Pennsylvania


In [142]:
#categories to go in our regression 
demCats = ["lesscollege_pct", "white_pct", "median_hh_inc", "rural_pct", "age65andolder_pct", "total_population"]
for cat in demCats: 
    allPairs[cat + "Diff"] = (allPairs[cat + "Right"].astype(float) - allPairs[cat + "Left"].astype(float)).abs()

In [143]:
#allPairs["dist"] = allPairs[["latLeft", "lonLeft", "latRight", "lonRight"]].apply(lambda x: geodesic((x["latLeft"], x["lonLeft"]), (x["latRight"], x["lonRight"])).miles, axis=1)

In [145]:
allPairs.columns

Index(['key1Left', 'key2Left', 'source1', 'source2', 'linkedLeft',
       'white_pctLeft', 'lesscollege_pctLeft', 'median_hh_incLeft',
       'rural_pctLeft', 'age65andolder_pctLeft', 'total_populationLeft',
       'latLeft', 'lonLeft', 'stateLeft', 'white_pctRight',
       'lesscollege_pctRight', 'median_hh_incRight', 'rural_pctRight',
       'age65andolder_pctRight', 'total_populationRight', 'latRight',
       'lonRight', 'stateRight', 'lesscollege_pctDiff', 'white_pctDiff',
       'median_hh_incDiff', 'rural_pctDiff', 'age65andolder_pctDiff',
       'total_populationDiff'],
      dtype='object')

In [147]:
sampDf = allPairs.sample(250000)

In [149]:
sampDf = sampDf.rename(columns={"key1Left":"key1", "key2Left":"key2"})

In [161]:
docTopics.columns = ["index", "key"] + ["Topic" + str(i) for i in range(40)]

In [162]:
docTopics.head()

Unnamed: 0,index,key,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,...,Topic30,Topic31,Topic32,Topic33,Topic34,Topic35,Topic36,Topic37,Topic38,Topic39
0,1,2283486,0.000322,0.000322,0.018363,0.000322,0.000322,0.000322,0.000322,0.028673,...,0.000322,0.000322,0.010631,0.008054,0.067332,0.000322,0.049291,0.000322,0.000322,0.000322
1,2,1543217,0.0079,0.000465,0.000465,0.048792,0.015335,0.000465,0.000465,0.000465,...,0.000465,0.000465,0.000465,0.130576,0.019052,0.033922,0.000465,0.000465,0.000465,0.0079
2,3,1543216,0.001984,0.001984,0.001984,0.335317,0.001984,0.001984,0.001984,0.001984,...,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984
3,4,1543215,0.000422,0.000422,0.000422,0.60853,0.000422,0.091639,0.017314,0.010557,...,0.000422,0.000422,0.000422,0.003801,0.000422,0.000422,0.000422,0.003801,0.000422,0.138936
4,5,1543214,0.003788,0.003788,0.003788,0.003788,0.003788,0.003788,0.003788,0.003788,...,0.003788,0.003788,0.003788,0.094697,0.003788,0.094697,0.003788,0.003788,0.003788,0.003788


In [164]:
regressDf = pd.merge(sampDf, docTopics, left_on="key1", right_on="key")
regressDf = pd.merge(regressDf, docTopics, left_on="key2", right_on="key")

In [232]:
for i in range(40): 
    regressDf["Topic" + str(i) + "Mean"] = (regressDf["Topic" + str(i) + "_y"] + regressDf["Topic" + str(i) + "_x"]) / 2
    regressDf["Topic" + str(i) + "Diff"] = (regressDf["Topic" + str(i) + "_y"] + regressDf["Topic" + str(i) + "_x"]).abs()

In [233]:
regressDf.columns

Index(['key1', 'key2', 'source1', 'source2', 'linkedLeft', 'white_pctLeft',
       'lesscollege_pctLeft', 'median_hh_incLeft', 'rural_pctLeft',
       'age65andolder_pctLeft',
       ...
       'Topic30Mean', 'Topic31Mean', 'Topic32Mean', 'Topic33Mean',
       'Topic34Mean', 'Topic35Mean', 'Topic36Mean', 'Topic37Mean',
       'Topic38Mean', 'Topic39Mean'],
      dtype='object', length=194)

In [240]:
diffStr = " + ".join(["Topic" + str(i) + "Diff" for i in range(40)])
meanStr = " + ".join(["Topic" + str(i) + "Mean" for i in range(40)])
#regressStr = "linkedLeft ~ (" + diffStr + ") * rural_pctDiff + " + meanStr
regressStr = diffStr

In [241]:
regressStr

'linkedLeft ~ (Topic0Diff + Topic1Diff + Topic2Diff + Topic3Diff + Topic4Diff + Topic5Diff + Topic6Diff + Topic7Diff + Topic8Diff + Topic9Diff + Topic10Diff + Topic11Diff + Topic12Diff + Topic13Diff + Topic14Diff + Topic15Diff + Topic16Diff + Topic17Diff + Topic18Diff + Topic19Diff + Topic20Diff + Topic21Diff + Topic22Diff + Topic23Diff + Topic24Diff + Topic25Diff + Topic26Diff + Topic27Diff + Topic28Diff + Topic29Diff + Topic30Diff + Topic31Diff + Topic32Diff + Topic33Diff + Topic34Diff + Topic35Diff + Topic36Diff + Topic37Diff + Topic38Diff + Topic39Diff) * rural_pctDiff + Topic0Mean + Topic1Mean + Topic2Mean + Topic3Mean + Topic4Mean + Topic5Mean + Topic6Mean + Topic7Mean + Topic8Mean + Topic9Mean + Topic10Mean + Topic11Mean + Topic12Mean + Topic13Mean + Topic14Mean + Topic15Mean + Topic16Mean + Topic17Mean + Topic18Mean + Topic19Mean + Topic20Mean + Topic21Mean + Topic22Mean + Topic23Mean + Topic24Mean + Topic25Mean + Topic26Mean + Topic27Mean + Topic28Mean + Topic29Mean + Topic30M

In [242]:
import statsmodels.formula.api as smf
mod = smf.ols(regressStr, data=regressDf).fit()

In [260]:
pvals = pd.DataFrame(mod.pvalues, columns=["pValues"]).reset_index()
params = pd.DataFrame(mod.params, columns=["params"]).reset_index()

resultsDf = pd.merge(pvals, params, on="index")
#resultsDf["pValues"] = resultsDf["pValues"].astype(float)
#resultsDf["params"] = resultsDf["params"].astype(float)
resultsDf = resultsDf[(resultsDf["index"].str.contains(":")) & (resultsDf["pValues"] <= .5)].sort_values("params")

In [261]:
import re
def getTopicNum(inStr): 
    cleanStr = inStr.split(":")[0]
    return re.sub("[^0-9]", "", cleanStr)

resultsDf["topicNum"] = resultsDf["index"].apply(getTopicNum).astype(int)

In [262]:
topicWords = pd.read_csv("/shared/3/projects/newsDiffusion/data/interim/topicModelling/topic_keys.txt", sep="\t", names=["topicNum", "rand", "words"])
topicWords.head()

Unnamed: 0,topicNum,rand,words
0,0,0.125,league season club united players team game ba...
1,1,0.125,cases covid deaths county state health number ...
2,2,0.125,game season team games players play coach foot...
3,3,0.125,water park animals river lake area dog animal ...
4,4,0.125,die der und das den von mit ist nicht sich auf...


In [263]:
resultsDf.head()

Unnamed: 0,index,pValues,params,topicNum
73,Topic31Diff:rural_pctDiff,0.0,-0.011704,31
65,Topic23Diff:rural_pctDiff,3.554707e-40,-0.008513,23
69,Topic27Diff:rural_pctDiff,3.243388e-63,-0.006028,27
62,Topic20Diff:rural_pctDiff,9.225042e-87,-0.003768,20
72,Topic30Diff:rural_pctDiff,8.933682999999999e-26,-0.002731,30


In [265]:
topicsMerged = pd.merge(resultsDf, topicWords, on="topicNum", how="left").drop(columns=["rand"])
topicsMerged.head(10)

Unnamed: 0,index,pValues,params,topicNum,words
0,Topic31Diff:rural_pctDiff,0.0,-0.011704,31,people state masks coronavirus social health d...
1,Topic23Diff:rural_pctDiff,3.554707e-40,-0.008513,23,people world political power america governmen...
2,Topic27Diff:rural_pctDiff,3.243388e-63,-0.006028,27,people time don family n't back life day told ...
3,Topic20Diff:rural_pctDiff,9.225042e-87,-0.003768,20,police man officers year-old county arrested o...
4,Topic30Diff:rural_pctDiff,8.933682999999999e-26,-0.002731,30,fire california san a.m p.m argus/dispatch ang...
5,Topic25Diff:rural_pctDiff,1.551018e-09,-0.002217,25,biden campaign democratic trump party voters s...
6,Topic5Diff:rural_pctDiff,0.1001501,-0.002202,5,government people johnson lockdown london mini...
7,Topic14Diff:rural_pctDiff,7.585089e-15,-0.001881,14,family years church home wife died john funera...
8,Topic3Diff:rural_pctDiff,2.323703e-17,-0.00185,3,water park animals river lake area dog animal ...
9,Topic28Diff:rural_pctDiff,0.002505391,-0.001028,28,food restaurant store farmers meat make custom...
