In [3]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
IN_PATH = "/shared/3/projects/newsDiffusion/data/processed/fullDataWithClustNums.tsv"
df = pd.read_csv(IN_PATH, sep="\t")

# Parse the date column and drop articles that have no date.
df["date"] = pd.to_datetime(df["date"])
df = df.dropna(subset=["date"])


# We can only keep clusters whose first article occurred on or after the
# first day of local coverage in our data, 4/1.
# `== False` (rather than `~`) leaves rows with a missing `national` flag out of the local set.
firstLocDate = df.loc[df["national"] == False, "date"].min()

# Earliest article date per cluster, computed directly by the groupby instead of
# collecting every date into a Python list first.
# NOTE: clusteredDates now holds only `clustNum` and `minDate`.
clusteredDates = df.groupby("clustNum", as_index=False).agg(minDate=("date", "min"))
keepClusts = clusteredDates.loc[clusteredDates["minDate"] >= firstLocDate, "clustNum"]

df = df[df["clustNum"].isin(keepClusts)]
# A bare `len(df)` in the middle of a cell is never displayed, so print it.
print(f"articles kept after first-local-date filter: {len(df)}")

# One row per cluster; every other column becomes the list of its articles' values.
CLUSTER_LIST_COLS = ["source", "key", "title", "date", "national", "median_hh_inc", "rural_pct", "total_population"]
clustDf = df[["clustNum"] + CLUSTER_LIST_COLS].groupby("clustNum").agg(list)
clustDf["clustSize"] = clustDf["key"].apply(len)

# Drop very large clusters (3000 articles or more).
MAX_CLUSTER_SIZE = 3000
clustDf = clustDf[clustDf["clustSize"] < MAX_CLUSTER_SIZE]

# Duration in days, counting both the first and the last day (hence the + 1).
clustDf["duration"] = ((clustDf["date"].apply(max) - clustDf["date"].apply(min)) / np.timedelta64(1, "D")) + 1

def firstDayPercent(inList): 
    """Return the share of entries in `inList` that equal its earliest entry.

    Despite the name, the result is a fraction between 0 and 1, not a
    percentage.

    Parameters
    ----------
    inList : iterable of comparable values (the notebook passes each cluster's
        list of article dates).

    Returns
    -------
    float
        Fraction of non-missing entries equal to the minimum non-missing entry;
        NaN when there are no non-missing entries.
    """
    # Drop missing values *before* taking the minimum. The previous `if day`
    # truthiness filter ran only after min() had already seen every element.
    days = [day for day in inList if pd.notna(day)]
    if not days:
        # Nothing to compare: return NaN instead of letting min() raise ValueError.
        return np.nan
    dayOne = min(days)
    return np.mean([day == dayOne for day in days])

clustDf["dayOnePercent"] = clustDf["date"].apply(firstDayPercent)

#apply duration cutoff
# .copy() makes the filtered frame independent of clustDf, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
MIN_DURATION_DAYS = 2
MAX_DURATION_DAYS = 30
twoThirty = clustDf[(clustDf["duration"] >= MIN_DURATION_DAYS) & (clustDf["duration"] <= MAX_DURATION_DAYS)].copy()
twoThirty["outletCount"] = twoThirty["source"].apply(lambda x: len(set(x)))

#apply outlet number cutoff 
MIN_OUTLETS = 2
twoThirty_Two = twoThirty[twoThirty["outletCount"] >= MIN_OUTLETS]
# The value is a fraction, so format it as a percentage to match the label.
print(f"percentage kept from twoThirty dataframe: {len(twoThirty_Two) / len(twoThirty):.1%}") 

#apply day 1, "scoop" cutoff 
# Keep clusters where at most 40% of the articles appeared on the first day.
MAX_DAY_ONE_SHARE = .40
fourtyPercent = twoThirty_Two[twoThirty_Two["dayOnePercent"] <= MAX_DAY_ONE_SHARE].copy()
# A bare expression in the middle of a cell is never displayed, so print it.
print(f"percentage kept after day-one cutoff (of twoThirty): {len(fourtyPercent) / len(twoThirty):.1%}")
# Share of each cluster's articles whose `national` flag is true.
fourtyPercent["natPercent"] = fourtyPercent["national"].apply(np.mean)


In [46]:
# Back to one row per article: unpack the list-valued columns in parallel,
# then turn the index into a regular column.
listCols = [
    "source",
    "key",
    "title",
    "date",
    "national",
    "median_hh_inc",
    "rural_pct",
    "total_population",
]
fourtyLong = fourtyPercent.explode(listCols).reset_index()

In [47]:
#load in topics 
cols = ["index", "key"] + ["Topic" + str(i) for i in range(40)]
docTopics = pd.read_csv("/shared/3/projects/newsDiffusion/data/interim/topicModelling/doc_topics.txt", names=cols, sep="\t")

## NOTE: we lose some articles when merging the clustered articles with the topic data,
possibly because some of our articles were blank when running the topic model? 

In [48]:
# Attach each article's topic weights. The inner join drops articles that have
# no row in docTopics.
# Columns from an earlier run of this cell are removed first, so re-running it
# gives the same frame instead of duplicated *_x / *_y topic columns.
topicOnlyCols = [col for col in docTopics.columns if col != "key"]
fourtyLong = pd.merge(
    fourtyLong.drop(columns=topicOnlyCols, errors="ignore"),
    docTopics,
    on="key",
    how="inner",
)

In [None]:
#first question: are scooped stories different from non-scooped stories in their topic distribution? 
#TODO: get non-scooped stories 


In [49]:
# Inspect the column names of the merged article-level frame.
fourtyLong.columns

Index(['clustNum', 'source', 'key', 'title', 'date', 'national',
       'median_hh_inc', 'rural_pct', 'total_population', 'clustSize',
       'dayOnePercent', 'duration', 'outletCount', 'natPercent', 'index',
       'Topic0', 'Topic1', 'Topic2', 'Topic3', 'Topic4', 'Topic5', 'Topic6',
       'Topic7', 'Topic8', 'Topic9', 'Topic10', 'Topic11', 'Topic12',
       'Topic13', 'Topic14', 'Topic15', 'Topic16', 'Topic17', 'Topic18',
       'Topic19', 'Topic20', 'Topic21', 'Topic22', 'Topic23', 'Topic24',
       'Topic25', 'Topic26', 'Topic27', 'Topic28', 'Topic29', 'Topic30',
       'Topic31', 'Topic32', 'Topic33', 'Topic34', 'Topic35', 'Topic36',
       'Topic37', 'Topic38', 'Topic39'],
      dtype='object')

In [None]:
#do certain topics evolve more or less rapidly, or last longer or shorter? 
topicCols = ["Topic" + str(i) for i in range(40)]
topicAverages = fourtyLong[["clustNum", "duration"] + topicCols].groupby(by="clustNum").agg(list)

def colMean(inCol): 
    return [np.mean(currList) for currList in inCol]

topicAverages[topicCols] = topicAverages[topicCols].apply(colMean)
topicAverages["duration"] = topicAverages["duration"].apply(lambda x: x[0])

durationCorrs = []
for topicCol in topicCols: 
    durationCorrs.append(np.corrcoef(topicAverages["duration"], topicAverages[topicCol])[0,1])

durCorrDf = pd.DataFrame({"durationCorrs":durationCorrs, "topicNum":list(range(40))})

topicWords = pd.read_csv("/shared/3/projects/newsDiffusion/data/interim/topicModelling/topic_keys.txt", sep="\t", names=["topicNum", "rand", "words"])

durCorrDf = pd.merge(durCorrDf, topicWords, on="topicNum").sort_values("durationCorrs").drop(columns=["rand"])
durCorrDf.head()

durCorrDf.tail()

In [104]:
# Topic/duration correlations restricted to clusters with natPercent == 0,
# i.e. clusters in which no article is flagged national.
isLocalOnly = fourtyLong["natPercent"] == 0

# Mean topic weight per cluster, computed directly by the groupby.
# `duration` is a per-cluster value repeated on every article row, so "first" recovers it.
aggSpec = {"duration": "first", **{topicCol: "mean" for topicCol in topicCols}}
topicAverages = fourtyLong.loc[isLocalOnly, ["clustNum", "duration"] + topicCols].groupby(by="clustNum").agg(aggSpec)

# Correlation coefficient between cluster duration and each topic's cluster-level mean.
durationCorrs = [
    np.corrcoef(topicAverages["duration"], topicAverages[topicCol])[0, 1]
    for topicCol in topicCols
]

# Topic numbers follow the order of topicCols (Topic0, Topic1, ...).
durCorrDf = pd.DataFrame({"durationCorrs": durationCorrs, "topicNum": list(range(len(topicCols)))})

# Top words per topic; the middle column ("rand") is not used and is dropped below.
topicWords = pd.read_csv("/shared/3/projects/newsDiffusion/data/interim/topicModelling/topic_keys.txt", sep="\t", names=["topicNum", "rand", "words"])

durCorrDf = pd.merge(durCorrDf, topicWords, on="topicNum").sort_values("durationCorrs").drop(columns=["rand"])

In [105]:
# First five rows of the ascending sort: the lowest topic/duration correlations for the natPercent == 0 clusters.
durCorrDf.head()

Unnamed: 0,durationCorrs,topicNum,words
13,-0.133336,13,it’s don’t time people we’re that’s make i’m w...
27,-0.100705,27,people time don family n't back life day told ...
8,-0.079396,8,trump president biden white house donald admin...
39,-0.058858,39,vaccine covid virus coronavirus people health ...
20,-0.052375,20,police man officers year-old county arrested o...


In [106]:
# Last five rows of the ascending sort: the highest topic/duration correlations for the natPercent == 0 clusters.
durCorrDf.tail()

Unnamed: 0,durationCorrs,topicNum,words
4,0.043151,4,die der und das den von mit ist nicht sich auf...
9,0.064474,9,school students schools university education c...
24,0.066755,24,show film music series star love year story mo...
38,0.110419,38,city county board council public meeting town ...
37,0.175202,37,p.m event community year a.m day online events...


In [108]:
# Topic/duration correlations restricted to clusters with natPercent == 1,
# i.e. clusters in which every article is flagged national.
isNationalOnly = fourtyLong["natPercent"] == 1

# Mean topic weight per cluster, computed directly by the groupby.
# `duration` is a per-cluster value repeated on every article row, so "first" recovers it.
aggSpec = {"duration": "first", **{topicCol: "mean" for topicCol in topicCols}}
topicAverages = fourtyLong.loc[isNationalOnly, ["clustNum", "duration"] + topicCols].groupby(by="clustNum").agg(aggSpec)

# Correlation coefficient between cluster duration and each topic's cluster-level mean.
durationCorrs = [
    np.corrcoef(topicAverages["duration"], topicAverages[topicCol])[0, 1]
    for topicCol in topicCols
]

# Topic numbers follow the order of topicCols (Topic0, Topic1, ...).
durCorrDf = pd.DataFrame({"durationCorrs": durationCorrs, "topicNum": list(range(len(topicCols)))})

# Top words per topic; the middle column ("rand") is not used and is dropped below.
topicWords = pd.read_csv("/shared/3/projects/newsDiffusion/data/interim/topicModelling/topic_keys.txt", sep="\t", names=["topicNum", "rand", "words"])

durCorrDf = pd.merge(durCorrDf, topicWords, on="topicNum").sort_values("durationCorrs").drop(columns=["rand"])

In [109]:
# First five rows of the ascending sort: the lowest topic/duration correlations for the natPercent == 1 clusters.
durCorrDf.head()

Unnamed: 0,durationCorrs,topicNum,words
8,-0.125997,8,trump president biden white house donald admin...
12,-0.057864,12,police officers protests protesters city floyd...
31,-0.038028,31,people state masks coronavirus social health d...
20,-0.033725,20,police man officers year-old county arrested o...
2,-0.033216,2,game season team games players play coach foot...


In [110]:
# Last five rows of the ascending sort: the highest topic/duration correlations for the natPercent == 1 clusters.
durCorrDf.tail()

Unnamed: 0,durationCorrs,topicNum,words
28,0.048235,28,food restaurant store farmers meat make custom...
32,0.048624,32,health study cancer treatment patients body me...
18,0.050685,18,home time make buy room good hair day made bac...
24,0.057813,24,show film music series star love year story mo...
9,0.068426,9,school students schools university education c...


In [111]:
# Number of clusters (one row per clustNum) in the natPercent == 1 subset.
len(topicAverages)

10773