In [16]:
import pandas as pd
import numpy as np
import news_rec as nr

  from .autonotebook import tqdm as notebook_tqdm


## Get the News and Behavior Data into Pandas DF

In [2]:
def load_tsv(file, cols):
    '''
    Read a tab-separated file that has no header row into a DataFrame.

    file: path or file-like object, handed straight to pandas.
    cols: column names to assign, in file order.

    Returns the parsed DataFrame.
    '''
    return pd.read_csv(file, sep="\t", header=None, names=cols)

In [3]:
# Column names for the two TSV files. load_tsv reads them with header=None,
# so these lists supply every column label, in file order.
behav_cols = ["impression_id", "user_id","time","history","impressions"]
# NOTE: "abstract_entitites" is misspelled, but later cells drop the column by
# this exact name, so any rename has to be made everywhere at once.
news_cols = ["news_id","category","sub_category","title","abstract","url","title_entities","abstract_entitites"]
# Load the behaviors and news tables into DataFrames.
b_df = load_tsv("./data/behaviors.tsv", behav_cols)
news_df = load_tsv("./data/news.tsv", news_cols)

In [19]:
# verify functions match between nb and news_rec
b_df_p = nr.load_tsv("./data/behaviors.tsv", behav_cols)
news_df_p = nr.load_tsv("./data/news.tsv", news_cols)
assert b_df_p.equals(b_df)
assert news_df_p.equals(news_df)

In [20]:
# check the behaviors dataframe
b_df.head()

Unnamed: 0,impression_id,user_id,time,history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [21]:
# check the news dataframe
news_df.head()

Unnamed: 0,news_id,category,sub_category,title,abstract,url,title_entities,abstract_entitites
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [22]:
# get the size of news
news_df.shape

(51282, 8)

In [23]:
# get the size of behaviors
b_df.shape

(156965, 5)

## Distribution of Categories

In [24]:
news_cats = news_df.groupby(["category"])["category"].count()
news_cats

category
autos             1639
entertainment      587
finance           3107
foodanddrink      2551
health            1885
kids                17
lifestyle         2479
middleeast           2
movies             606
music              769
news             15774
northamerica         1
sports           14510
travel            2350
tv                 889
video             2068
weather           2048
Name: category, dtype: int64

In [25]:
news_cats.shape # 17 unique categories

(17,)

In [26]:
# turn categories into a list
cats = list(news_cats.index)
cats

['autos',
 'entertainment',
 'finance',
 'foodanddrink',
 'health',
 'kids',
 'lifestyle',
 'middleeast',
 'movies',
 'music',
 'news',
 'northamerica',
 'sports',
 'travel',
 'tv',
 'video',
 'weather']

In [27]:
# group the categories and subcategories together
news_df.groupby(["category","sub_category"])["category"].count()

category  sub_category         
autos     autosbuying                32
          autoscartech                2
          autosclassics             119
          autoscompact                1
          autosenthusiasts          231
                                   ... 
video     viral                     217
          watch                       1
          wonder                      5
weather   weatherfullscreenmaps       1
          weathertopstories        2047
Name: category, Length: 283, dtype: int64

In [28]:
# get the list of unique subcategories
subcats = news_df.groupby(["sub_category"])["sub_category"].count()
subcats = list(subcats.index)
subcats

['ads-latingrammys',
 'ads-lung-health',
 'advice',
 'animals',
 'autosbuying',
 'autoscartech',
 'autosclassics',
 'autoscompact',
 'autosenthusiasts',
 'autoshybrids',
 'autoslosangeles',
 'autosluxury',
 'autosmidsize',
 'autosmotorcycles',
 'autosnews',
 'autosownership',
 'autospassenger',
 'autosresearch',
 'autosresearchguides',
 'autosreview',
 'autossema',
 'autossports',
 'autossuvs',
 'autostokyo',
 'autostrucks',
 'autosvans',
 'autosvideonew',
 'autosvideos',
 'awards',
 'awardstyle',
 'baseball',
 'baseball_mlb',
 'baseball_mlb_videos',
 'basketball_nba',
 'basketball_nba_videos',
 'basketball_ncaa',
 'basketball_ncaa_videos',
 'basketball_wnba',
 'beverages',
 'boxing',
 'boxing-mma',
 'cardio',
 'career-news',
 'causes',
 'causes-animals',
 'causes-disaster-relief',
 'causes-environment',
 'causes-food-insecurity',
 'causes-green-living',
 'causes-military-appreciation',
 'causes-poverty',
 'celebhub',
 'celebrity',
 'celebritynews',
 'cma-awards',
 'cocktails',
 'comed

## One Hot Encoding

In [30]:
def one_hot(df, col, vals):
    '''
    Replace every value in df[col] with its one-hot encoding, in place.

    df: DataFrame to update; it is modified and nothing is returned.
    col: name of the column to encode.
    vals: ordered category values. Position k of each encoding is 1 when the
        row's value equals vals[k] and 0 otherwise, so a value that matches
        nothing in vals (a float NaN, for example) becomes an all-zero list.

    The encoded column is built in a single pass and assigned as a whole
    instead of being written cell by cell while iterating with iterrows().
    That avoids modifying the frame during iteration, is much faster on
    large frames, and stays correct when the index holds duplicate labels.
    '''
    encodings = [
        [1 if v == current_val else 0 for v in vals]
        for current_val in df[col]
    ]

    # dtype=object keeps each list as a single cell; reusing df.index makes
    # the assignment line up row for row even with repeated labels.
    df[col] = pd.Series(encodings, index=df.index, dtype=object)


In [31]:
df_copy = news_df[:20].copy(deep=True)
one_hot(df_copy, "category",cats)


# verify
assert df_copy["category"][0]== [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
assert df_copy["category"][1]== [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
assert df_copy["category"][2]== [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]

In [32]:
# update category and subcategory
# one_hot rewrites the column in place, so running this cell a second time would
# compare the already-encoded lists against the category names and turn every
# encoding into all zeros. Only encode columns that still hold raw values.
for column, labels in (("category", cats), ("sub_category", subcats)):
    already_encoded = news_df[column].map(lambda value: isinstance(value, list)).all()
    if not already_encoded:
        one_hot(news_df, column, labels)

In [33]:
news_df.head()

Unnamed: 0,news_id,category,sub_category,title,abstract,url,title_entities,abstract_entitites
0,N55528,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [34]:
# verify one-hot encoding is the same as news_rec
nr.one_hot(news_df_p, "category",cats)
nr.one_hot(news_df_p, "sub_category",subcats)

assert news_df_p.equals(news_df)

## Time To Date

In [35]:
# The timestamps look like "11/11/2019 9:05:58 AM" (month/day/year, 12-hour clock).
# Passing the format explicitly keeps the month-first reading unambiguous and
# spares pandas from inferring the layout.
b_df["date"] = pd.to_datetime(b_df["time"], format="%m/%d/%Y %I:%M:%S %p").dt.date
b_df["date"]

0         2019-11-11
1         2019-11-12
2         2019-11-14
3         2019-11-11
4         2019-11-12
             ...    
156960    2019-11-14
156961    2019-11-13
156962    2019-11-14
156963    2019-11-13
156964    2019-11-14
Name: date, Length: 156965, dtype: object

## Make History and Candidate Data Frames

In [43]:
# History will be the users' historical articles combined with news-related info
history_df_cols = [ "user_id","date","history"]
candidate_df_cols = [ "user_id","date","impressions"]

history_df = b_df[history_df_cols].copy(deep=True)
candidate_df = b_df[candidate_df_cols].copy(deep=True)

# Later will include news data

In [44]:
candidate_df.head()

Unnamed: 0,user_id,date,impressions
0,U13740,2019-11-11,N55689-1 N35729-0
1,U91836,2019-11-12,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,U73700,2019-11-14,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,U34670,2019-11-11,N35729-0 N33632-0 N49685-1 N27581-0
4,U8125,2019-11-12,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


## Split users' history and impressions

In [45]:
def create_rows(df, col):
    '''
    Split a whitespace-separated string column and give each token its own row.

    df: source DataFrame; it is left untouched.
    col: name of the string column to split.

    Returns a new DataFrame in which every token of df[col] sits on its own
    row. The other columns and the original index label are repeated for
    each token, so index labels can appear more than once.
    '''
    # Work on a copy: overwriting df[col] directly would leave the caller's
    # frame holding lists, and a second call would have no strings to split.
    expanded = df.copy()

    # first convert each string into a list of tokens
    expanded[col] = expanded[col].str.split()

    # then explode the lists into one row per token
    return expanded.explode(col)


In [46]:
candidate_df = create_rows(candidate_df, "impressions")
candidate_df.head()

Unnamed: 0,user_id,date,impressions
0,U13740,2019-11-11,N55689-1
0,U13740,2019-11-11,N35729-0
1,U91836,2019-11-12,N20678-0
1,U91836,2019-11-12,N39317-0
1,U91836,2019-11-12,N58114-0


In [51]:
history_df = create_rows(history_df, "history")
history_df = history_df.rename(columns={"history":"news_id"}) # rename history column for joining to news df
history_df.head()

Unnamed: 0,user_id,date,news_id
0,U13740,2019-11-11,N55189
0,U13740,2019-11-11,N42782
0,U13740,2019-11-11,N34694
0,U13740,2019-11-11,N45794
0,U13740,2019-11-11,N18445


## Make Label Column

In [47]:
candidate_df[["news_id","label"]] =candidate_df["impressions"].str.split(pat="-", expand=True)
candidate_df.head()

Unnamed: 0,user_id,date,impressions,news_id,label
0,U13740,2019-11-11,N55689-1,N55689,1
0,U13740,2019-11-11,N35729-0,N35729,0
1,U91836,2019-11-12,N20678-0,N20678,0
1,U91836,2019-11-12,N39317-0,N39317,0
1,U91836,2019-11-12,N58114-0,N58114,0


## Join News Data To History and Candidate Dfs

In [48]:
# Join only the news columns that are kept afterwards. Merging the whole news
# table and dropping abstract/url/entity columns later gives the same result,
# but first copies those long text fields onto every impression row.
candidate_df = pd.merge(
    candidate_df.drop(columns=["impressions"]),
    news_df[["news_id", "category", "sub_category", "title"]],
    on="news_id",
    how="inner",  # pandas' default, spelled out: rows without a matching news_id are dropped
)
candidate_df.head()

Unnamed: 0,user_id,date,news_id,label,category,sub_category,title
0,U13740,2019-11-11,N55689,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Charles Rogers, former Michigan State football..."
1,U8355,2019-11-11,N55689,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Charles Rogers, former Michigan State football..."
2,U53231,2019-11-11,N55689,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Charles Rogers, former Michigan State football..."
3,U17841,2019-11-11,N55689,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Charles Rogers, former Michigan State football..."
4,U91678,2019-11-11,N55689,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Charles Rogers, former Michigan State football..."


In [52]:
# Join only the news columns that are kept afterwards. Merging the whole news
# table and dropping abstract/url/entity columns later gives the same result,
# but first copies those long text fields onto every history row.
history_df = pd.merge(
    history_df,
    news_df[["news_id", "category", "sub_category", "title"]],
    on="news_id",
    how="inner",  # pandas' default, spelled out: rows without a matching news_id are dropped
)
history_df.head()

Unnamed: 0,user_id,date,news_id,category,sub_category,title
0,U13740,2019-11-11,N55189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","'Wheel Of Fortune' Guest Delivers Hilarious, O..."
1,U10045,2019-11-13,N55189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","'Wheel Of Fortune' Guest Delivers Hilarious, O..."
2,U85394,2019-11-10,N55189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","'Wheel Of Fortune' Guest Delivers Hilarious, O..."
3,U78244,2019-11-12,N55189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","'Wheel Of Fortune' Guest Delivers Hilarious, O..."
4,U27024,2019-11-14,N55189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","'Wheel Of Fortune' Guest Delivers Hilarious, O..."


## Split Into Training vs Test Data

In [53]:
# Number of b_df rows per date, plus each date's share of all rows
dates = (
    b_df.groupby(["date"])["date"]
    .count()
    .to_frame()
    .rename(columns={"date": "count"})
)
dates["rel_fre"] = dates["count"] / len(b_df)
dates

Unnamed: 0_level_0,count,rel_fre
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-11-09,13570,0.086452
2019-11-10,15048,0.095869
2019-11-11,32799,0.208957
2019-11-12,33654,0.214404
2019-11-13,31624,0.201472
2019-11-14,30270,0.192846


In [54]:
# The last day holds almost 20% of the data, so train on every day except the last one and keep the last day for testing.
train_candidates = candidate_df.loc[candidate_df["date"].astype(str) != "2019-11-14"]
train_histroy = history_df.loc[history_df["date"].astype(str) != "2019-11-14"]

In [55]:
train_dates = train_candidates.groupby(["date"])["date"].count()
assert train_dates.shape[0] == 5

In [56]:
test_candidates = candidate_df.loc[candidate_df["date"].astype(str) == "2019-11-14"]
test_history = history_df.loc[history_df["date"].astype(str) == "2019-11-14"]

test_dates = test_candidates.groupby(["date"])["date"].count()
assert test_dates.shape[0] == 1

## Store in a CSV

In [35]:
test_candidates.to_csv("./data/test_candidates.csv")

In [36]:
test_history.to_csv("./data/test_history.csv")
# might need to chunk it https://stackoverflow.com/questions/63768642/pandas-df-to-parquet-write-to-multiple-smaller-files

In [37]:
train_candidates.to_csv("./data/train_candidates.csv")

In [38]:
train_histroy.to_csv("./data/train_history.csv")

# Make Dataframes for the Web Application

In [74]:
user_file = './data/users.csv'
users = pd.read_csv(user_file)
# get experiment users
sample_users = users["user_id"].to_list()
hist_sample = history_df.loc[history_df["user_id"].isin(sample_users[:10])] 
can_sample = candidate_df.loc[candidate_df["user_id"].isin(sample_users[:10])]# replaced users
news_experiment = hist_sample["news_id"].unique().tolist() + can_sample["news_id"].unique().tolist()

news_df_original = load_tsv("./data/news.tsv", news_cols)
web_app_news_df = news_df_original.loc[news_df_original["news_id"].isin(news_experiment)]
web_app_news_df.head()

Unnamed: 0,news_id,category,sub_category,title,abstract,url,title_entities,abstract_entitites
47,N41387,tv,tv-gallery,Can you answer these real Jeopardy questions a...,"Culling data straight from the ""Jeopardy!"" arc...",https://assets.msn.com/labs/mind/AABs6Gq.html,"[{""Label"": ""Jeopardy!"", ""Type"": ""W"", ""Wikidata...","[{""Label"": ""Jeopardy!"", ""Type"": ""W"", ""Wikidata..."
268,N30344,lifestyle,lifestylebuzz,Snakehead fish that survives on land was disco...,An invasive fish species that can breathe air ...,https://assets.msn.com/labs/mind/AAIzlnB.html,"[{""Label"": ""Georgia (U.S. state)"", ""Type"": ""G""...","[{""Label"": ""Georgia (U.S. state)"", ""Type"": ""G""..."
375,N50299,tv,tv-celebrity,Kelly Ripa responds to backlash over son in 'e...,Kelly RIpa is defending a joke she made about ...,https://assets.msn.com/labs/mind/AAJfUQq.html,"[{""Label"": ""Kelly Ripa"", ""Type"": ""P"", ""Wikidat...","[{""Label"": ""Kelly Ripa"", ""Type"": ""P"", ""Wikidat..."
764,N1644,travel,traveltips,8 Secret Spots You Never Knew Existed in Disne...,Make your next trip even more magical.,https://assets.msn.com/labs/mind/AACrqlJ.html,"[{""Label"": ""Disney Parks, Experiences and Prod...",[]
919,N54822,health,nutrition,"If You Don't Eat a Banana Every Day, This Migh...",An apple a day keeps the doctor away? Not so m...,https://assets.msn.com/labs/mind/AAHyq1v.html,[],[]


In [75]:
# web app news shape
web_app_news_df.shape

(459, 8)

In [76]:
# store web app news
web_app_news_df.to_csv("./data/web_app_news.csv")

In [77]:
# store web app behavior information
b_df_original = load_tsv("./data/behaviors.tsv", behav_cols)
web_app_user_df = b_df_original.loc[b_df_original["user_id"].isin(sample_users[:10])]

In [78]:
web_app_user_df.head()

Unnamed: 0,impression_id,user_id,time,history,impressions
13809,13810,U60170,11/14/2019 10:03:11 AM,N871 N64208 N52536 N4526 N53872 N62058 N4607 N...,N25165-0 N63060-0 N45734-1 N29212-0 N38779-0 N...
36132,36133,U25928,11/12/2019 10:37:35 AM,N45636 N50299 N60384 N19594,N55281-0 N63970-1 N50688-0 N54125-0 N49487-0 N...
42656,42657,U25497,11/14/2019 7:52:25 PM,N30344 N21087 N6233 N40141 N32560 N258 N10814,N41934-0 N52867-1 N56211-0 N27737-0 N61233-0 N...
49087,49088,U9318,11/14/2019 6:58:08 AM,N54822 N848 N52500 N52551 N32004,N50872-0 N23446-1 N40109-0 N51570-0 N38779-0 N...
53927,53928,U66830,11/12/2019 6:04:42 AM,N47954 N10732 N21215 N10897 N22161 N28311 N642...,N31504-0 N36789-0 N59713-0 N60105-0 N63154-1 N...


In [79]:
# web app users
web_app_user_df.shape

(20, 5)

In [80]:
# store web app user
web_app_user_df.to_csv("./data/web_app_users.csv")