In [1]:
import pandas as pd
import numpy as np
from scipy import spatial
from scipy import stats
import os
import os.path
import csv
import news_rec as nr

  from .autonotebook import tqdm as notebook_tqdm


# Load the News and User Data

In [2]:
# Load the two preprocessed frames used throughout the notebook: `hist`
# (per-user reading history) and `can` (rows that additionally carry a
# `label` column — presumably the candidate/impression news; confirm in news_rec).
# NOTE(review): the two shapes printed under this cell appear to be emitted by
# all_preprocessing_final() itself — confirm.
hist, can = nr.all_preprocessing_final()

(80380, 8) (121226, 9)


In [3]:
# Preview the first rows of the history frame.
hist.head()

Unnamed: 0,user_id,date,news_id,category,sub_category,title,category_str,sub_category_str
13,U58552,2019-11-11,N55189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","'Wheel Of Fortune' Guest Delivers Hilarious, O...",tv,tvnews
24,U14239,2019-11-14,N55189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","'Wheel Of Fortune' Guest Delivers Hilarious, O...",tv,tvnews
32,U18359,2019-11-13,N55189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","'Wheel Of Fortune' Guest Delivers Hilarious, O...",tv,tvnews
57,U31481,2019-11-14,N55189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","'Wheel Of Fortune' Guest Delivers Hilarious, O...",tv,tvnews
98,U3216,2019-11-13,N55189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","'Wheel Of Fortune' Guest Delivers Hilarious, O...",tv,tvnews


In [4]:
# Total number of history rows per user, stored in a `counts` column.
user_hist = (
    hist
    .groupby(["user_id"])
    .size()
    .reset_index(name="counts")
)
user_hist.head()

Unnamed: 0,user_id,counts
0,U10003,8
1,U10010,5
2,U10024,4
3,U10034,16
4,U10038,7


In [7]:
# Per-user minimum: 75% of the user's total history count. Later cells compare
# a user's per-category count against this `min` column.
user_hist["min"] = user_hist["counts"].mul(0.75)
user_hist.head()

Unnamed: 0,user_id,counts,min
0,U10003,8,6.0
1,U10010,5,3.75
2,U10024,4,3.0
3,U10034,16,12.0
4,U10038,7,5.25


## Get Majority of News Readers

In [8]:
# Keep only the history rows whose category is "news".
user_news = hist.loc[hist["category_str"].eq("news")]

In [9]:
# Collapse to one row per user holding that user's number of news rows.
user_news = (
    user_news
    .groupby(["user_id"])
    .size()
    .reset_index(name="counts")
)

In [10]:
# Inner join on BOTH user_id and counts: a user survives only when their news
# count equals their total history count.
all_news_users = user_news.merge(user_hist, on=["user_id", "counts"])
all_news_users.shape # 20 or less had 151

(196, 3)

In [11]:
# Preview users whose news count matched their total count
# (all_news_users is an inner merge on user_id and counts).
all_news_users.head()

Unnamed: 0,user_id,counts,min
0,U10353,1,0.75
1,U10413,2,1.5
2,U10865,2,1.5
3,U10910,9,6.75
4,U11178,2,1.5


In [12]:
# Pair each user's news count (counts_x) with their total count (counts_y) and
# keep the users whose news count reaches their 75% minimum.
all_news_users_min = (
    user_news
    .merge(user_hist, on=["user_id"])
    .loc[lambda merged: merged["counts_x"] >= merged["min"]]
)
all_news_users_min.head()

Unnamed: 0,user_id,counts_x,counts_y,min
15,U10140,13,16,12.0
29,U10353,1,1,0.75
34,U10390,3,4,3.0
35,U10413,2,2,1.5
54,U10731,12,16,12.0


In [13]:
# Number of users that met the news threshold (rows, columns).
# NOTE(review): the trailing note refers to an earlier run; what "20 or less"
# filtered on is not visible in this notebook.
all_news_users_min.shape # 20 or less had 392 

(519, 4)

## Get Majority of Sports Readers

In [14]:
# sports: per-user count of "sports" history rows, then the users whose sports
# count equals their total count (inner join on both keys).
user_sports = (
    hist.loc[hist["category_str"].eq("sports")]
    .groupby(["user_id"])
    .size()
    .reset_index(name="counts")
)
all_sp_users = user_sports.merge(user_hist, on=["user_id", "counts"])
all_sp_users.shape

(77, 3)

In [15]:
# Pair each user's sports count (counts_x) with their total count (counts_y)
# and keep the users whose sports count reaches their 75% minimum.
all_sp_users_min = (
    user_sports
    .merge(user_hist, on=["user_id"])
    .loc[lambda merged: merged["counts_x"] >= merged["min"]]
)
all_sp_users_min.head()

Unnamed: 0,user_id,counts_x,counts_y,min
2,U10049,6,8,6.0
18,U10363,4,5,3.75
27,U10575,8,10,7.5
39,U10770,12,16,12.0
59,U11247,2,2,1.5


In [16]:
# Number of users that met the sports threshold (rows, columns).
all_sp_users_min.shape

(193, 4)

## News Had 519 Users

In [17]:
# Re-display the users who met the news threshold; these are the users saved
# for the experiments further down.
all_news_users_min.head()

Unnamed: 0,user_id,counts_x,counts_y,min
15,U10140,13,16,12.0
29,U10353,1,1,0.75
34,U10390,3,4,3.0
35,U10413,2,2,1.5
54,U10731,12,16,12.0


In [18]:
# Spot-check the history rows of one user that met the news threshold.
hist[hist["user_id"]=="U10390"]

Unnamed: 0,user_id,date,news_id,category,sub_category,title,category_str,sub_category_str
69751,U10390,2019-11-13,N43142,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Former NBA first-round pick Jim Farmer arreste...,sports,basketball_nba
349220,U10390,2019-11-13,N11855,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",New Jersey woman sues waiter for spilling wine...,news,newsus
568865,U10390,2019-11-13,N37663,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Woman accused of embezzling from Camp Fire vic...,news,newscrime
568866,U10390,2019-11-13,N37663,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Woman accused of embezzling from Camp Fire vic...,news,newscrime


In [19]:
# Rows of `can` for the same user, for comparison with their history.
can[can["user_id"]=="U10390"]

Unnamed: 0,user_id,date,news_id,label,category,sub_category,title,category_str,sub_category_str
335075,U10390,2019-11-13,N33885,0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",One of America's biggest solar panel makers qu...,finance,finance-companies
807261,U10390,2019-11-13,N13579,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Chris Paul feels he got 'stabbed in the back' ...,sports,basketball_nba
812688,U10390,2019-11-13,N42977,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",'It's not over': Sarah Palin says she is fight...,news,newsus
1165054,U10390,2019-11-13,N47061,0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",105 Black Friday Deals You Can Start Shopping ...,lifestyle,shop-holidays
1897672,U10390,2019-11-13,N17059,1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",No. 1 milk company declares bankruptcy amid dr...,finance,finance-companies
1911741,U10390,2019-11-13,N48063,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Cherry disappointed with MacLean's apology: 'H...,sports,icehockey_nhl
2252219,U10390,2019-11-13,N36659,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bolivia's Jeanine Anez declares herself acting...,news,newsworld
2708954,U10390,2019-11-13,N35170,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Atlanta college student Alexis Crawford was ch...,news,newscrime
3268098,U10390,2019-11-13,N29065,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Today in History: November 13,news,newsworld
3381715,U10390,2019-11-13,N64138,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Greta Thunberg to set sail back to Europe soon...,news,newsworld


## Save the Users Needed for Experiments 4 and 5

In [20]:
# save news users
# Unique ids of the users that met the news threshold.
exp4_users = all_news_users_min["user_id"].unique()

# Write the user list once; on later runs reuse the file that is already on disk.
# NOTE(review): when the file exists, `users` comes from disk and is never
# compared with the freshly computed `exp4_users`, so it can be stale if the
# selection logic above changes.
user_file = './data/users_exp4.csv'
if not os.path.isfile(user_file):
    users = pd.DataFrame(data={"user_id": exp4_users})
    users.to_csv(user_file, sep=',', index=False)
else:
    print("File already exists")
    users = pd.read_csv(user_file)
users.head()

File already exists


Unnamed: 0,user_id
0,U10140
1,U10353
2,U10390
3,U10413
4,U10731


In [21]:
# Restrict both frames to the saved experiment users.
sample_users = users["user_id"].tolist()
hist_sample = hist[hist["user_id"].isin(sample_users)] # replaced users
can_sample = can[can["user_id"].isin(sample_users)]

In [22]:
# Shapes of the two sampled frames; the trailing comment records the values
# seen on the saved run.
hist_sample.shape, can_sample.shape # ((3653, 8), (7239, 9))

((3653, 8), (7239, 9))

In [23]:
# remove the str cols
# Drop both human-readable category columns in a single call.
# errors="ignore" keeps the cell re-runnable: a second run finds the columns
# already gone and would otherwise raise KeyError.
hist_sample = hist_sample.drop(
    columns=['category_str', 'sub_category_str'],
    errors='ignore',
)
hist_sample.head()

Unnamed: 0,user_id,date,news_id,category,sub_category,title
1602,U9556,2019-11-12,N55189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","'Wheel Of Fortune' Guest Delivers Hilarious, O..."
4811,U9556,2019-11-11,N55189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","'Wheel Of Fortune' Guest Delivers Hilarious, O..."
4865,U54692,2019-11-14,N55189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","'Wheel Of Fortune' Guest Delivers Hilarious, O..."
5060,U79175,2019-11-09,N55189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","'Wheel Of Fortune' Guest Delivers Hilarious, O..."
5802,U25001,2019-11-12,N55189,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","'Wheel Of Fortune' Guest Delivers Hilarious, O..."


## Remove Duplicates

In [24]:
# Number the rows of every (user_id, news_id) pair from newest to oldest date,
# then keep only the newest row of each pair (RN == 1).
hist_sample['RN'] = (
    hist_sample
    .sort_values(by='date', ascending=False)
    .groupby(['user_id', 'news_id'])
    .cumcount()
    .add(1)
)
hist_sample = hist_sample[hist_sample['RN'].eq(1)]

In [25]:
# Number the rows of every (user_id, news_id) pair from newest to oldest date,
# then keep only the newest row of each pair (RN == 1).
# `can_sample` is a filtered slice of `can`, so writing a column into it in
# place raises SettingWithCopyWarning. `.assign` builds a new frame instead;
# the RN values are aligned on the index exactly as the in-place write was.
can_sample = can_sample.assign(
    RN=lambda df: (
        df.sort_values(['date'], ascending=[False])
        .groupby(['user_id', 'news_id'])
        .cumcount()
        + 1
    )
)
can_sample = can_sample.loc[can_sample['RN'] == 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  can_sample['RN'] = can_sample.sort_values(['date'], ascending=[False]).groupby(['user_id',"news_id"]).cumcount() + 1


In [26]:
can_sample.head()

Unnamed: 0,user_id,date,news_id,label,category,sub_category,title,category_str,sub_category_str,RN
242,U16011,2019-11-11,N55689,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Charles Rogers, former Michigan State football...",sports,football_nfl,1
277,U25823,2019-11-11,N55689,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Charles Rogers, former Michigan State football...",sports,football_nfl,1
460,U91571,2019-11-11,N55689,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Charles Rogers, former Michigan State football...",sports,football_nfl,1
482,U46954,2019-11-11,N55689,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Charles Rogers, former Michigan State football...",sports,football_nfl,1
554,U21432,2019-11-11,N55689,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Charles Rogers, former Michigan State football...",sports,football_nfl,1


## Start the Experiment

In [27]:
# experiment 4
# Run nr.rec_any on the de-duplicated sample in "NN" mode with "cosine" and
# no model object (None), then write the result to CSV.
# NOTE(review): the meaning of the positional arguments 3, 1 and True is not
# visible in this notebook — confirm against nr.rec_any's signature.
nn_final = nr.rec_any(can_sample, hist_sample , 3, "cosine","NN",None, 1,True) # 1 min to run
nr.dict_to_csv(nn_final, "./results/final/nn_cosine3_final_news.csv")

User Status (array([0], dtype=int64),) / (519,)
User Status (array([1], dtype=int64),) / (519,)
User Status (array([2], dtype=int64),) / (519,)
User Status (array([3], dtype=int64),) / (519,)
User Status (array([4], dtype=int64),) / (519,)
User Status (array([5], dtype=int64),) / (519,)
User Status (array([6], dtype=int64),) / (519,)
User Status (array([7], dtype=int64),) / (519,)
User Status (array([8], dtype=int64),) / (519,)
User Status (array([9], dtype=int64),) / (519,)
User Status (array([10], dtype=int64),) / (519,)
User Status (array([11], dtype=int64),) / (519,)
User Status (array([12], dtype=int64),) / (519,)
User Status (array([13], dtype=int64),) / (519,)
User Status (array([14], dtype=int64),) / (519,)
User Status (array([15], dtype=int64),) / (519,)
User Status (array([16], dtype=int64),) / (519,)
User Status (array([17], dtype=int64),) / (519,)
User Status (array([18], dtype=int64),) / (519,)
User Status (array([19], dtype=int64),) / (519,)
User Status (array([20], dtype

In [38]:
# experiment 5
# Same call as experiment 4 but in "STS" mode, passing the `sts` object
# returned by nr.get_models(); the result is written to its own CSV.
# `nli` is returned by get_models() but not used in this cell.
# NOTE(review): the meaning of the positional arguments 3, 1 and True is not
# visible in this notebook — confirm against nr.rec_any's signature.
sts, nli = nr.get_models()
sts_final = nr.rec_any(can_sample, hist_sample , 3, "cosine","STS",sts, 1,True) # 2 mins per 100
nr.dict_to_csv(sts_final, "./results/final/sts_cosine3_final_news.csv")

User Status (array([0], dtype=int64),) / (519,)
User Status (array([1], dtype=int64),) / (519,)
User Status (array([2], dtype=int64),) / (519,)
User Status (array([3], dtype=int64),) / (519,)
User Status (array([4], dtype=int64),) / (519,)
User Status (array([5], dtype=int64),) / (519,)
User Status (array([6], dtype=int64),) / (519,)
User Status (array([7], dtype=int64),) / (519,)
User Status (array([8], dtype=int64),) / (519,)
User Status (array([9], dtype=int64),) / (519,)
User Status (array([10], dtype=int64),) / (519,)
User Status (array([11], dtype=int64),) / (519,)
User Status (array([12], dtype=int64),) / (519,)
User Status (array([13], dtype=int64),) / (519,)
User Status (array([14], dtype=int64),) / (519,)
User Status (array([15], dtype=int64),) / (519,)
User Status (array([16], dtype=int64),) / (519,)
User Status (array([17], dtype=int64),) / (519,)
User Status (array([18], dtype=int64),) / (519,)
User Status (array([19], dtype=int64),) / (519,)
User Status (array([20], dtype