# Project

Welcome to the group project! The project is based on the [ACM RecSys 2021 Challenge](https://recsys-twitter.com/).

- Detailed information about the task, submission and grading can be found on a [dedicated site on TUWEL](https://tuwel.tuwien.ac.at/mod/page/view.php?id=1217340).
- Information about the dataset structure can be found [on this site on TUWEL](https://tuwel.tuwien.ac.at/mod/page/view.php?id=1218810).

In [1]:
team_name = "team_5"  # your team name e.g. 'team_1'

# One (name, id string) pair per team member, following the template
# [("Jane Doe","012345678"), ("John Doe","012345678")].
# Two members currently have an empty id string.
team_members = [
    ("Simone Andreetto", "01635069"),
    ("Adrian Bracher", ""),
    ("Dominik Mailer", "01634043"),
    ("Andreas Merckel", ""),
    ("Dominik Pülke", "12019262"),
    ("Sebastian Scholz", "01526884"),
    ("Felix Winterleitner", "01612776"),
    ("Ahmadou Wagne", "12002293"),
]


In [2]:
# Echo the team configuration, one value per line, as a quick visual check.
print(team_name, team_members, sep="\n")

team_5
[('Simone Andreetto', '01635069'), ('Adrian Bracher', ''), ('Dominik Mailer', '01634043'), ('Andreas Merckel', ''), ('Dominik Pülke', '12019262'), ('Sebastian Scholz', '01526884'), ('Felix Winterleitner', '01612776'), ('Ahmadou Wagne', '12002293')]


In [3]:
# Directory (relative path) that holds the training data files.
path_to_data = "../shared/data/project/training/"

# Name fragment selecting which data files to use.
# Options: all_sorted, one_day, one_hour, one_week
dataset_type = "one_hour"

In [4]:
import os
import re
import csv
import datetime

from dataprep import import_data

In [None]:


# Column names of one input record, in positional order.
# NOTE: the "enaging_*" spellings are kept exactly as they are because the
# names double as lookup keys in all_features_to_idx.
all_features = [
    "text_tokens", "hashtags", "tweet_id",
    "present_media", "present_links", "present_domains",
    "tweet_type", "language", "tweet_timestamp",
    "engaged_with_user_id",
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    "engaging_user_id",
    "enaging_user_follower_count",
    "enaging_user_following_count",
    "enaging_user_is_verified",
    "enaging_user_account_creation",
    "engagee_follows_engager",
]

# Reverse lookup: column name -> position of that column within a row.
all_features_to_idx = {name: position for position, name in enumerate(all_features)}

def parse_input_line(line):
    """Pick the fields needed for prediction out of one already-split row.

    Args:
        line: indexable sequence of column values, positioned according to
            ``all_features_to_idx``.

    Returns:
        The tuple ``(tweet_id, user_id, input_feats, tweet_timestamp)``,
        where ``user_id`` is the ``engaging_user_id`` column and
        ``input_feats`` is the ``text_tokens`` column.
    """
    wanted_columns = ('tweet_id', 'engaging_user_id', 'text_tokens', 'tweet_timestamp')
    return tuple(line[all_features_to_idx[column]] for column in wanted_columns)


def evaluate_test_set():
    """Run the four engagement models over the data files and write ``results.csv``.

    Every file in ``path_to_data`` whose name contains ``dataset_type`` is
    read as a 0x01-delimited table; files are processed in the order of the
    last five characters of their path. For each row one line is written:
    ``tweet_id,user_id,reply_pred,retweet_pred,quote_pred,fav_pred``.

    Relies on the module-level callables ``reply_pred_model``,
    ``retweet_pred_model``, ``quote_pred_model`` and ``fav_pred_model``,
    which must be defined before this function is called.
    """
    expanded_path = os.path.expanduser(path_to_data)
    part_files = [
        os.path.join(expanded_path, name)
        for name in os.listdir(expanded_path)
        if dataset_type in name
    ]
    part_files.sort(key=lambda path: path[-5:])

    with open('results.csv', 'w') as output:
        for part_file in part_files:
            # The csv module expects files opened with newline='' so that it
            # handles line endings itself; the encoding is pinned so the
            # result does not depend on the platform default.
            with open(part_file, 'r', newline='', encoding='utf-8') as f:
                for row in csv.reader(f, delimiter='\x01'):
                    tweet_id, user_id, features, _tweet_timestamp = parse_input_line(row)

                    reply_pred = reply_pred_model(features)
                    retweet_pred = retweet_pred_model(features)
                    quote_pred = quote_pred_model(features)
                    fav_pred = fav_pred_model(features)

                    output.write(f'{tweet_id},{user_id},{reply_pred},{retweet_pred},{quote_pred},{fav_pred}\n')


# User-To-User Model

In [5]:
# import Model class
%run -i "u2u.py"

In [6]:
#train models
#imported_data = get_imported_data()

#u2u_reply = Predictor("reply", imported_data, dump_to_file=True)
#u2u_retweet = Predictor("retweet", imported_data, dump_to_file=True)
#u2u_quote = Predictor("retweet_with_comment", imported_data, dump_to_file=True)
#u2u_fav = Predictor("like", imported_data, dump_to_file=True)

In [None]:
# load model from stored files
# NOTE(review): `load` is not defined or imported in the visible cells;
# presumably it is joblib's loader made available by running u2u.py — confirm.
# If so, loading unpickles the file contents, so only load model files from a
# trusted source.
u2u_reply = load("model_content/u2u_reply.joblib")
u2u_retweet = load("model_content/u2u_retweet.joblib")
# the "quote" model is stored under the retweet_with_comment file name
u2u_quote = load("model_content/u2u_retweet_with_comment.joblib")
# the "fav" model is stored under the like file name
u2u_fav = load("model_content/u2u_like.joblib")

In [None]:
# Evaluator class that computes the f1 score. Use in evaluate_test_set!
class Evaluator:
    """Accumulate binary confusion-matrix counts and derive the F1 score.

    The counters ``tp``, ``tn``, ``fp`` and ``fn`` start at zero and are
    updated by :meth:`add`.
    """

    def __init__(self):
        self.tp = 0
        self.tn = 0
        self.fp = 0
        self.fn = 0

    def add(self, interaction, prediction):
        """Record one (ground truth, prediction) pair.

        Args:
            interaction: raw ground-truth field; the empty string counts as
                "no interaction" (0), any other value as an interaction (1).
            prediction: predicted label, compared for equality with 0 / 1.
        """
        true = 0 if interaction == "" else 1
        if true == prediction:
            if true:
                self.tp += 1
            else:
                self.tn += 1
        elif true == 0:
            self.fp += 1
        else:
            self.fn += 1

    def get_f1(self):
        """Return ``tp / (tp + 0.5 * (fn + fp))``.

        Returns:
            The F1 score, or ``-1`` when it is undefined because ``tp``,
            ``fp`` and ``fn`` are all zero.
        """
        try:
            return self.tp / (self.tp + 0.5 * (self.fn + self.fp))
        except ZeroDivisionError:
            # Only the "nothing to score" case maps to the -1 sentinel; any
            # other error is a real bug and must not be hidden.
            return -1

In [24]:
# User-To-User Model

# Directory (relative path) that holds the validation data files.
path_to_data = "../../shared/data/project/validation/"
dataset_type = "one_hour"  # all_sorted, one_day, one_hour, one_week


# Column names of one labelled record, in positional order; the last four
# entries are the engagement columns.
# NOTE(review): the official RecSys 2021 layout lists the engagement columns
# as reply, retweet, retweet_with_comment, like — confirm that the order used
# here matches the course dataset.
# NOTE: the "enaging_*" spellings are kept exactly as they are because the
# names double as lookup keys in all_features_to_idx.
all_features = [
    "text_tokens", "hashtags", "tweet_id",
    "present_media", "present_links", "present_domains",
    "tweet_type", "language", "tweet_timestamp",
    "engaged_with_user_id",
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    "engaging_user_id",
    "enaging_user_follower_count",
    "enaging_user_following_count",
    "enaging_user_is_verified",
    "enaging_user_account_creation",
    "engagee_follows_engager",
    "retweet", "reply", "like", "retweet_with_comment",
]

# Reverse lookup: column name -> position of that column within a row.
all_features_to_idx = {name: position for position, name in enumerate(all_features)}


# adapted to also return the true interaction values
def parse_input_line(line):
    """Pick the prediction inputs and the true engagement values out of one row.

    Args:
        line: indexable sequence of column values, positioned according to
            ``all_features_to_idx``.

    Returns:
        The tuple ``(tweet_id, user_id, input_feats, tweet_timestamp,
        retweet, reply, like, retweet_with_comment)``, where ``user_id`` is
        the ``engaging_user_id`` column and ``input_feats`` is the
        ``text_tokens`` column.
    """
    wanted_columns = (
        'tweet_id', 'engaging_user_id', 'text_tokens', 'tweet_timestamp',
        'retweet', 'reply', 'like', 'retweet_with_comment',
    )
    return tuple(line[all_features_to_idx[column]] for column in wanted_columns)


#adapted to return the f1 values for each interaction category
def evaluate_test_set():
    """Score the four user-to-user models on the data files.

    Every file in ``path_to_data`` whose name contains ``dataset_type`` is
    read as a 0x01-delimited table; files are processed in the order of the
    last five characters of their path. For each row the models
    ``u2u_reply``, ``u2u_retweet``, ``u2u_quote`` and ``u2u_fav`` are asked
    for a binary prediction, the prediction is compared with the row's true
    engagement value, and one line is written to ``results.csv``:
    ``tweet_id,user_id,reply_pred,retweet_pred,quote_pred,fav_pred``.

    Returns:
        The tuple ``(reply_f1, retweet_f1, quote_f1, fav_f1)`` as produced by
        ``Evaluator.get_f1()`` for each engagement type.
    """
    expanded_path = os.path.expanduser(path_to_data)
    part_files = [
        os.path.join(expanded_path, name)
        for name in os.listdir(expanded_path)
        if dataset_type in name
    ]
    part_files.sort(key=lambda path: path[-5:])

    reply_eval = Evaluator()
    retweet_eval = Evaluator()
    quote_eval = Evaluator()
    fav_eval = Evaluator()

    with open('results.csv', 'w') as output:
        for part_file in part_files:
            # The csv module expects files opened with newline='' so that it
            # handles line endings itself; the encoding is pinned so the
            # result does not depend on the platform default.
            with open(part_file, 'r', newline='', encoding='utf-8') as f:
                for row in csv.reader(f, delimiter='\x01'):
                    # The text tokens and the timestamp are not used by the
                    # user-to-user models.
                    (tweet_id, user_id, _features, _tweet_timestamp,
                     retweet, reply, fav, quote) = parse_input_line(row)

                    reply_pred = u2u_reply.predict(user_id, tweet_id, binary=True)
                    retweet_pred = u2u_retweet.predict(user_id, tweet_id, binary=True)
                    quote_pred = u2u_quote.predict(user_id, tweet_id, binary=True)
                    fav_pred = u2u_fav.predict(user_id, tweet_id, binary=True)

                    reply_eval.add(reply, reply_pred)
                    retweet_eval.add(retweet, retweet_pred)
                    quote_eval.add(quote, quote_pred)
                    fav_eval.add(fav, fav_pred)

                    output.write(f'{tweet_id},{user_id},{reply_pred},{retweet_pred},{quote_pred},{fav_pred}\n')

    return reply_eval.get_f1(), retweet_eval.get_f1(), quote_eval.get_f1(), fav_eval.get_f1()




In [25]:
# evaluate_test_set() returns the F1 values in the order
# reply, retweet, quote (retweet with comment), like.
reply_eval, retweet_eval, quote_eval, fav_eval = evaluate_test_set()

# Each line prints the variable that was actually unpacked above.
print(f"F1-Reply: {reply_eval}")
print(f"F1-Retweet: {retweet_eval}")
print(f"F1-Quote: {quote_eval}")
print(f"F1-Like: {fav_eval}")

(0.0, 0.0, 0.0, 0.0)


# -- End User-To-User Model --

In [None]:

# Load the selected dataset through the project's dataprep.import_data helper;
# the argument is the data directory path with the dataset name appended.
# NOTE(review): presumably returns a pandas DataFrame (a later cell indexes it
# by column name and calls .unique()) — confirm in dataprep.
data = import_data(path_to_data+dataset_type)
data  # bare expression: shown as the cell output when run in a notebook

In [None]:
# Number of distinct values in the tweet_id, engaged_with_user_id and
# engaging_user_id columns, each printed under its own heading.
print(" -- unique tweets --", data["tweet_id"].unique().size, sep="\n")
print(" -- engaged users --", data["engaged_with_user_id"].unique().size, sep="\n")
print(" -- engaging users --", data["engaging_user_id"].unique().size, sep="\n")

In [None]:
# hidden


# Metrics Functions



## Recsys Challenge Functions

In [None]:
from sklearn.metrics import average_precision_score, log_loss

def calculate_ctr(gt):
    """Return the fraction of entries in ``gt`` that are equal to 1.

    Args:
        gt: sized iterable of ground-truth labels.

    Raises:
        ZeroDivisionError: if ``gt`` is empty.
    """
    hits = sum(1 for label in gt if label == 1)
    return hits / float(len(gt))

def compute_rce(pred, gt):
    """Return the relative cross-entropy of ``pred`` against a constant baseline, in percent.

    The baseline predicts ``calculate_ctr(gt)`` for every sample. The result
    is ``(1 - log_loss(gt, pred) / log_loss(gt, baseline)) * 100``.

    Args:
        pred: predictions handed to ``log_loss`` as the second argument.
        gt: ground-truth labels handed to ``log_loss`` as the first argument.
    """
    model_ce = log_loss(gt, pred)
    baseline_pred = [calculate_ctr(gt)] * len(gt)
    baseline_ce = log_loss(gt, baseline_pred)
    return (1.0 - model_ce / baseline_ce) * 100.0

  
# NOTE(review): read_predictions is not defined or imported in the visible
# cells — it has to be provided before this cell can run.
# NOTE(review): the comments below describe (tweet_id, user_id, value) records,
# while the values are passed straight to log_loss / average_precision_score —
# confirm the shape read_predictions actually returns.
ground_truth = read_predictions(path_to_data + dataset_type) # will return data in the form (tweet_id, user_id, label (1 or 0))
predictions = read_predictions("predictions.csv") # will return data in the form (tweet_id, user_id, prediction)

# compute_rce takes the predictions first, then the ground truth
rce = compute_rce(predictions, ground_truth)
# average_precision_score takes the ground truth first, then the predictions
average_precision = average_precision_score(ground_truth, predictions)

print(rce)
print(average_precision)

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit