# Project

Welcome to the group project! The project is based on the [ACM RecSys 2021 Challenge](https://recsys-twitter.com/).

- Detailed information about the task, submission and grading can be found on a [dedicated page on TUWEL](https://tuwel.tuwien.ac.at/mod/page/view.php?id=1217340).
- Information about the dataset structure can be found [on this page on TUWEL](https://tuwel.tuwien.ac.at/mod/page/view.php?id=1218810).

In [1]:
# Team identifier, e.g. 'team_1'.
team_name = "team_5"

# One (name, id string) pair per team member, following the template
# example [("Jane Doe","012345678"), ("John Doe","012345678")].
team_members = [
    ("Simone Andreetto", "01635069"),
    ("Adrian Bracher", ""),
    ("Dominik Mailer", "01634043"),
    ("Andreas Merckel", ""),
    ("Dominik Pülke", ""),
    ("Sebastian Scholz", "01526884"),
    ("Felix Winterleitner", ""),
    ("Ahmadou Wagne", "12002293"),
]


In [2]:
# Echo the team configuration, one value per line.
print(team_name, team_members, sep="\n")

team_5
[('Simone Andreetto', '01635069'), ('Adrian Bracher', ''), ('Dominik Mailer', ''), ('Andreas Merckel', ''), ('Dominik Pülke', ''), ('Sebastian Scholz', ''), ('Felix Winterleitner', ''), ('Ahmadou Wagne', '12002293')]


In [20]:
# Directory that is scanned for input files.
path_to_data = "../shared/data/project/training/"
# Substring used to select files in that directory.
# Known values: all_sorted, one_day, one_hour, one_week.
dataset_type = "one_hour"

In [17]:
import os
import re
import csv
import datetime

from model import reply_pred_model, retweet_pred_model, quote_pred_model, fav_pred_model 
from dataprep import import_data

In [4]:


# Column names of one input record; the list position of each name is the
# index used to pick that field out of a parsed row.
all_features = [
    "text_tokens",
    "hashtags",
    "tweet_id",
    "present_media",
    "present_links",
    "present_domains",
    "tweet_type",
    "language",
    "tweet_timestamp",
    "engaged_with_user_id",
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    "engaging_user_id",
    "enaging_user_follower_count",
    "enaging_user_following_count",
    "enaging_user_is_verified",
    "enaging_user_account_creation",
    "engagee_follows_engager",
]

# Lookup table: column name -> position in `all_features`.
all_features_to_idx = {name: position for position, name in enumerate(all_features)}

def parse_input_line(line):
    """Pick the fields used for prediction out of one already-split record.

    ``line`` is indexed by position via ``all_features_to_idx``; no splitting
    is done here.

    Returns:
        A 4-tuple ``(tweet_id, engaging_user_id, text_tokens, tweet_timestamp)``.
    """
    wanted = ("tweet_id", "engaging_user_id", "text_tokens", "tweet_timestamp")
    return tuple(line[all_features_to_idx[column]] for column in wanted)


def evaluate_test_set():
    """Run the four engagement models over the selected files and write ``results.csv``.

    Input files are the entries of ``path_to_data`` (with ``~`` expanded) whose
    name contains ``dataset_type``, processed in order of the last five
    characters of their path. Each file is read as ``\\x01``-delimited records.

    For every non-empty record one line is written to ``results.csv`` in the
    current working directory::

        tweet_id,user_id,reply_pred,retweet_pred,quote_pred,fav_pred

    The output file is overwritten on every call. Blank input lines are
    skipped instead of raising ``IndexError``.
    """
    expanded_path = os.path.expanduser(path_to_data)
    part_files = [os.path.join(expanded_path, f) for f in os.listdir(expanded_path) if dataset_type in f]
    part_files = sorted(part_files, key=lambda x: x[-5:])

    # Explicit encoding so the result does not depend on the platform default.
    with open('results.csv', 'w', encoding='utf-8') as output:
        for part_file in part_files:
            # newline='' leaves line-ending handling to the csv module, as its
            # documentation recommends for file objects passed to csv.reader.
            with open(part_file, 'r', encoding='utf-8', newline='') as f:
                linereader = csv.reader(f, delimiter='\x01')
                for row in linereader:
                    if not row:
                        # csv.reader yields [] for a blank line; indexing into
                        # it in parse_input_line would raise IndexError.
                        continue

                    tweet_id, user_id, features, tweet_timestamp = parse_input_line(row)
                    reply_pred = reply_pred_model(features)      # reply model
                    retweet_pred = retweet_pred_model(features)  # retweet model
                    quote_pred = quote_pred_model(features)      # quote model
                    fav_pred = fav_pred_model(features)          # fav model

                    output.write(f'{tweet_id},{user_id},{reply_pred},{retweet_pred},{quote_pred},{fav_pred}\n')


In [5]:
# List the input files: entries of `path_to_data` whose name contains
# `dataset_type`, ordered by the last five characters of their path.
expanded_path = os.path.expanduser(path_to_data)
part_files = sorted(
    (
        os.path.join(expanded_path, entry)
        for entry in os.listdir(expanded_path)
        if dataset_type in entry
    ),
    key=lambda path: path[-5:],
)
part_files

In [7]:
# Score every record of the selected files; predictions are written to results.csv.
evaluate_test_set()

['../shared/data/project/training/one_hour']

In [23]:

# Load the selected dataset through the project's import helper and display it.
data = import_data(f"{path_to_data}{dataset_type}")
data

Unnamed: 0,text_tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,...,engaging_user_id,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager,retweet,reply,like,retweet_with_comment
0,101\t100\t100\t100\t100\t100\t100\t100\t100\t1...,,395A05A1E8A0A4CEB2E623281C7A41EE,-1,,,0,0,1614207600,55F619B7474C1BA0C8EE03C2A31C534C,...,736278C2FEC488516CDA4ED6952A2154,1139,1126,False,1601425426,False,False,False,False,True
1,101\t100\t100\t100\t100\t100\t216\t216\t10243\...,,81E8247F4E74A0FCDBA911E1A3CB5412,0,758E6F75A253992C7070F6B8A8A891A6,6B7D92057ACA0F97EFB5B724D3C963E4,0,1,1614207600,9B49D384D56A65E00A12D8349EB46CE5,...,19D5367D835484236CAF9DBEF475FF7A,82,76,False,1495813718,False,False,False,False,False
2,101\t100\t100\t100\t100\t100\t216\t216\t10243\...,,81E8247F4E74A0FCDBA911E1A3CB5412,0,758E6F75A253992C7070F6B8A8A891A6,6B7D92057ACA0F97EFB5B724D3C963E4,0,1,1614207600,9B49D384D56A65E00A12D8349EB46CE5,...,40BEB04CF8D3CB02449879668656FFDB,108,351,False,1506038593,False,False,False,False,False
3,101\t100\t100\t100\t100\t100\t216\t216\t10243\...,,81E8247F4E74A0FCDBA911E1A3CB5412,0,758E6F75A253992C7070F6B8A8A891A6,6B7D92057ACA0F97EFB5B724D3C963E4,0,1,1614207600,9B49D384D56A65E00A12D8349EB46CE5,...,6415C94D3C27BA84C069DE049EBB3EDE,69,106,False,1562533868,False,False,False,False,False
4,101\t100\t100\t100\t100\t100\t216\t216\t10243\...,,81E8247F4E74A0FCDBA911E1A3CB5412,0,758E6F75A253992C7070F6B8A8A891A6,6B7D92057ACA0F97EFB5B724D3C963E4,0,1,1614207600,9B49D384D56A65E00A12D8349EB46CE5,...,7E614D5881BC18768880CC374C4BE821,24,180,False,1302057914,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
891742,101\t74212\t106877\t62281\t62281\t62281\t36535...,,FDD2D7997110248166FB05C2E8609696,-1,,,1,4,1614211199,37F269A3339E8ABF2B43B49DD9E13369,...,88A1D59A5ACB3CD87AFBCB58AFB33579,278,435,False,1487800905,True,False,False,False,False
891743,101\t786\t31898\t773\t63289\t10502\t10289\t206...,,BB1139C86AAB39557087F9B8BEA3D6A3,0,07329593C9B929B9DC18A6E064BD9A98,707DC8D88110112FE1095E1C604758D7,0,9,1614211199,B8E7C8A003C8029246951612112971A7,...,7D704E0ECA4D89F47460049DADEBBD65,366,118,False,1585454135,True,False,False,False,False
891744,101\t789\t26341\t10502\t763\t51554\t775\t61566...,,D03C8A65EA0A1197483C9D9051F49BD4,-1,,,0,9,1614211199,E003E05842A204AE468B4414DF1438E1,...,58ED1F75F517E65F96C352C42C33E308,92,254,False,1595185930,False,False,False,False,False
891745,101\t89275\t75416\t11537\t110923\t36175\t10067...,,24166BF0D0D45F2D811606E3417AE975,-1,,,0,1,1614211199,AC97A410ACCA49F55222D8AF5D86CA80,...,1B1C8C719C1BA40D5EE7DF756ACDF0AD,966,1775,False,1292977043,True,False,False,False,False


In [24]:
# Number of distinct values per id column, each printed under its heading.
print(" -- unique tweets --", data["tweet_id"].unique().size, sep="\n")
print(" -- engaged users --", data["engaged_with_user_id"].unique().size, sep="\n")
print(" -- engaging users --", data["engaging_user_id"].unique().size, sep="\n")

 -- unique tweets --
462701
 -- engaged users --
366489
 -- engaging users --
723876


In [6]:
# hidden


In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit