# Project

Welcome to the group project! The project is based on the [ACM RecSys 2021 Challenge](https://recsys-twitter.com/).

- Detailed information about the task, submission and grading can be found on a [dedicated site on TUWEL](https://tuwel.tuwien.ac.at/mod/page/view.php?id=1217340).
- Information about the dataset structure can be found [on this site on TUWEL](https://tuwel.tuwien.ac.at/mod/page/view.php?id=1218810).

In [5]:
# Your team name, e.g. 'team_1'.
team_name = "team_5"

# One (name, id) pair of strings per team member,
# e.g. [("Jane Doe", "012345678"), ("John Doe", "012345678")].
team_members = [
    ("Simone Andreetto", "01635069"),
    ("Adrian Bracher", "01637180"),
    ("Dominik Mailer", "01634043"),
    ("Andreas Merckel", "00746397"),
    ("Dominik Pülke", "12019262"),
    ("Sebastian Scholz", "01526884"),
    ("Felix Winterleitner", "01612776"),
    ("Ahmadou Wagne", "12002293"),
]


In [6]:
# Echo the team configuration, name first and member list on the next line.
print(team_name, team_members, sep="\n")

team_5
[('Simone Andreetto', '01635069'), ('Adrian Bracher', '01637180'), ('Dominik Mailer', '01634043'), ('Andreas Merckel', '00746397'), ('Dominik Pülke', '12019262'), ('Sebastian Scholz', '01526884'), ('Felix Winterleitner', '01612776'), ('Ahmadou Wagne', '12002293')]


In [7]:
# Directory that is scanned for dataset part files.
path_to_data = "../shared/data/project/validation/"
# Substring a part file's name must contain to be processed.
# Options: all_sorted, one_day, one_hour, one_week
dataset_type = "one_hour"

# NN-Model

In [8]:
import os
import re
import csv
import datetime
import model_nn
import pandas as pd

from dataprep import import_data
import dataprep
from sklearn.preprocessing import StandardScaler

In [9]:
# Names for the first 20 fields of a dataset row, in field order.
# NOTE: the "enaging_*" spellings are kept as-is; they are the column names
# the rest of the pipeline uses.
all_features = [
    "text_tokens",
    "hashtags",
    "tweet_id",
    "present_media",
    "present_links",
    "present_domains",
    "tweet_type",
    "language",
    "tweet_timestamp",
    "engaged_with_user_id",
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    "engaging_user_id",
    "enaging_user_follower_count",
    "enaging_user_following_count",
    "enaging_user_is_verified",
    "enaging_user_account_creation",
    "engagee_follows_engager",
]

# Columns that are handed to the models.
# Currently switched off: text_tokens, tweet_id, language, tweet_timestamp,
# engaged_with_user_id, engaged_with_user_follower_count, engaging_user_id,
# enaging_user_follower_count, enaging_user_following_count,
# enaging_user_is_verified, enaging_user_account_creation.
used_features = [
    "hashtags",
    "present_media",
    "present_links",
    "present_domains",
    "tweet_type",
    "engaged_with_user_following_count",
    "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    "engagee_follows_engager",
]

# Column name -> position of that column within a row.
all_features_to_idx = {name: position for position, name in enumerate(all_features)}

def parse_input_line(line):
    """Pick the model input fields out of one already-split dataset row.

    Args:
        line: Indexable row whose positions follow ``all_features``.

    Returns:
        A 9-tuple ``(hashtags, present_media, present_links, present_domains,
        tweet_type, engaged_with_user_following_count,
        engaged_with_user_is_verified, engaged_with_user_account_creation,
        engagee_follows_engager)``.
    """
    # Order matters: it fixes the layout of the returned tuple.
    wanted_columns = (
        'hashtags',
        'present_media',
        'present_links',
        'present_domains',
        'tweet_type',
        'engaged_with_user_following_count',
        'engaged_with_user_is_verified',
        'engaged_with_user_account_creation',
        'engagee_follows_engager',
    )
    return tuple(line[all_features_to_idx[column]] for column in wanted_columns)


def _read_part_file(part_path):
    """Load one 0x01-delimited part file into an object-dtype DataFrame.

    Only the first ``len(all_features)`` fields of every line are kept, and
    they are labelled with the names in ``all_features``.

    Args:
        part_path: Path of the part file to read.

    Returns:
        A DataFrame with one row per line of the file (possibly empty).

    Raises:
        ValueError: If a line has fewer fields than ``all_features`` has names.
    """
    n_columns = len(all_features)
    records = []
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation asks for.
    with open(part_path, 'r', newline='') as handle:
        reader = csv.reader(handle, delimiter='\x01')
        for line_no, fields in enumerate(reader, start=1):
            if len(fields) < n_columns:
                raise ValueError(
                    f'{part_path}: record {line_no} has {len(fields)} fields, '
                    f'expected at least {n_columns}'
                )
            records.append(fields[:n_columns])
    # Build the frame in one go; growing it with df.loc[i] = row copies the
    # data on every insert and gets quadratically slow on large files.
    return pd.DataFrame(records, columns=all_features, dtype=object)


def evaluate_test_set():
    """Predict the four engagement types for every row and write results.csv.

    Every file in ``path_to_data`` whose name contains ``dataset_type`` is
    processed, ordered by the last five characters of its path. For each file
    the ``used_features`` columns go through ``dataprep.transform_data`` and a
    ``StandardScaler``; each resulting row is then passed to the four
    ``model_nn`` prediction functions. One line per row is written to
    ``results.csv`` in the current working directory:

        tweet_id,engaging_user_id,reply,retweet,quote,fav

    Part files without any rows are skipped.

    Raises:
        ValueError: If a line of a part file has too few fields.
    """
    expanded_path = os.path.expanduser(path_to_data)
    part_files = [
        os.path.join(expanded_path, name)
        for name in os.listdir(expanded_path)
        if dataset_type in name
    ]
    part_files = sorted(part_files, key=lambda path: path[-5:])

    with open('results.csv', 'w') as output:
        for part_path in part_files:
            df_complete = _read_part_file(part_path)
            if df_complete.empty:
                # Nothing to predict, and StandardScaler rejects empty input.
                continue

            df = dataprep.transform_data(df_complete.loc[:, used_features])

            # NOTE(review): the scaler is fit on each part file on its own.
            # If the models were trained on differently scaled data the
            # predictions will not be comparable -- confirm against training.
            scaler = StandardScaler()
            df = pd.DataFrame(
                scaler.fit_transform(df.values), columns=df.columns, index=df.index
            )

            # NOTE(review): index labels are used as row positions below,
            # which assumes transform_data keeps the default 0..n-1 index
            # and drops no rows -- TODO confirm.
            for index in df.index:
                sample = df.iloc[[index]]  # single-row frame for the models
                tweet_id = df_complete['tweet_id'].iloc[index]
                user_id = df_complete['engaging_user_id'].iloc[index]

                reply_pred = model_nn.reply_pred_model(sample)
                retweet_pred = model_nn.retweet_pred_model(sample)
                quote_pred = model_nn.quote_pred_model(sample)
                fav_pred = model_nn.fav_pred_model(sample)

                output.write(
                    f'{tweet_id},{user_id},{reply_pred[0]},{retweet_pred[0]},'
                    f'{quote_pred[0]},{fav_pred[0]}\n'
                )

In [10]:
# Run the evaluation; this (over)writes results.csv in the working directory.
evaluate_test_set()

In [None]:
# hidden


In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit

In [None]:
# feel free to edit