In [3]:
import numpy as np
import pandas as pd

In [65]:
#function that prints some general stats of a dataframe
def print_stats_df(df, name=''):
    n_tweets = len(df)
    print('GENERAL STATS '+name)
    print('number of tweets:', n_tweets)
    print('number of unique media:', df['present_media'].nunique() )
    print('number of unique language:', df['language'].nunique() )
    print('number of unique EWUF users:', df["engaged_with_user_id"].nunique() )
    print('number of unique EUF users:', df["enaging_user_id"].nunique() )

    labels = ["reply_timestamp","retweet_timestamp","retweet_with_comment_timestamp","like_timestamp"]
    is_y = all([label in df.columns for label in labels])
    if is_y:
        print('\nNUMBER OF ENGAGEMENTS '+name)
        print('number of replies:',n_tweets - df['reply_timestamp'].isna().sum(), (n_tweets - df['reply_timestamp'].isna().sum())/n_tweets)
        print('number of retweets:', n_tweets - df['retweet_timestamp'].isna().sum(), (n_tweets - df['retweet_timestamp'].isna().sum() )/n_tweets )
        print('number of retweet comments:',n_tweets - df["retweet_with_comment_timestamp"].isna().sum(), (n_tweets - df["retweet_with_comment_timestamp"].isna().sum())/n_tweets)
        print('number of likes',n_tweets - df['like_timestamp'].isna().sum(), (n_tweets - df['like_timestamp'].isna().sum())/n_tweets)
    print('------------------------------------------------------------')  
    print()    


In [75]:
from sklearn.preprocessing import StandardScaler
import re

class TwitterTransform:
    """takes a df and cleans it to be used as input of a model in matrix form"""
    
    def __init__(self, lang_tol=1e-2):

        self.ss = StandardScaler()
        self.lang_tol = lang_tol
        self.list_languages = []
        self.selection_to_standarize = ["engaged_with_user_follower_count","engaged_with_user_following_count",
                                        "enaging_user_follower_count", "enaging_user_following_count"]
        self.selection_to_boolean = ["engaged_with_user_is_verified", "enaging_user_is_verified", 
                                     "engagee_follows_engager"]
        self.tweet_types =  ['Retweet', 'Quote', 'Reply', 'Toplevel']
        self.labels = ["reply_timestamp","retweet_timestamp","retweet_with_comment_timestamp","like_timestamp"]

    def fit(self, df):
        ##X
        #STANDARIZATION OF FOLLOWERS COUNTS(fit)
        self.ss.fit(df[self.selection_to_standarize])
        
        #Create lisf of languages
        language_series = df.language.value_counts()/len(df)
        self.list_languages = language_series[language_series > self.lang_tol].tolist()
        print(len(self.list_languages))

        
    def transform(self,df):
        #STANDARIZATION OF FOLLOWERS COUNTS(transform)
        x = df[self.selection_to_standarize]
        x_cols = x.columns
        x = pd.DataFrame(data = self.ss.transform(x), columns = x_cols)
               
        #OneHot encoding of Languages
        for lang in self.list_languages:
            x[lang] = df['language'].apply(lambda x: 1 if (x==lang) else 0)
        
        #OneHot encoding tweet_type
        for types in self.tweet_types:
            x[types] = df['tweet_type'].apply(lambda x: 1 if (x==types) else 0) 
#         x = pd.concat([x, pd.get_dummies(df['tweet_type'])], axis=1)
                
        #OneHot encoding present_media
        x['present_media'] = df['present_media'].apply(lambda x: re.split('\t', x)  if (type(x)==str) else [] )
        for media in ['Photo','Video','GIF']:
            x[media] =  x.present_media.apply(lambda x:  1 if (media  in x) else 0)
        x = x.drop('present_media', axis=1)
                   
        #(True/False)Boolean to (1/0)Boolean
        x = pd.concat( [ x, df[self.selection_to_boolean].astype(int)], axis=1 )
         
        ##Y
        #NUMBER/NaN or BOOLEAN TO INTEGER BOOLEAN(1/0) 
        is_y = all([label in df.columns for label in self.labels])
        if is_y:
            y = df[self.labels].notna().astype(int)
            return x.values, y.values
        else:
            return x.values
                                          
                                
                                          
                                        

In [70]:
from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(pred, gt):
    prec, recall, thresh = precision_recall_curve(gt, pred)
    prauc = auc(recall, prec)
    return prauc

def calculate_ctr(gt):
    positive = len([x for x in gt if x == 1])
    ctr = positive/float(len(gt))
    return ctr

def compute_rce(pred, gt):
    cross_entropy = log_loss(gt, pred)
    data_ctr = calculate_ctr(gt)
    strawman_cross_entropy = log_loss(gt, [data_ctr for _ in range(len(gt))])
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0


In [71]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score, log_loss
from xgboost import XGBClassifier

def model_scores(x_train, y_train, x_test, y_test, model='default'):
#     list = ['replay','retweet', 'retweet_comment', 'like']

    if model=='default':
        model = XGBClassifier()
    model.fit(x_train,y_train)

    ytrue = y_train
    ypred = model.predict(x_train)
    train = [log_loss(ytrue, ypred), roc_auc_score(ytrue, ypred)]
    train = np.array(train)
 
    ytrue = y_test
    ypred = model.predict(x_test)
    test = [log_loss(ytrue, ypred), roc_auc_score(ytrue, ypred)]
    test = np.array(test)
 
    return train, test
#     print('\n'.join([' '.join([name[:5]]+['{:.4f}'.format(score) for score in engage]) for engage,name in zip(train,engagement_list)]))


def print_scores(matrix, name='METRICS'):
    engagement_list = ['replay      ','retweet      ', 'retweet_comment     ', 'like      ']
    score_list = ['log_loss', 'roc_auc']
    print(name+' log_loss   roc_auc')
    print('\n'.join([' '.join([name[:10]]+['{:.4f}'.format(score) for score in engage]) for engage,name in zip(matrix,engagement_list)]))
    print()
    return

In [72]:
df_og = pd.read_csv('sample_train.csv', header=None)
df_og.columns =["text_ tokens", "hashtags", "tweet_id", "present_media", "present_links", "present_domains",
                "tweet_type","language", "tweet_timestamp", "engaged_with_user_id", "engaged_with_user_follower_count",
               "engaged_with_user_following_count", "engaged_with_user_is_verified", "engaged_with_user_account_creation",
               "enaging_user_id", "enaging_user_follower_count", "enaging_user_following_count", "enaging_user_is_verified",
               "enaging_user_account_creation", "engagee_follows_engager","reply_timestamp","retweet_timestamp","retweet_with_comment_timestamp","like_timestamp"]

print('number of tweets:', len(df_og))


df_og.head()

number of tweets: 1000000


Unnamed: 0,text_ tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,...,enaging_user_id,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,101\t14120\t131\t120\t120\t188\t119\t11170\t12...,,BFB529DAB6D384EB83E899A72AB3830D,Photo,,,TopLevel,B9175601E87101A984A50F8A62A1C374,1581018910,1F30A85E612A24B426013978ADB7AE46,...,000005C520010F8917EEAB6F5B6EC1C4,17,77,False,1569692352,True,,,,
1,101\t62342\t10858\t54439\t19571\t22480\t7831\t...,,519078C7834E9642508F72A6C2D0F3B7,,788ABAA22F1C4A9D186510C1E3E81F58,892B6E0BC037800B59112CFCB95E3301,TopLevel,22C448FF81263D4BAF2A176145EE9EAD,1581244866,9E1FD934C376903895DEEBCC0F6AB920,...,000006829BEADA9EEA695CF0C334B426,2,29,False,1568107028,False,,,,1581247000.0
2,101\t58955\t10898\t103305\t1901\t16181\t7168\t...,F0F2FBE57F08E7C4326682B5EDA63E3E\tE78674D32346...,52AAE9E33EFAC8C478C57B31A9E31ED1,,C06B84A10A75AE4BF4A102AB45CC2ABF,C0F5328C1D36CD4B34D1808012E18D46,TopLevel,22C448FF81263D4BAF2A176145EE9EAD,1581250124,BEB3C337B266E2C4769065AD64724329,...,000006829BEADA9EEA695CF0C334B426,3,30,False,1568107028,False,,,,
3,101\t2435\t5656\t2594\t8279\t8623\t1925\t64126...,,89C1298C55EB3D68E2784F0BFB69E6F8,,,,TopLevel,22C448FF81263D4BAF2A176145EE9EAD,1580958805,9E1FD934C376903895DEEBCC0F6AB920,...,000006829BEADA9EEA695CF0C334B426,3,30,False,1568107028,False,,,,
4,101\t56898\t137\t16349\t11530\t11274\t12396\t1...,C38F925CF76111352C3FE0EFBA8F32B6\t7E881DBFB094...,D1F22992B720504FF058991B83A4C4D8,,2E2690EDB8C577AB6B54D0CD6AE94AC9,3896E26D12C903F0A00B6B1BE9A9BEA3,Retweet,22C448FF81263D4BAF2A176145EE9EAD,1581335128,E4CF13FFB0C7B8AC747476D57543C413,...,000006829BEADA9EEA695CF0C334B426,3,29,False,1568107028,False,,,,


In [73]:
df_val = pd.read_pickle('val_1.pkl')
df_val.head()

Unnamed: 0,text_ tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,enaging_user_id,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,enaging_user_account_creation,engagee_follows_engager
0,101\t11056\t11440\t10124\t169\t11617\t88667\t4...,,BA69C2DDCEB522438A5CB2DD6582031C,,5CEA74D6F92CC55EEDAC520B9AF8E9CC,B63E1DA6A10D81124EF3F884D21358D5,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,2020-02-19 19:00:33,DB2F016B4BFC63DF241227BC80D08526,7038783,338,True,2009-04-11 23:10:42,6DE748062A9C750A47BBF6307F4A8766,11,31,False,2018-08-09 17:49:17,False
1,101\t56898\t137\t33787\t11630\t10738\t131\t113...,82E4D311215329FFE150E47BCF6201CF,3FA16D9944D5378AC7E0A9E6A1CC6C5A,,,,Retweet,D3164C7FBCF2565DDF915B1B3AEFB1DC,2020-02-17 20:05:03,3EB3E6B3C5161E35268890931DCDC69E,1098657,5015,True,2007-10-16 18:29:30,6DE74BB5059920C6EA83DC1DF7F0B900,249,868,False,2013-03-17 22:23:25,False
2,101\t56898\t137\t92728\t10240\t28927\t11227\t1...,,CA6390C6BA03E8C59EB1FAFA655F4F9D,,,,Retweet,06D61DCBBE938971E1EA0C38BD9B5446,2020-02-18 14:21:50,D29A5FD6ADDED13A13B5A6F14AB7B30D,34529,117,False,2007-03-30 14:18:54,6DE74BB5059920C6EA83DC1DF7F0B900,249,868,False,2013-03-17 22:23:25,False
3,101\t26486\t10104\t11782\t15117\t41300\t10110\...,53A2ED224C2CF94043F6177DA61D155A\t6353AFEBBE39...,198028C5748EE0AAF6E3189A47156EAB,Photo,61B2BF457647D8500CE09B2D9E54DD0A,9F5B1567B95FFE817C501C8033102A30,TopLevel,06D61DCBBE938971E1EA0C38BD9B5446,2020-02-19 15:01:40,5F999A72DB67461EDBEB89C7A0E6FA7B,37516,201,True,2011-08-19 09:33:15,6DE74F3B11024E83A618ECF5EF1661BD,269,437,False,2019-05-17 11:22:09,False
4,101\t14120\t131\t120\t120\t188\t119\t11170\t12...,,83499AC12EC821F456F363CD8E29E8EE,Photo\tPhoto,,,TopLevel,B9175601E87101A984A50F8A62A1C374,2020-02-13 16:28:25,DBF5C63773C21A4391E087C48AF57332,3694,3474,False,2019-10-17 21:17:19,6DE75096C4817C0F96CCFAF2BA4F6D8B,19,29,False,2020-01-09 18:59:26,True


In [76]:
tt = TwitterTransform()
tt.fit(df_og)
tt.list_languages
x_train,y_train = tt.transform(df_og)
x_val = tt.transform(df_val)

x_train.shape, y_train.shape, x_val.shape

11


((1000000, 25), (1000000, 4), (1000000, 25))

In [86]:
# df_train = select_features(df_og)

# df_val = select_features(pd.read_pickle('val_1.pkl'), with_labels=False)

# training = pd.read_pickle('training_1e6_0.pkl')
# training.columns = df_og.columns
# training = select_features(training)

#validation =  pd.read_pickle('validation_1e5_0.pkl')
# validation.columns = df_og.columns
# validation = select_features(validation)

df_train = df_og
df_val = pd.read_pickle('val_1.pkl')
df_train = pd.read_pickle('train_1.pkl')

df_train.shape, df_val.shape, validation.shape

((1500000, 24), (1000000, 20), (100000, 23))

In [90]:
df_train[:int(1e6)].shape

(1000000, 24)

In [84]:
print_stats_df(df_train, name='Train')
print_stats_df(df_val, name='Validation')

GENERAL STATS Train
number of tweets: 1500000
number of unique media: 11
number of unique language: 65
number of unique EWUF users: 765959
number of unique EUF users: 1286378

NUMBER OF ENGAGEMENTS Train
number of replies: 39876 0.026584
number of retweets: 166863 0.111242
number of retweet comments: 11273 0.007515333333333333
number of likes 656844 0.437896
------------------------------------------------------------

GENERAL STATS Validation
number of tweets: 1000000
number of unique media: 8
number of unique language: 63
number of unique EWUF users: 550382
number of unique EUF users: 772096
------------------------------------------------------------



 Imbalance treatment for retweets+comments and replies

In [64]:
engage = ['reply','retweet', 'retweet_with_comment',
       'like']
added = '_timestamp'
tot_bicoins = 0
for i in range(3):
    for j in range(i+1,4):
        string = '({})+({}) #coincidences:{}'
        coins = df_train[ df_train[engage[i]+added].notna() & df_train[engage[j]+added].notna()].shape[0]
        tot_bicoins = tot_bicoins + coins
        print( string.format(engage[i], engage[j], coins))
        
print('total bicoincidences:', tot_bicoins)


(reply)+(retweet) #coincidences:2391
(reply)+(retweet_with_comment) #coincidences:350
(reply)+(like) #coincidences:10521
(retweet)+(retweet_with_comment) #coincidences:7750
(retweet)+(like) #coincidences:57395
(retweet_with_comment)+(like) #coincidences:3158
total bicoincidences: 81565


In [93]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import RUSBoostClassifier,BalancedRandomForestClassifier,EasyEnsembleClassifier


def submission(df_train, df_val):
    df_submission = df_val[['tweet_id','enaging_user_id']]
#     df_submission = df_val[['tweet_id','engaged_with_user_id']]
    df_submission = df_submission.set_index('tweet_id')
    
    
    tt = TwitterTransform()
    tt.fit(df_train)
    x_train,y_train = tt.transform(df_train)
    x_val = tt.transform(df_val)
    
    
    eng_list = ['reply','rt','rt+comment','like']
    model_list = [RUSBoostClassifier(),'',RUSBoostClassifier(),'']
    prep_list = ['']*4
#     model_list = [RUSBoostClassifier()]*4
    
    for i,eng,prep,model in zip(range(4),eng_list,prep_list,model_list):
        print(i,eng,' MODEL:',model,' Preprocessing:',prep)
        df_submission['prediction'] =  give_result(x_train, y_train[:,i], x_val, preprocessing=prep, model=model)
        
        df_submission.to_csv('submissions/'+eng+'.csv', header=False)
    

    return 'SUBMISSION READY TO UPLOAD'

from xgboost import XGBClassifier

def give_result(x_train, y_train, x_val, preprocessing='default', model='default'):
    if type(model)== str or model==0:
        model = XGBClassifier()
        
    if type(preprocessing)!= str:
        x_train,y_train = preprocessing.fit_resample(x_train,y_train)
    
    model.fit(x_train,y_train)
    y_pred = model.predict(x_val)

    return y_pred


submission(df_train[:int(1e6)], df_val[:int(1e5)])

11
0 reply  MODEL: RUSBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None, replacement=False,
                   sampling_strategy='auto')  Preprocessing: 
1 rt  MODEL:   Preprocessing: 
2 rt+comment  MODEL: RUSBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None, replacement=False,
                   sampling_strategy='auto')  Preprocessing: 
3 like  MODEL:   Preprocessing: 


'SUBMISSION READY TO UPLOAD'

![image.png](attachment:image.png)

1.     model_list = [RUSBoostClassifier(),'',RUSBoostClassifier(),'']

2. prep_list = ['']*4

![image.png](attachment:image.png)
    
    

1. model_list = [RUSBoostClassifier(),'',RUSBoostClassifier(),'']

2. prep_list = ['']*4

3. train 1e6, val 1e5

![image.png](attachment:image.png)


1. model_list = [RUSBoostClassifier(),'',RUSBoostClassifier(),'']

2. prep_list = ['']*4

3. train 1e6, val 1e5

4. using 'engaged_with_user_id' IT DOESNT FUCKING GO AS EXPECTED INITIALLY. NICE



1. Rus (seed=24)


![image.png](attachment:image.png)


2. RUSBOOST

![image.png](attachment:image.png)


3. EasyEnsemble

![image.png](attachment:image.png)

4. RandomForestClassifier

![image.png](attachment:image.png)

In [95]:
test = pd.read_csv('submissions/like.csv', header=None)

In [96]:
test.head()

Unnamed: 0,0,1,2
0,BA69C2DDCEB522438A5CB2DD6582031C,DB2F016B4BFC63DF241227BC80D08526,0
1,3FA16D9944D5378AC7E0A9E6A1CC6C5A,3EB3E6B3C5161E35268890931DCDC69E,0
2,CA6390C6BA03E8C59EB1FAFA655F4F9D,D29A5FD6ADDED13A13B5A6F14AB7B30D,0
3,198028C5748EE0AAF6E3189A47156EAB,5F999A72DB67461EDBEB89C7A0E6FA7B,0
4,83499AC12EC821F456F363CD8E29E8EE,DBF5C63773C21A4391E087C48AF57332,0
