Copyright (c) Snap Inc. 2020. This sample code is made available by Snap Inc. for informational purposes only. It is provided as-is, without warranty of any kind, express or implied, including any warranties of merchantability, fitness for a particular purpose, or non-infringement. In no event will Snap Inc. be liable for any damages arising from the sample code or your use thereof.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import hashlib
import spacy
import os
import re
import json
from collections import OrderedDict
from operator import itemgetter
from spacy.lang.en.stop_words import STOP_WORDS
import string
import gensim
from sklearn.metrics.pairwise import cosine_similarity
from xgboost import XGBClassifier
import sys
sys.path.append('../')

from tf_idf_vectorizer import *
from utils.snap_preprocessed_df_handle import *
from utils.EstimatorSelectionHelper import EstimatorSelectionHelper
from utils.classifier_setup import *

# BERT Classification
from sentence_transformers import SentenceTransformer
from scipy.stats import pearsonr, spearmanr
import pickle

In [None]:
# Locations of the train / test pair dataframes; each path is handed to
# prepare_dataframe_tf_idf. Presumably pickled pandas dataframes (".pkl") —
# TODO confirm against get_dataframe.
TRAIN_PATH = '../../data/dataframes/df_unique_with_similarity.pkl'
TEST_PATH = '../../data/dataframes/df_test_unique_with_similarity.pkl'

In [None]:
def prepare_dataframe_tf_idf(PATH):
    """Build the pair dataframe for ``PATH`` and add TF-IDF similarity columns.

    The dataframe returned by ``get_dataframe(PATH)`` gains three columns:

    * ``tfidf_v1`` / ``tfidf_v2`` -- the TF-IDF entry looked up for the
      article ids in ``id1`` / ``id2``.
    * ``tfidf_similarity`` -- ``cosine_similarity(tfidf_v1, tfidf_v2)[0][0]``
      for each row.

    ``get_dataframe``, ``get_unique_combined_with_id``, ``get_tf_idf``,
    ``preprocessor`` and ``stop_list`` are expected to come from the
    project's star-imports at the top of the notebook.

    Args:
        PATH: Location passed straight to ``get_dataframe``.

    Returns:
        The dataframe with the three columns above added.

    Raises:
        KeyError: If an id in ``id1`` / ``id2`` is not among the keys
            returned by ``get_tf_idf``.
    """
    df_with_keywords = get_dataframe(PATH)
    articles = get_unique_combined_with_id(df_with_keywords, 'Input.article', 'article')
    od_output, od_keys = get_tf_idf(articles, 'article', preprocessor=preprocessor,
                                    stop_words=stop_list, ngram_range=(1, 1))

    # Resolve id -> row position once. Rebuilding list(od_keys) and scanning it
    # with .index() for every dataframe row made this step quadratic.
    # setdefault keeps the first position of a repeated key, like list.index.
    row_of_id = {}
    for position, key in enumerate(od_keys):
        row_of_id.setdefault(key, position)

    df_with_keywords['tfidf_v1'] = df_with_keywords['id1'].apply(lambda x: od_output[row_of_id[x]])
    df_with_keywords['tfidf_v2'] = df_with_keywords['id2'].apply(lambda x: od_output[row_of_id[x]])
    df_with_keywords['tfidf_similarity'] = df_with_keywords[['tfidf_v1', 'tfidf_v2']]\
        .apply(lambda row: cosine_similarity(row['tfidf_v1'], row['tfidf_v2'])[0][0], axis=1)

    return df_with_keywords

In [None]:
# Training pairs with the tfidf_v1 / tfidf_v2 / tfidf_similarity columns added.
train_df = prepare_dataframe_tf_idf(TRAIN_PATH)

In [None]:
# Test pairs with the tfidf_v1 / tfidf_v2 / tfidf_similarity columns added.
test_df = prepare_dataframe_tf_idf(TEST_PATH)

In [None]:
# Columns removed from both frames before the setup below; tfidf_similarity
# is not in this list and stays in place.
_COLUMNS_TO_DROP = ['k1', 'k2', 'textrank_similarity', 'tfidf_v1', 'tfidf_v2']
for _pairs_frame in (train_df, test_df):
    _pairs_frame.drop(columns=_COLUMNS_TO_DROP, inplace=True)

# Setup

In [None]:
def _stack_pair_articles(pairs_df):
    """Stack both sides of every pair into one (id, article) table without exact duplicate rows."""
    first_side = pairs_df[['id1', 'Input.article1']].rename(
        columns={'id1': 'id', 'Input.article1': 'article'})
    second_side = pairs_df[['id2', 'Input.article2']].rename(
        columns={'id2': 'id', 'Input.article2': 'article'})
    stacked = pd.concat([first_side, second_side])
    return stacked.drop_duplicates().reset_index(drop=True)


# Train: after removing exact duplicate rows, keep only the first row per id.
articles_train = _stack_pair_articles(train_df)
non_dup_articles_train = articles_train['id'].drop_duplicates().index
articles_train = articles_train.loc[non_dup_articles_train].reset_index(drop=True)

# Test: same two-step de-duplication.
articles_test = _stack_pair_articles(test_df)
non_dup_articles_test = articles_test['id'].drop_duplicates().index
articles_test = articles_test.loc[non_dup_articles_test].reset_index(drop=True)

In [None]:
# Load the precomputed sentence embeddings for the training articles.
# NOTE: pickle.load can execute arbitrary code while unpickling, so this file
# must come from a trusted source.
with open('../../data/dataframes/roberta_sentence_embeddings_train.pkl', 'rb') as f:
    sentence_embeddings_train = pickle.load(f)

In [None]:
# Load the precomputed sentence embeddings for the test articles.
# NOTE: pickle.load can execute arbitrary code while unpickling, so this file
# must come from a trusted source.
with open('../../data/dataframes/roberta_sentence_embeddings_test.pkl', 'rb') as f:
    sentence_embeddings_test = pickle.load(f)

In [None]:
# Attach one embedding to each article row. The assignment is positional, so
# this assumes each pickle stores its embeddings in the same order as the rows
# of articles_train / articles_test — TODO confirm against the script that
# produced the embeddings.
articles_train['roberta_embedding'] = sentence_embeddings_train
articles_test['roberta_embedding'] = sentence_embeddings_test

In [None]:
# Index the training embeddings by article id once. The previous per-row
# boolean mask over articles_train scanned the whole article table for every
# pair. setdefault keeps the first embedding seen for an id, matching the
# former `.values[0]` selection.
_train_embedding_by_id = {}
for _article_id, _embedding in zip(articles_train['id'], articles_train['roberta_embedding']):
    _train_embedding_by_id.setdefault(_article_id, _embedding)

# An id missing from articles_train raises KeyError here (previously IndexError).
train_df['roberta_embedding1'] = train_df['id1'].apply(lambda x: _train_embedding_by_id[x])
train_df['roberta_embedding2'] = train_df['id2'].apply(lambda x: _train_embedding_by_id[x])

# cosine_similarity is called on 2-D inputs, hence the reshape of each
# embedding to a single row; [0][0] extracts the scalar.
train_df['roberta_similarity'] = train_df[['roberta_embedding1', 'roberta_embedding2']]\
    .apply(lambda row: cosine_similarity(row['roberta_embedding1'].reshape(1, -1),
                                         row['roberta_embedding2'].reshape(1, -1))[0][0],
           axis=1)

In [None]:
# Index the test embeddings by article id once. The previous per-row boolean
# mask over articles_test scanned the whole article table for every pair.
# setdefault keeps the first embedding seen for an id, matching the former
# `.values[0]` selection.
_test_embedding_by_id = {}
for _article_id, _embedding in zip(articles_test['id'], articles_test['roberta_embedding']):
    _test_embedding_by_id.setdefault(_article_id, _embedding)

# An id missing from articles_test raises KeyError here (previously IndexError).
test_df['roberta_embedding1'] = test_df['id1'].apply(lambda x: _test_embedding_by_id[x])
test_df['roberta_embedding2'] = test_df['id2'].apply(lambda x: _test_embedding_by_id[x])

# cosine_similarity is called on 2-D inputs, hence the reshape of each
# embedding to a single row; [0][0] extracts the scalar.
test_df['roberta_similarity'] = test_df[['roberta_embedding1', 'roberta_embedding2']]\
    .apply(lambda row: cosine_similarity(row['roberta_embedding1'].reshape(1, -1),
                                         row['roberta_embedding2'].reshape(1, -1))[0][0],
           axis=1)

# Creating Pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class MergedTransform(BaseEstimator, TransformerMixin):
    """Blend the TF-IDF and RoBERTa similarity columns into one weighted score.

    ``merged_similarity = weight_factor * tfidf_similarity
                          + (1 - weight_factor) * roberta_similarity``

    Args:
        weight_factor: Weight given to ``tfidf_similarity``; the remainder
            ``1 - weight_factor`` goes to ``roberta_similarity``.
    """

    def __init__(self, weight_factor=0.5):
        # scikit-learn's get_params()/clone()/repr read each constructor
        # argument back from an attribute of the same name, so the value must
        # be stored as ``weight_factor`` rather than under a private name.
        self.weight_factor = weight_factor

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with a ``merged_similarity`` column.

        ``X`` must provide ``tfidf_similarity`` and ``roberta_similarity``
        columns. The input frame is left untouched: ``assign`` builds a new
        frame instead of writing the column into the caller's object.
        """
        tf_idf_factor = self.weight_factor
        bert_factor = 1 - self.weight_factor
        merged = tf_idf_factor * X['tfidf_similarity'] + bert_factor * X['roberta_similarity']
        return X.assign(merged_similarity=merged)

In [None]:
class EstimatorSelectionHelperWrapper(BaseEstimator):
    """Run an ``EstimatorSelectionHelper`` on the ``merged_similarity`` feature.

    Args:
        models: Passed through to ``EstimatorSelectionHelper``.
        params: Passed through to ``EstimatorSelectionHelper``.
        y_label: Name of the dataframe column used as the target.
    """

    def __init__(self, models, params, y_label='majority_same_event'):
        # Keep every constructor argument under its own name so that
        # scikit-learn's get_params()/clone()/repr work on this estimator.
        self.models = models
        self.params = params
        self.y_label = y_label
        self._helper = EstimatorSelectionHelper(models, params)

    def fit(self, X, y=None):
        """Fit the helper on ``X['merged_similarity']`` against ``X[self.y_label]``.

        ``y`` is ignored; the target is read from ``X``. ``make_scorer`` and
        ``custom_scorer`` are looked up in the notebook's global namespace
        when this method runs.
        """
        # The single feature is reshaped into one column of a 2-D array.
        self._helper.fit(X['merged_similarity'].values.reshape(-1, 1),
                         X[self.y_label],
                         cv=5,
                         scoring=make_scorer(custom_scorer, greater_is_better=True),
                         n_jobs=16, refit=True)
        return self

    def predict(self, X, y=None):
        """Call the helper's ``summary`` on ``X``; returns ``None``.

        NOTE(review): unlike ``fit``, the feature is passed as a 1-D Series
        rather than a reshaped 2-D array — confirm that
        ``EstimatorSelectionHelper.summary`` expects that.
        """
        self._helper.summary(X['merged_similarity'], X[self.y_label])

    def save_models(self, path, name):
        """Delegate to the helper's ``save_models``."""
        self._helper.save_models(path, name)

    def save_helper(self, path, name):
        """Pickle this wrapper to the file ``path + name``."""
        # pickle writes bytes, so the file must be opened in binary mode;
        # text mode ('w') raises TypeError on Python 3.
        with open(path + name, 'wb') as f:
            pickle.dump(self, f)

In [None]:
# Estimators to tune, keyed by the name used in `params`.
models = dict(XGBoost=XGBClassifier())

# Hyper-parameter grid for each estimator name.
params = {
    'XGBoost': {
        "colsample_bytree": [0.3, 0.5, 0.8, 1],
        "gamma": [0, 10, 50, 100],
        # Original note: "default 3" — may depend on the xgboost version.
        "max_depth": [2, 4, 6],
        # Original note: "default 100".
        "n_estimators": [50, 100],
        "subsample": [0.3, 0.5, 0.8, 1],
    },
}

def custom_scorer(y_true, y_pred):
    """Score predictions with ``f1_score`` using ``average='macro'``."""
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

# Event Classification

In [None]:
# Fitted wrappers for the event task, keyed by weight factor.
classifiers_event = {}

In [None]:
# Fit one wrapper per TF-IDF weight (0.1, 0.3, 0.5, 0.7, 0.9).
# np.arange with a float step accumulates rounding error (e.g.
# 0.30000000000000004), which makes noisy printouts and dictionary keys that
# cannot be looked up as 0.3; rounding to one decimal gives clean keys.
for weight_factor in np.round(np.arange(0.1, 1.0, 0.2), 1).tolist():
    # Fresh, unfitted estimator for every weight.
    models = {
           "XGBoost" : XGBClassifier()}
    print('------------------------------------------------')
    print('Running for Weight Factor - ', weight_factor)
    transform = MergedTransform(weight_factor=weight_factor)
    wrapper_event = EstimatorSelectionHelperWrapper(models, params, y_label='majority_same_event')
    wrapper_event.fit(transform.transform(train_df))
    classifiers_event[weight_factor] = wrapper_event

In [None]:
for key in classifiers_event:
    print('====================================================')
    print('Running for Weight Factor - ', key)
    # Build the transform for this classifier's own weight. Reusing the
    # `transform` left over from the training loop would merge the test
    # features with the last weight tried for every classifier.
    transform = MergedTransform(weight_factor=key)
    classifiers_event[key].predict(transform.transform(test_df))
#     classifiers_event[key].save_models('../../data/models/models_mixed/','weighted_merge_'+str(key)+"_snap_event")

# Topic Classification

In [None]:
def _add_majority_topic_columns(pairs_df):
    """Add majority_topic_1, majority_topic_2 and majority_same_topic to ``pairs_df`` in place.

    The topic of each article is the name of the column holding the row
    maximum among columns 13-19 (first article) and 20-26 (second article),
    keeping only the text after the last "." in that column name.
    """
    first_block = pairs_df[pairs_df.columns[13:20]]
    pairs_df['majority_topic_1'] = first_block.idxmax(axis=1).str.split(".").str.get(-1)
    second_block = pairs_df[pairs_df.columns[20:27]]
    pairs_df['majority_topic_2'] = second_block.idxmax(axis=1).str.split(".").str.get(-1)
    pairs_df['majority_same_topic'] = pairs_df['majority_topic_1'] == pairs_df['majority_topic_2']


_add_majority_topic_columns(train_df)
_add_majority_topic_columns(test_df)

In [None]:
# NOTE(review): this rebinds classifiers_event, so the event classifiers
# fitted earlier are no longer reachable under this name; the topic
# classifiers are stored here instead. Consider a separate name.
classifiers_event = {}

In [None]:
# Fit one wrapper per TF-IDF weight (0.1, 0.3, 0.5, 0.7, 0.9) on the topic label.
# np.arange with a float step accumulates rounding error (e.g.
# 0.30000000000000004), which makes noisy printouts and dictionary keys that
# cannot be looked up as 0.3; rounding to one decimal gives clean keys.
for weight_factor in np.round(np.arange(0.1, 1.0, 0.2), 1).tolist():
    # Fresh, unfitted estimator for every weight.
    models = {
           "XGBoost" : XGBClassifier()}
    print('------------------------------------------------')
    print('Running for Weight Factor - ', weight_factor)
    transform = MergedTransform(weight_factor=weight_factor)
    wrapper_event = EstimatorSelectionHelperWrapper(models, params, y_label='majority_same_topic')
    wrapper_event.fit(transform.transform(train_df))
    classifiers_event[weight_factor] = wrapper_event

In [None]:
for key in classifiers_event:
    print('====================================================')
    print('Running for Weight Factor - ', key)
    # Build the transform for this classifier's own weight. Reusing the
    # `transform` left over from the training loop would merge the test
    # features with the last weight tried for every classifier.
    transform = MergedTransform(weight_factor=key)
    classifiers_event[key].predict(transform.transform(test_df))
#     classifiers_event[key].save_models('../../data/models/models_mixed/','weighted_merge_'+str(key)+"_snap_topic")