Copyright (c) Snap Inc. 2020. This sample code is made available by Snap Inc. for informational purposes only. It is provided as-is, without warranty of any kind, express or implied, including any warranties of merchantability, fitness for a particular purpose, or non-infringement. In no event will Snap Inc. be liable for any damages arising from the sample code or your use thereof.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import hashlib
import spacy
import os
import re
import json
from collections import OrderedDict
from operator import itemgetter
from spacy.lang.en.stop_words import STOP_WORDS
import string
import gensim
from sklearn.metrics.pairwise import cosine_similarity
from xgboost import XGBClassifier
import sys
sys.path.append('../')

from tf_idf_vectorizer import *
from utils.snap_preprocessed_df_handle import *
from utils.EstimatorSelectionHelper import EstimatorSelectionHelper
from utils.classifier_setup import *

In [None]:
# Candidate estimators, keyed by the display name used for the grid below.
models = {"XGBoost": XGBClassifier()}

# Hyper-parameter grid explored for each estimator name.
# NOTE(review): the original cell annotated max_depth with "default 3" and
# n_estimators with "default 100"; library defaults are version-dependent,
# so verify them against the installed xgboost release.
params = {
    "XGBoost": {
        "colsample_bytree": [0.3, 0.5, 0.8, 1],
        "gamma": [0, 10, 50, 100],
        "max_depth": [2, 4, 6],
        "n_estimators": [50, 100],
        "subsample": [0.3, 0.5, 0.8, 1],
    },
}

def custom_scorer(y_true, y_pred):
    """Return the macro-averaged F1 score of *y_pred* against *y_true*.

    Both arguments are forwarded unchanged to ``f1_score`` with
    ``average='macro'``.

    NOTE(review): ``f1_score`` is not imported by name in this notebook; it is
    presumably supplied by one of the wildcard imports -- confirm.
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

In [None]:
# Location of the training dataframe (a .pkl file); handed to
# prepare_dataframe_tf_idf when building train_df.
TRAIN_PATH = '../../data/dataframes/df_unique_with_similarity.pkl'

In [None]:
def prepare_dataframe_tf_idf(PATH):
    """Return the dataframe at *PATH* with TF-IDF vectors and their similarity added.

    The frame comes from ``get_dataframe(PATH)`` and must contain the columns
    ``id1`` and ``id2``. Three columns are attached to it:

    * ``tfidf_v1`` / ``tfidf_v2`` -- the entry of the TF-IDF output selected
      by the position of ``id1`` / ``id2`` among the keys returned by
      ``get_tf_idf``;
    * ``tfidf_similarity`` -- ``cosine_similarity(tfidf_v1, tfidf_v2)[0][0]``
      evaluated row by row.

    Args:
        PATH: Forwarded unchanged to ``get_dataframe``; presumably the
            location of a pickled dataframe -- confirm against that helper.

    Returns:
        The dataframe object produced by ``get_dataframe``, with the three
        columns above added.

    Raises:
        ValueError: If an ``id1``/``id2`` value does not occur among the keys
            returned by ``get_tf_idf``.
    """
    df_with_keywords = get_dataframe(PATH)
    articles = get_unique_combined_with_id(df_with_keywords, 'Input.article', 'article')
    od_output, od_keys = get_tf_idf(articles, 'article', preprocessor=preprocessor,
                                    stop_words=stop_list, ngram_range=(1, 1))

    # Build the key -> position map once. The previous code rebuilt
    # list(od_keys) and scanned it linearly for every row of both id columns.
    # setdefault keeps the FIRST position of a repeated key, matching
    # list.index semantics.
    # Assumes the keys are hashable (e.g. string/int ids) -- TODO confirm.
    position_of = {}
    for position, key in enumerate(od_keys):
        position_of.setdefault(key, position)

    def tfidf_vector(article_id):
        """Return the TF-IDF entry for *article_id*; ValueError if unknown."""
        try:
            position = position_of[article_id]
        except KeyError:
            # Keep the exception type list.index raised for a missing id.
            raise ValueError('{!r} is not in list'.format(article_id)) from None
        return od_output[position]

    df_with_keywords['tfidf_v1'] = df_with_keywords['id1'].apply(tfidf_vector)
    df_with_keywords['tfidf_v2'] = df_with_keywords['id2'].apply(tfidf_vector)

    # cosine_similarity returns a 2-D result; [0][0] extracts its first entry,
    # so each stored vector is presumably a single-row matrix -- TODO confirm.
    df_with_keywords['tfidf_similarity'] = df_with_keywords[['tfidf_v1', 'tfidf_v2']].apply(
        lambda row: cosine_similarity(row['tfidf_v1'], row['tfidf_v2'])[0][0],
        axis=1,
    )

    return df_with_keywords

In [None]:
# Build the training dataframe, including the tfidf_v1 / tfidf_v2 /
# tfidf_similarity columns added by prepare_dataframe_tf_idf.
train_df = prepare_dataframe_tf_idf(TRAIN_PATH)

In [None]:
# Location of the test dataframe, named like TRAIN_PATH so both dataset
# locations are declared the same way instead of one being an inline literal.
TEST_PATH = '../../data/dataframes/df_test_unique_with_similarity.pkl'

# Build the test dataframe through the same preparation function as train_df.
test_df = prepare_dataframe_tf_idf(TEST_PATH)

# Event Similarity

In [None]:
# Selection helper for the same-event task, constructed from the `models` and
# `params` dictionaries defined earlier in this notebook.
helper_event = EstimatorSelectionHelper(models, params)

In [None]:
# Single feature: the TF-IDF cosine similarity, reshaped into an (n, 1) column.
event_features = train_df['tfidf_similarity'].values.reshape(-1, 1)
# Target: the 'majority_same_event' column of the training frame.
event_labels = train_df['majority_same_event']
# Wrap the macro-F1 function as a scorer where larger values are better.
event_scorer = make_scorer(custom_scorer, greater_is_better=True)

helper_event.fit(
    event_features,
    event_labels,
    cv=5,
    scoring=event_scorer,
    n_jobs=16,
    refit=True,
)

In [None]:
# Pass the test-set similarity feature and same-event labels to the helper's
# summary step (presumably an evaluation report -- see EstimatorSelectionHelper).
# NOTE(review): helper_event.fit() received a 2-D (n, 1) array, whereas this
# passes the 1-D 'tfidf_similarity' Series -- confirm summary() reshapes its
# input itself.
helper_event.summary(test_df['tfidf_similarity'], test_df['majority_same_event'])

# Topical Similarity

In [None]:
# For each positional column range, find per row the column holding the
# largest value and keep the last dot-separated piece of that column's name.
# The ranges are processed in order, so the second slice is taken after the
# first result column has been assigned, exactly as in the original cell.
# NOTE(review): positions 13:20 and 20:27 presumably hold the per-topic
# columns of the first and second article -- confirm against the dataframe.
for target_column, start, stop in (('majority_topic_1', 13, 20),
                                   ('majority_topic_2', 20, 27)):
    vote_columns = train_df.columns[start:stop]
    winning_column = train_df[vote_columns].idxmax(axis=1)
    train_df[target_column] = winning_column.str.split(".").str.get(-1)

In [None]:
# Boolean label: True where both majority-topic columns hold the same value.
train_df['majority_same_topic'] = train_df['majority_topic_1'].eq(train_df['majority_topic_2'])

In [None]:
# For each positional column range, find per row the column holding the
# largest value and keep the last dot-separated piece of that column's name.
# The ranges are processed in order, so the second slice is taken after the
# first result column has been assigned, exactly as in the original cell.
# NOTE(review): positions 13:20 and 20:27 presumably hold the per-topic
# columns of the first and second article -- confirm against the dataframe.
for target_column, start, stop in (('majority_topic_1', 13, 20),
                                   ('majority_topic_2', 20, 27)):
    vote_columns = test_df.columns[start:stop]
    winning_column = test_df[vote_columns].idxmax(axis=1)
    test_df[target_column] = winning_column.str.split(".").str.get(-1)

In [None]:
# Boolean label: True where both majority-topic columns hold the same value.
test_df['majority_same_topic'] = test_df['majority_topic_1'].eq(test_df['majority_topic_2'])

In [None]:
# Separate selection helper for the same-topic task, constructed from the same
# `models` and `params` dictionaries defined earlier in this notebook.
helper_topic = EstimatorSelectionHelper(models, params)

In [None]:
# Single feature: the TF-IDF cosine similarity, reshaped into an (n, 1) column.
topic_features = train_df['tfidf_similarity'].values.reshape(-1, 1)
# Target: the 'majority_same_topic' column of the training frame.
topic_labels = train_df['majority_same_topic']
# Wrap the macro-F1 function as a scorer where larger values are better.
topic_scorer = make_scorer(custom_scorer, greater_is_better=True)

helper_topic.fit(
    topic_features,
    topic_labels,
    cv=5,
    scoring=topic_scorer,
    n_jobs=16,
    refit=True,
)

In [None]:
# Pass the test-set similarity feature and same-topic labels to the helper's
# summary step (presumably an evaluation report -- see EstimatorSelectionHelper).
# NOTE(review): helper_topic.fit() received a 2-D (n, 1) array, whereas this
# passes the 1-D 'tfidf_similarity' Series -- confirm summary() reshapes its
# input itself.
helper_topic.summary(test_df['tfidf_similarity'], test_df['majority_same_topic'])