# Setup

In [169]:
# similarity function
from scipy import spatial
import pandas as pd

import json
import pandas as pd

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


def cos_similarity(a, b):
    return 1 - spatial.distance.cosine(a, b)


# load data
challenge = pd.read_csv("data/challenge.csv")
cnn = pd.read_csv("data/cnn_samples.csv")
federal = pd.read_csv("data/federal_samples.csv")

# fix embedding column
df = pd.concat([cnn, federal], ignore_index=True)
# apply eval on embedding column
df["embeddings"] = df["embeddings"].apply(lambda x: eval(x))

with open('data/mystery.json') as f:
    mystery = json.load(f)


# store challenge embeddings
all_embeddings = [eval(emb) for emb in challenge['embeddings']]
all_embeddings.append(mystery["embedding"])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/benedictneo/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Compute similarity for each embedding

In [167]:
# for each embedding in all_embeddings, compute cosine similarity with embeddings in the df and return the top 5
# and from that top 5, apply topic modelling on the text column and return the top 5 topics

def get_top_5_similarities(df):

    top_5_similarities = []

    for embedding in all_embeddings:
        df["cosine"] = df["embeddings"].apply(
            lambda x: cos_similarity(embedding, x)
        )
        top_5_similarities.append(df.sort_values(by="cosine", ascending=False).head(5).text.tolist())

    return top_5_similarities

embeddings_df = get_top_5_similarities(df)
embeddings_df[0][0][:100]

'LONDON, England (CNN) -- The Screening Room went to the Theatre Royal in Drury Lane in the heart of '

In [185]:
# for each text in embeddings_df, perform topic modelling and return the top 5 topics

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD


for i, text_list in enumerate(embeddings_df):

    # count vectorizer
    cv = CountVectorizer(stop_words="english")
    cv.fit(text_list)
    vectorized_text = cv.transform(text_list)

    # LDA
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(vectorized_text)
    lda_output = lda.transform(vectorized_text)

    # NMF 
    nmf = NMF(n_components=5, random_state=42, init="nndsvda")
    nmf.fit(vectorized_text)
    nmf_output = nmf.transform(vectorized_text)

    # SVD
    svd = TruncatedSVD(n_components=5, random_state=42)
    svd.fit(vectorized_text)
    svd_output = svd.transform(vectorized_text)

    # get top 5 topics for each model
    lda_top_5 = []
    nmf_top_5 = []
    svd_top_5 = []

    for topic in lda.components_:
        lda_top_5.append([cv.get_feature_names_out()[i] for i in topic.argsort()[-5:]])

    for topic in nmf.components_:
        nmf_top_5.append([cv.get_feature_names_out()[i] for i in topic.argsort()[-5:]])

    for topic in svd.components_:
        svd_top_5.append([cv.get_feature_names_out()[i] for i in topic.argsort()[-5:]])

    print(f"Top 5 topics for LDA model for challenge {i+1}:")
    # print topic n for topic in lda_top_5
    for n, topic in enumerate(lda_top_5):
        print(f"Topic {n+1}: {topic}")
    print()

    print(f"Top 5 topics for NMF model for challenge {i+1}:")
    for n, topic in enumerate(nmf_top_5):
        print(f"Topic {n+1}: {topic}")
    print()
    print(f"Top 5 topics for SVD model for challenge {i+1}:")
    for n, topic in enumerate(svd_top_5):
        print(f"Topic {n+1}: {topic}")
    print()

Top 5 topics for LDA model for challenge 1:
Topic 1: ['ciudad', 'real', 'spanish', 'year', 'airport']
Topic 2: ['asia', 'shoot', 'world', 'outside', 'million']
Topic 3: ['theme', 'malaysia', 'legoland', 'park', 'water']
Topic 4: ['asia', 'shoot', 'world', 'outside', 'million']
Topic 5: ['training', 'space', '2014', 'travel', 'dates']

Top 5 topics for NMF model for challenge 1:
Topic 1: ['book', 'valid', '2014', 'travel', 'dates']
Topic 2: ['flight', 'said', 'garriott', 'training', 'space']
Topic 3: ['ciudad', 'real', 'year', 'spanish', 'airport']
Topic 4: ['composer', 'indian', 'film', 'music', 'rahman']
Topic 5: ['lego', 'malaysia', 'legoland', 'park', 'water']

Top 5 topics for SVD model for challenge 1:
Topic 1: ['valid', 'book', '2014', 'travel', 'dates']
Topic 2: ['air', 'garriott', 'said', 'training', 'space']
Topic 3: ['ciudad', 'real', 'spanish', 'year', 'airport']
Topic 4: ['composer', 'indian', 'film', 'music', 'rahman']
Topic 5: ['lego', 'malaysia', 'legoland', 'park', 'wat

# Build classifier

In [208]:
# concatenate cnn and federal samples
df = df[df.source != "FCC"]
df.source.value_counts()

CNN         738
OSHA         25
IRS          24
SEC          20
DOJ          10
EEOC         10
USPTO        10
Treasury     10
HHS          10
Name: source, dtype: int64

In [214]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score

X_train, X_test, y_train, y_test = train_test_split(df['embeddings'].to_list(), df['source'].to_list(), test_size=0.2, random_state=42)


models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
models.append(('RF', RandomForestClassifier(n_estimators=100)))
models.append(('AB', AdaBoostClassifier()))
# models.append(('GB', GradientBoostingClassifier()))

results = []
names = []

# evaluate each model with embeddings as features

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

for name, model in models:
    # average weigthed 
    kfold = KFold(n_splits=10, shuffle=True)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='f1_weighted')
    results.append(cv_results)
    names.append(name)
    print(f"{name}: {cv_results.mean()}")

LR: 0.9543297988728392
LDA: 0.9481193131320917
KNN: 0.945311847276147
CART: 0.9266355280298602
NB: 0.9005678026058443
SVM: 0.9352742628851487
RF: 0.9364769093792724
AB: 0.826900206396591
GB: 0.9177730581022253


In [215]:
# test logistic regression on test set

lr = LogisticRegression(solver='liblinear', multi_class='ovr')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
f1_score(y_test, y_pred, average='weighted')

0.9557288393044208