In [27]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

## Load dataset

In [2]:
df = pd.read_csv("../data/disaster-tweets/train.csv"); df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Train a model

In [3]:
clf = make_pipeline(TfidfVectorizer(stop_words="english"), LogisticRegression())

In [4]:
clf.fit(df.text, df.target)

In [29]:
train_pred = clf.predict(df.text)
print(accuracy_score(df.target, train_pred))

0.8892683567581768


In [5]:
parameters = clf.steps[1][1].coef_[0]

In [6]:
parameters

array([ 0.3458073 , -0.11087299,  0.12963351, ...,  0.10882484,
       -0.1002334 ,  0.10963176])

In [7]:
feature_names = clf.steps[0][1].get_feature_names_out()

In [8]:
feature_names

array(['00', '000', '0000', ..., 'ûónegligence', 'ûótech', 'ûówe'],
      dtype=object)

In [9]:
# get top words
param_word_map = {p: w for p, w in zip(parameters, feature_names)}

In [10]:
t = sorted(param_word_map.items(), key=lambda x: -np.abs(x[0]))[:100]

In [11]:
t

[(4.747876498374984, 'http'),
 (3.607802535802156, 'hiroshima'),
 (3.237031945884168, 'fires'),
 (3.1427679248019955, 'california'),
 (2.8203103083323535, 'buildings'),
 (2.739049937078202, 'killed'),
 (2.6237433279746, 'train'),
 (2.596233506929001, 'suicide'),
 (2.5555141964872443, 'bombing'),
 (2.529292352078764, 'storm'),
 (2.5032821278413095, 'wildfire'),
 (2.4126772543959336, 'police'),
 (2.402317684800697, 'typhoon'),
 (2.4007546160901234, 'earthquake'),
 (2.2898249012345855, 'near'),
 (-2.237042623978157, 'new'),
 (2.129746463636853, 'forest'),
 (-2.1115956421405855, 'love'),
 (2.0807910085079744, 'mh370'),
 (2.0723140066758527, 'derailment'),
 (2.0520987394968393, 'japan'),
 (2.0475582681792326, 'drought'),
 (2.037906129169972, 'accident'),
 (1.969345309070003, 'floods'),
 (1.9653117797287716, 'disaster'),
 (1.9506948598293163, 'massacre'),
 (1.9339687344007457, 'crash'),
 (1.9118035595504266, 'mass'),
 (1.9052213841753092, 'evacuated'),
 (1.884731501163538, 'debris'),
 (-1.88

In [12]:
nationalities = ["myanmar", "german", "israel", "japan"]
protected_groups = ["migrants", "woman", "female"]

In [13]:
clf.predict_proba(["I'm an israeli"])

array([[0.30581131, 0.69418869]])

In [14]:
clf.predict_proba(["I'm a muslim."])

array([[0.56024287, 0.43975713]])

In [15]:
clf.predict_proba(["I'm an israeli woman"])

array([[0.46630715, 0.53369285]])

In [19]:
clf.predict_proba(["I'm from japan."])

array([[0.21454639, 0.78545361]])

In [17]:
clf.predict_proba(["We're a family from myanmar."])

array([[0.33445889, 0.66554111]])

In [20]:
clf.predict_proba(["An isreali on a plane"])

array([[0.32783676, 0.67216324]])

In [23]:
clf.predict_proba(["A jew on a plane."])

array([[0.32783676, 0.67216324]])

In [26]:
clf.predict_proba(["A woman on a plane."])

array([[0.49018913, 0.50981087]])

## Look at clusters

In [57]:
from bertopic import BERTopic

vectorizer = TfidfVectorizer(min_df=5)
embeddings = vectorizer.fit_transform(docs)

# Train our topic model using TF-IDF vectors
topic_model = BERTopic(stop_words="english")
topics, probs = topic_model.fit_transform(docs, embeddings)

ModuleNotFoundError: No module named 'bertopic'

In [None]:
topic_model.visualize_documents(docs, embeddings=embeddings)