In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
import nltk
import inflect
import numpy as np
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm
tqdm.pandas()
from nltk import sent_tokenize
from nltk.corpus import stopwords
from sklearn import model_selection
from sklearn import naive_bayes
from sklearn.utils.class_weight import compute_sample_weight

from nlp_surveillance.classifier import extract_sentence
from nlp_surveillance.pipeline import ExtractSentencesAndLabel, RecommenderLabeling, RecommenderTierAnnotation
from nlp_surveillance.classifier import summarize
from utils.my_utils import split_list_and_distribute_to_new_rows

# Count/Date Classifier: Most Informative Words

In [7]:
df_sent = ExtractSentencesAndLabel('counts').data_output()
df_sent['sentence']= df_sent['sentence'].apply(lambda x: list(set(x.split()) - set(stopwords.words('english'))))
df_sent.head()

Unnamed: 0,sentence,label
0,"[Nigeria, response, Centre, 5577×Contact, +234...",False
1,"[Nigeria, response, Centre, 5577×Contact, +234...",False
2,"[Nigeria, response, Centre, 5577×Contact, +234...",False
3,"[facilitiesSurvivors, United, event, Western, ...",False
4,"[facilitiesSurvivors, United, event, Western, ...",False


In [8]:
df_sent = split_list_and_distribute_to_new_rows(df_sent, 'sentence')
df_sent.head()

Unnamed: 0,label,sentence
0,False,Nigeria
1,False,response
2,False,Centre
3,False,5577×Contact
4,False,+234(0)809


In [10]:
as_tuples = df_sent.apply(tuple, axis=1).tolist()
as_tuples = [({'word':word}, label) for label, word in as_tuples]
as_tuples

[({'word': 'Nigeria'}, False),
 ({'word': 'response'}, False),
 ({'word': 'Centre'}, False),
 ({'word': '5577×Contact'}, False),
 ({'word': '+234(0)809'}, False),
 ({'word': 'diseases.Connect'}, False),
 ({'word': 'epidemics'}, False),
 ({'word': '9700'}, False),
 ({'word': 'handlesToll'}, False),
 ({'word': 'year'}, False),
 ({'word': 'detection'}, False),
 ({'word': '711'}, False),
 ({'word': '2011'}, False),
 ({'word': 'challenges'}, False),
 ({'word': 'CentreToll'}, False),
 ({'word': '+234(0)708'}, False),
 ({'word': 'prevention,'}, False),
 ({'word': 'Disease'}, False),
 ({'word': 'NCDCThe'}, False),
 ({'word': 'emergencies'}, False),
 ({'word': 'non-communicable'}, False),
 ({'word': '(NCDC)'}, False),
 ({'word': 'health'}, False),
 ({'word': '955'}, False),
 ({'word': 'communicable'}, False),
 ({'word': '0010Whatsapp:'}, False),
 ({'word': 'Nigeria’s'}, False),
 ({'word': 'Subscribe'}, False),
 ({'word': 'Free'}, False),
 ({'word': '0800'}, False),
 ({'word': 'Number:'}, False)

In [11]:
clf = nltk.NaiveBayesClassifier.train(as_tuples)
clf.show_most_informative_features(20)

Most Informative Features
                    word = 'variant'        True : False  =     31.1 : 1.0
                    word = 'poultry'        True : False  =     27.1 : 1.0
                    word = 'Laibin'         True : False  =     22.2 : 1.0
                    word = '42-year-old'    True : False  =     22.2 : 1.0
                    word = 'strains.'       True : False  =     19.2 : 1.0
                    word = '13For'          True : False  =     19.2 : 1.0
                    word = 'H1N2'           True : False  =     19.2 : 1.0
                    word = 'straight'       True : False  =     19.2 : 1.0
                    word = 'provinces.Aug'   True : False  =     19.2 : 1.0
                    word = 'Ohio.The'       True : False  =     19.2 : 1.0
                    word = 'fair,'          True : False  =     19.2 : 1.0
                    word = '(H1N2v)'        True : False  =     19.2 : 1.0
                    word = 'desert'         True : False  =     17.3 : 1.

In [12]:
df_sent_date = ExtractSentencesAndLabel('dates').data_output()
df_sent_date['sentence']= df_sent_date['sentence'].apply(lambda x: list(set(x.split()) - set(stopwords.words('english'))))
df_sent_date = split_list_and_distribute_to_new_rows(df_sent_date, 'sentence')
as_tuples_date = df_sent_date.apply(tuple, axis=1).tolist()
as_tuples_date = [({'sent':word}, label) for label, word in as_tuples_date]

In [13]:
clf_date = nltk.NaiveBayesClassifier.train(as_tuples_date)
clf_date.show_most_informative_features(20)

Most Informative Features
                    sent = 'worm'           True : False  =      6.0 : 1.0
                    sent = 'occurring'      True : False  =      5.3 : 1.0
                    sent = 'Northern'       True : False  =      5.3 : 1.0
                    sent = 'emerging'       True : False  =      5.3 : 1.0
                    sent = 'patients'      False : True   =      4.5 : 1.0
                    sent = 'South'          True : False  =      4.1 : 1.0
                    sent = 'deaths'        False : True   =      3.9 : 1.0
                    sent = 'responseJoint'   True : False  =      3.8 : 1.0
                    sent = '—'              True : False  =      3.8 : 1.0
                    sent = '24,'            True : False  =      3.8 : 1.0
                    sent = 'Ireland'        True : False  =      3.8 : 1.0
                    sent = 'Referral'       True : False  =      3.8 : 1.0
                    sent = 'Kingdom'        True : False  =      3.8 : 1.

# Recommendation Classifier

In [14]:
df_recommand = RecommenderLabeling().data_output()

In [15]:
df_tiers = RecommenderTierAnnotation().data_output()

In [17]:
df_tiers.head()

Unnamed: 0,counts,date,diseases,geoname,label,text
0,"[357, 60]","[[2018-12-06 00:00:00, 2018-12-27 00:00:00], [...",Ebola hemorrhagic fever,Democratic Republic of the Congo,False,Ebola virus disease – Democratic Republic of t...
1,[1],"[[2018-10-31 00:00:00, 2018-12-01 00:00:00]]",,Kingdom of Saudi Arabia,False,Middle East respiratory syndrome coronavirus (...
2,"[6, 1, 5]","[[2016-01-01 00:00:00, 2019-01-01 00:00:00]]",typhoid fever,Islamic Republic of Pakistan,False,Typhoid fever – Islamic Republic of PakistanDi...
3,[18],"[[2018-08-01 00:00:00, 2018-09-01 00:00:00]]",Ebola hemorrhagic fever,Democratic Republic of the Congo,False,Ebola virus disease – Democratic Republic of t...
4,"[1, 1]","[[2018-11-18 00:00:00, 2018-11-19 00:00:00]]",yellow fever,Gambia,False,Yellow Fever – Kingdom of the NetherlandsDisea...


In [18]:
df_clf = pd.concat([df_recommand, df_tiers],axis=1, ignore_index=False)

In [19]:
df_clf = df_clf.dropna()

In [20]:
df_clf.head()

Unnamed: 0,extracted_text,label,counts,date,diseases,geoname,label.1,text
0,Ebola virus disease – Democratic Republic of t...,False,"[357, 60]","[[2018-12-06 00:00:00, 2018-12-27 00:00:00], [...",Ebola hemorrhagic fever,Democratic Republic of the Congo,False,Ebola virus disease – Democratic Republic of t...
2,Typhoid fever – Islamic Republic of PakistanDi...,False,"[6, 1, 5]","[[2016-01-01 00:00:00, 2019-01-01 00:00:00]]",typhoid fever,Islamic Republic of Pakistan,False,Typhoid fever – Islamic Republic of PakistanDi...
3,Ebola virus disease – Democratic Republic of t...,False,[18],"[[2018-08-01 00:00:00, 2018-09-01 00:00:00]]",Ebola hemorrhagic fever,Democratic Republic of the Congo,False,Ebola virus disease – Democratic Republic of t...
4,Yellow Fever – Kingdom of the NetherlandsDisea...,False,"[1, 1]","[[2018-11-18 00:00:00, 2018-11-19 00:00:00]]",yellow fever,Gambia,False,Yellow Fever – Kingdom of the NetherlandsDisea...
5,Ebola virus disease – Democratic Republic of t...,False,"[51, 49, 2, 17]","[[2018-10-01 00:00:00, 2018-11-01 00:00:00]]",Ebola hemorrhagic fever,Democratic Republic of the Congo,False,Ebola virus disease – Democratic Republic of t...


In [None]:
list(lambda x: operator.contains(*x), product)

In [74]:
df_clf['counts'] = df_clf['counts'].apply(lambda x: list(set(x)))

In [80]:
num_to_str = inflect.engine()
num_to_str.

In [79]:
num_to_str = inflect.engine()
df_clf['counts'] = df_clf['counts'].apply(int(np.nanmean)).apply(lambda x: num_to_str)

In [78]:
df_clf = split_list_and_distribute_to_new_rows(df_clf, 'counts')

ValueError: operands could not be broadcast together with shape (4202,) (2101,)

In [None]:
to_train_d['features'] = to_train_d[['geoname', 'diseases', 'counts']].values.tolist()

In [None]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection as ms
import sklearn.feature_extraction.text as text
import sklearn.naive_bayes as nb
from sklearn.ensemble import RandomForestClassifier

In [None]:
df = RecommenderLabeling().data_output()

In [None]:
df['annotated'] = df['annotated'].apply(lambda x: x.text)
df['annotated'] = df['annotated'].apply(lambda x: set(x.split()) - set(sw))
df['annotated'] = df['annotated'].apply(lambda x: ' '.join(list(x)))

In [None]:
from nltk.corpus import stopwords

In [None]:
sw = stopwords.words('english')

# Oversample

In [None]:
pipe1 = make_pipeline_imb(TfidfVectorizer(),
                         ADASYN(),
                         MultinomialNB())
pipe2 = make_pipeline_imb(TfidfVectorizer(),
                         ADASYN(),
                         MultinomialNB())
pipe3 = make_pipeline_imb(TfidfVectorizer(),
                         MultinomialNB())

In [None]:

count = to_train_d['counts'].replace(-np.inf, 0.).apply(int).apply(lambda x: p.number_to_words(x))

In [None]:
to_train_d['features'] = to_train_d['features'].apply(lambda x: ' '.join(x))

In [None]:
(X_train, X_test, y_train, y_test) = \
    ms.train_test_split(df['annotated'], df['label'], test_size=.2)
y_balanced = compute_sample_weight(class_weight='balanced', y=y_train)

In [None]:
# pipe1.fit(X_train, y_train)
# pipe2.fit(X_train, y_train)
pipe3.fit(X_train, y_train)

In [None]:
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_train)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_pred, y_train))  #  TEXT WITHOUT STOPWORDS

In [None]:
print(classification_report_imbalanced(y_test, y_pred))  #  TEXT WITHOUT STOPWORDS

In [None]:
print(classification_report_imbalanced(y_test, y_pred))  #  TEXT WITHOUT STOPWORDS

In [None]:
print(classification_report_imbalanced(y_test, y_pred))  # RAW TEXT

In [None]:
print(classification_report_imbalanced(y_test, y_pred))  # COMBINED

In [None]:
print(classification_report_imbalanced(y_test, y_pred))  # DISEASE

In [None]:
print(classification_report_imbalanced(y_test, y_pred))  # GEONAME

In [None]:
print(classification_report_imbalanced(y_test, y_pred))  # COUNT

In [None]:
set(y_balanced)

In [None]:
y_balanced = y_balanced/min(y_balanced)

In [None]:
y_balanced = [int(np.ceil(i)) for i in y_balanced]

In [None]:
y_test = [int(np.ceil(i)) for i in y_test]

In [None]:
set(y_balanced)

In [None]:
pipe.fit(X_train, y_balanced)
y_pred = pipe.predict(X_test)

In [None]:
print(classification_report_imbalanced(y_test, y_pred))  #  weighted

In [None]:
from sklearn.utils.class_weight import compute_sample_weight

In [None]:
tf = text.TfidfVectorizer()
X = tf.fit_transform(to_train_d['geoname'])
y = to_train_d['label'].apply(int)
y_balanced = compute_sample_weight(class_weight='balanced', y=y)

In [None]:
p = 100 * X.nnz / float(X.shape[0] * X.shape[1])
print(f"Each sample has ~{p:.2f}% non-zero features.")


In [None]:
(X_train, X_test, y_train, y_test) = \
    ms.train_test_split(X, y, test_size=.2)

In [None]:
bnb = ms.GridSearchCV(
    nb.BernoulliNB(),
    param_grid={'alpha': np.logspace(-2., 2., 50)})
bnb.fit(X_train, y_train)

In [None]:
bnb.score(X_test, y_test)


In [None]:
predicted = bnb.predict(X_test)

In [None]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))

In [None]:
# We first get the words corresponding to each feature
names = np.asarray(tf.get_feature_names())
# Next, we display the 50 words with the largest
# coefficients.
print(','.join(names[np.argsort(
    bnb.best_estimator_.coef_[0, :])[::-1][:50]]))

# STUFF

In [None]:
import pandas as pd
from epitator.annotator import AnnoDoc
from epitator.geoname_annotator import GeonameAnnotator
from epitator.resolved_keyword_annotator import ResolvedKeywordAnnotator
from epitator.count_annotator import CountAnnotator
from epitator.date_annotator import DateAnnotator 

In [None]:
from nlp_surveillance.pipeline import ScrapeFromURLsAndExtractText

In [None]:
df = ScrapeFromURLsAndExtractText('promed').data_output()

In [None]:
df.head()['extracted_text']

In [None]:
df.to_hdf('delete.hdf', key='df', format='table')

In [None]:
import dask.dataframe as dd

In [None]:
dd1 = dd.read_hdf('delete.hdf', key='df')

In [None]:
dd2 = dd1['extracted_text'].apply(lambda x: AnnoDoc(x).add_tiers(CountAnnotator()))

In [None]:
glob

In [None]:
dd2.compute()

In [None]:
from collections import Counter

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced

In [None]:
categories = ['alt.atheism', 'talk.religion.misc',
              'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test',
                                     categories=categories)

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN

In [None]:
from nlp_surveillance.pipeline import ScrapeFromURLsAndExtractText
ScrapeFromURLsAndExtractText('who').data_output().head()

In [None]:
from nlp_surveillance.pipeline import RecommenderTierAnnotation

In [None]:
from sklearn.metrics import classification

# Embeddings

In [None]:
%matplotlib inline  
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import scale
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.base import TransformerMixin
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

In [None]:
class MeanEmbeddingTransformer(TransformerMixin):
    
    def __init__(self):
        self._vocab, self._E = self._load_words()
        
    
    def _load_words(self):
        E = {}
        vocab = []

        with open('nlp_surveillance/glove.6B.50d.txt', 'r', encoding="utf8") as file:
            for i, line in enumerate(file):
                l = line.split(' ')
                if l[0].isalpha():
                    v = [float(i) for i in l[1:]]
                    E[l[0]] = np.array(v)
                    vocab.append(l[0])
        return np.array(vocab), E            

    
    def _get_word(self, v):
        for i, emb in enumerate(self._E):
            if np.array_equal(emb, v):
                return self._vocab[i]
        return None
    
    def _doc_mean(self, doc):
        return np.mean(np.array([self._E[w.lower().strip()] for w in doc if w.lower().strip() in self._E]), axis=0)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return np.array([self._doc_mean(doc) for doc in X])
    
    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

In [None]:
def plot_roc(clf, X_test, y_test):
    y_pred = clf.predict(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    plt.plot(fpr, tpr)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    
def print_scores(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification.accuracy_score(y_pred,y_test))
    print(confusion_matrix(y_pred,y_test))
    print('F1 score: {:3f}'.format(f1_score(y_test, y_pred)))
    print('AUC score: {:3f}'.format(roc_auc_score(y_test, y_pred)))

In [None]:
import pandas as pd
df = pd.read_csv('data/recommender/with_label.csv')

In [None]:
df.memory_usage(deep=True).sum()

In [None]:
df['label'].value_counts()

In [None]:
X = df['extracted_text'].values
y = df['label'].values

In [None]:
def tokenize_and_transform(X, sample_size):
    essays1 = X
    tok_es1 = [word_tokenize(doc) for doc in essays1[:sample_size]]
    met = MeanEmbeddingTransformer()
    X_transform = met.fit_transform(tok_es1)
    return X_transform

In [None]:
X_transform = tokenize_and_transform(X, 3200)

In [None]:
np.savetxt('X_embed.csv', X_transform, delimiter=',')

In [None]:
X_transform = np.loadtxt('X_embed.csv', delimiter=',')

In [None]:
np.shape(X_transform)

In [None]:
np.shape(y)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X_resample,
#                                                     y_resample, stratify=y_resample, random_state=0)



In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_transform,
                                                    y[:3200], random_state=0)
rus = RandomUnderSampler(random_state=0)
X_resample, y_resample = rus.fit_sample(X_train, y_train)


In [None]:
rus = RandomOverSampler(random_state=0)
X_resample, y_resample = rus.fit_sample(X_transform, y[:X_transform.shape[0]])

In [None]:
lr = LogisticRegression()
print_scores(lr, X_resample, y_resample, X_test, y_test)
plot_roc(lr, X_test, y_test)

In [None]:
knn = KNeighborsClassifier()
print_scores(knn, X_resample, y_resample, X_test, y_test)
plot_roc(knn, X_test, y_test)

In [None]:
rf = RandomForestClassifier().fit(X_resample, y_resample)
print_scores(rf, X_resample, y_resample, X_test, y_test)
plot_roc(rf, X_test, y_test)

In [None]:
svc = SVC().fit(X_resample, y_resample)
print_scores(svc, X_resample, y_resample, X_test, y_test)
plot_roc(svc, X_test, y_test)

In [None]:
svc = LinearSVC().fit(X_resample, y_resample)
print_scores(svc, X_resample, y_resample, X_test, y_test)
plot_roc(svc, X_test, y_test)

In [None]:
dtc = DecisionTreeClassifier().fit(X_resample, y_resample)
y_pred = dtc.predict(X_test)
print_scores(dtc, X_resample, y_resample, X_test, y_test)
plot_roc(dtc, X_test, y_test)

In [None]:
mlp = MLPClassifier().fit(X_resample, y_resample)
print_scores(mlp, X_resample, y_resample, X_test, y_test)
plot_roc(mlp, X_test, y_test)

In [None]:
gs = GridSearchCV(LogisticRegression(), 
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1]}, scoring="roc_auc", cv=4)
gs = gs.fit(X_resample, y_resample)
print(gs.best_params_)
print('best score: {:3f}'.format(gs.best_score_))
plot_roc(gs, X_resample, y_resample)

In [None]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
gs = GridSearchCV(LinearSVC(), 
             param_grid=param_grid, scoring="roc_auc", cv=4)
gs = gs.fit(X_resample, y_resample)
print(gs.best_params_)
print('best score: {:3f}'.format(gs.best_score_))
plot_roc(gs, X_resample, y_resample)

In [None]:
param_grid = {'activation': ['relu', 'logistic', 'tanh'],
              'alpha': [0.0001, 0.001, 0.01],
              'learning_rate': ['constant', 'invscaling', 'adaptive'], 'tol': [0.01]}
gs = GridSearchCV(MLPClassifier(), 
             param_grid=param_grid, scoring="roc_auc", cv=4)
gs = gs.fit(X_transform, y[:3200])
print(gs.best_params_)
print('best score: {:3f}'.format(gs.best_score_))
plot_roc(gs, X_resample, y_resample)

In [None]:
from sklearn.cluster import KMeans

In [None]:
X_transform_cluster = KMeans(n_clusters=5).fit_transform(X_transform, y[:3200])

In [None]:
rus = RandomUnderSampler(random_state=0)
X_resample_cluster, y_resample_cluster = rus.fit_sample(X_transform_cluster, y[:X_transform_cluster.shape[0]])

In [None]:
X_train_cluster, X_test_cluster, y_train_cluster, y_test_cluster = train_test_split(X_resample_cluster,
                                                    y_resample_cluster, stratify=y_resample_cluster, random_state=0)

In [None]:
lr = LogisticRegression()
print_scores(lr, X_train_cluster, y_train_cluster, X_test_cluster, y_test_cluster)
plot_roc(lr, X_test_cluster, y_test_cluster)

In [None]:
from sklearn.decomposition import PCA

In [None]:
X_transform_pca = PCA().fit_transform(X_transform, y[:3200])

In [None]:
rus = RandomUnderSampler(random_state=0)
X_resample_pca, y_resample_pca = rus.fit_sample(X_transform_pca, y[:X_transform_pca.shape[0]])

In [None]:
X_train_cluster, X_test_cluster, y_train_cluster, y_test_cluster = train_test_split(X_resample_cluster,
                                                    y_resample_cluster, stratify=y_resample_cluster, random_state=0)

In [None]:
lr = LogisticRegression()
print_scores(lr, X_train_cluster, y_train_cluster, X_test_cluster, y_test_cluster)
plot_roc(lr, X_test_cluster, y_test_cluster)

# t-SNE

In [None]:
np.shape(X_transform)

In [None]:
np.shape(y)

In [None]:
from sklearn.manifold import TSNE
import time

In [None]:
n_sne = 3200
time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=10, n_iter=500)
tsne_results = tsne.fit_transform(X_transform)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [None]:
import pandas as pd
df_tsne = pd.DataFrame(tsne_results, columns=['x', 'y'])
df_tsne['labels'] = y[:3200]

In [None]:
from plotnine import *
theme_set(theme_bw())
(ggplot(df_tsne, aes(x='x', y='y', color='labels')) 
 + geom_point()
 + xlab("t-SNE-x") + ylab("t-SNE-y") + ggtitle("doc embedding t-SNE")
 + scale_color_manual(labels = (True, False), values = ("pink", "purple"))

)

In [None]:
from nlp_surveillance.scraper import who_scraper

In [None]:
who_scraper.scrape()