In [17]:
import collections

from web.models import Article, DateEntity
from nltk import sent_tokenize
from django.shortcuts import _get_queryset

In [18]:
articles = Article.objects.filter(is_ground_truth = 1, classification_score = 1)

In [19]:
def get_object_or_None(klass, *args, **kwargs):
    queryset = _get_queryset(klass)
    try:
        return queryset.get(*args, **kwargs)
    except queryset.model.DoesNotExist:
        return None

<h1>Article classification</h1>

In [135]:
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn import metrics

In [147]:
df = pd.DataFrame.from_records(Article.objects.all().values()) # Load articles into df
df = df.loc[df['is_ground_truth'] == 1] # Only keep labeled data
df['classification_score'] = (df['classification_score'] >= 1).astype(bool) # Convert classification_score to bool
df = df.sample(frac=1) # Shuffle
print("Articles labeled as strikes: {}.\nRest: {}".format(positive_size, len(df) - positive_size))

Articles labeled as strikes: 50.
Rest: 107


Split dataset into equal-sized classes

In [148]:
positive = df.loc[df['classification_score'] == True]
positive_size = positive.shape[0]
negative = df.loc[df['classification_score'] == False].sample(positive_size)

#df = pd.concat([positive, negative])
df = df.sample(frac=1)

print("Articles labeled as strikes: {}.\nRest: {}".format(positive_size, len(df) - positive_size))

Articles labeled as strikes: 50.
Rest: 107


In [149]:
def classify(df, dictionary, size=None):
    patterns = [re.compile('|'.join(r'{}'.format(word) for word in group), re.IGNORECASE) for group in dictionary]
    return [any(pattern.search(article['body'][:size]) is not None for pattern in patterns) for _, article in df.iterrows()]

In [171]:
stopwords = (('drone', 'uav'), ('strike',))
pred_y = classify(df, stopwords, 150)
true_y = df['classification_score']
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [172]:
metrics.accuracy_score(true_y, pred_y)

0.8726114649681529

In [173]:
print(metrics.classification_report(true_y, pred_y))

              precision    recall  f1-score   support

       False       0.95      0.86      0.90       107
        True       0.75      0.90      0.82        50

   micro avg       0.87      0.87      0.87       157
   macro avg       0.85      0.88      0.86       157
weighted avg       0.89      0.87      0.88       157



In [174]:
print(metrics.confusion_matrix(true_y, pred_y))

[[92 15]
 [ 5 45]]


<h1>Data extraction:</h1>

In [20]:
# Get sentence in which label appears in article
def get_sentence_label(label, article=None):
    if(not label):
        return ""
    if(not article):
        article = label.seed
    
    len_read = 0
    label = label.get_dict()
    sentences = sent_tokenize(article.body)
    for sentence in sentences:
        if 0 < label['start_index'] - len_read < len(sentence):
            return sentence
        
        len_read += len(sentence)
        
    return sentences[-1]

In [21]:
counter = collections.Counter([])
for article in articles:
    sentence = get_sentence_label(get_object_or_None(DateEntity, seed=article))
    counter += collections.Counter(sentence.split())
print(counter)

Counter({'the': 62, 'in': 39, 'of': 21, 'on': 20, 'carried': 17, 'said': 16, 'out': 16, 'airstrike': 12, 'was': 12, 'district.': 12, 'The': 11, 'were': 10, 'a': 9, 'Silab': 8, 'Corps': 8, 'statement': 8, 'Afghan': 7, 'late': 7, 'US': 7, '201st': 6, 'Wednesday': 6, 'militants': 6, 'According': 6, 'to': 6, 'Military': 5, 'East': 5, 'airstrikes': 5, 'killed': 5, 'vicinity': 5, 'district': 5, 'provincial': 5, 'government': 5, 'night': 5, 'Achin': 4, 'latest': 4, 'media': 4, 'drone': 4, 'least': 4, 'A': 4, 'area': 4, 'local': 4, 'officials,': 4, 'security': 4, 'that': 3, 'Thursday': 3, 'forces': 3, 'Haska': 3, 'Mina': 3, 'office': 3, 'past': 3, 'and': 3, 'strike': 3, 'Nangarhar': 3, 'officials': 3, 'by': 3, 'leaving': 3, 'dead.': 3, 'at': 3, 'Tuesday': 3, 'night.': 2, 'hours': 2, '24': 2, 'province': 2, 'targeted': 2, 'Monday,': 2, 'two': 2, 'restive': 2, 'commander': 2, 'Pakistan': 2, 'Tuesday.': 2, 'an': 2, 'evening': 2, 'police': 2, 'conducted': 2, 'Kot': 2, 'Monday': 2, 'Chardara': 2, '