### Imports

In [1]:
import pandas as pd
import os
import json
from tqdm import tqdm 
#preprocessing
import spacy
import numpy as np
# modelling
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:
%%capture
!python -m spacy download de_core_news_lg

In [3]:
# connect with google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# redirect the working directory of this script to the data folder
%cd /content/drive/MyDrive/Work/Frontline/data/
#%cd /content/drive/MyDrive/data/

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


In [5]:
# custom module
import preprocessing

### Import Data


Merge all annotation files

In [6]:
dfs=[]
for doc in os.listdir("annotated"):
  if doc.startswith("annotations"):
    #read json data
    json_data=json.load(open("annotated/"+doc, encoding="utf-8"))
    #convert to dataframe
    data=pd.DataFrame(json_data["documents"])
    #for now: filter out paragraphs that have not been annotated 
    data=data[data["annotations"].apply(len)>0]    
    dfs.append(data)


In [7]:
# merge jsons
data=pd.concat(dfs)
data=data.reset_index(drop=True)

In [9]:
data["tags"]=[""]*data.shape[0]

In [10]:
# extract document labels
# ASSUMPTION: all annotations are document labels, ie. do not have start and end as those are ignored
 
for idx, entry in data.iterrows():
  # in case there are more than 1 label
  for label in range(len(entry["annotations"])):
    data.loc[idx, "tags"]+=entry["annotations"][label]["concept"]["preferred_label"]["name"]+"*"

In [11]:
df=data[["id","tags", "text"]]

Change old label names to new label names

In [None]:
df.tags=df.tags.str.replace("NA","Domestic Violence")
df.tags=df.tags.str.replace("Victim blaming","Statement of responsibility")

In [13]:
# converting the annotation column with one hot encoding
df=pd.merge(df,df["tags"].str.get_dummies(sep="*"), left_index=True, right_index=True)
# dropping the tags column
df=df.drop("tags",1)

  df=df.drop("tags",1)


In [15]:
### FOR NOW: focus on those with a single label
df=df[df.iloc[:,-4:].sum(1)==1]
df["label"]=df.iloc[:,-4:].idxmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"]=df.iloc[:,-4:].idxmax(axis=1)


### Preprocess Data

In [17]:
# Load model
spacy_mod = spacy.load("de_core_news_lg", disable=['ner', 'parser', 'tagger'])

In [18]:
# read custom stopwords

# open list of custom stopwords
custom_stop_words= open("custom_stopwords.txt").read().split()

# add custom stopwords to model
for word in custom_stop_words:
  spacy_mod.Defaults.stop_words.add(word)

In [19]:
# convert corpus to language object
spacy_lang = []
for i, doc in tqdm(df.iterrows()): 
  spacy_lang.append(spacy_mod("".join(doc['text'])))

533it [00:06, 84.17it/s]


In [20]:
# preprocess: remove stopwords
spacy_cleaned = []
for doc in tqdm(spacy_lang): 
    spacy_cleaned.append(preprocessing.preprocess(doc, remove_ent=True))

100%|██████████| 533/533 [00:00<00:00, 6137.17it/s]


In [21]:
df["cleaned"]=spacy_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["cleaned"]=spacy_cleaned


### Test- Train Split

In [22]:
#split train, dev , test sets
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])

### Modelling

In [23]:
#Simple sklearn classifier based on TF-IDF:
def tfidf_classifier(random_state=12345, ngram_range=(1,4)):
   return Pipeline([
       ('vect', CountVectorizer(ngram_range=ngram_range)),
       ('tfidf', TfidfTransformer()),
       ('clf', SGDClassifier(
           loss='modified_huber', penalty='l2',
           alpha=1e-3, random_state=random_state,
           max_iter=5, tol=None, n_jobs=-1)),])

In [24]:
tfidf=tfidf_classifier()

In [25]:
tfidf.fit(df_train["text"],df_train["label"])

In [26]:
df_test["prediction"]=tfidf.predict(df_test["text"])
df_test[["label", "prediction"]]

In [29]:
print(f"Accuracy: ",(df_test["label"]==df_test["prediction"]).sum()/df_test.shape[0])

Accuracy:  0.8333333333333334
