<a href="https://colab.research.google.com/github/blue-create/langlens/blob/main/models/stepwise_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Stepwise Classificaiton of Labels
1. DV vs. nonDV
2. problematic vs. non-problematic
3. category

### Imports

In [3]:
%%capture
!pip install transformers==4.20.0

In [4]:
# PACKAGES
import pandas as pd
import os
import numpy as np
import json
import matplotlib.pyplot as plt
import plotly.express as px
from transformers import AutoTokenizer
from sklearn.manifold import TSNE
# MODELLING
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier

from scipy.stats import randint

In [5]:
# connect with google drive
from google.colab import drive
drive.mount('/content/drive')
# change cwd
%cd drive/MyDrive/Work/Frontline/data/


Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


In [6]:
# CUSTOM PACKAGES
from scripts import annotations

### Exporting annotations from Elinor

In [7]:
# list of dfs with all annotated datasets
dfs={}
for doc in os.listdir("annotated/new_ontology"):
  if doc.endswith(".json"):
    #read json data
    json_data=json.load(open("annotated/new_ontology/"+doc, encoding="utf-8"))
    #convert to dataframe
    data=pd.DataFrame(json_data["documents"])
    data.loc[:,"file"]=doc
    dfs[doc]=data

data=pd.concat(dfs,ignore_index=True)

Extract:artikel_id, titel, annotations


In [8]:
data.loc[:,"artikel_id"]=data.attributes_flat.apply(lambda x: x["artikel_id"])
data.loc[:,"titel"]=data.attributes_flat.apply(lambda x: x["titel"])
data.loc[:,"annotations"]=data.loc[:,"annotations"].apply(annotations.extract_annotations)


### 1."Domestic Violence" vs. not "Domestic Violence"

Binary Label: Other/ Domestic Violence

In [9]:
data.loc[:,"label"]=data.annotations.apply(lambda x: "Other" if x=={} else "Domestic Violence")

In [10]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/249k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/474k [00:00<?, ?B/s]

In [11]:
tokens=data.text.apply(lambda x: tokenizer(x,padding="max_length")["input_ids"])

In [12]:
cat = {'Other': 1,'Domestic Violence': 0}

In [13]:
y= [cat[item] for item in data.label]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(tokens, y, test_size=0.20)
X_train = X_train.apply(pd.Series).to_numpy()
X_test = X_test.apply(pd.Series).to_numpy()

KNN

In [17]:
# Use the KNN classifier to fit data:
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

# Predict y data with classifier:
y_predict = classifier.predict(X_test)

# Print results:
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict))

[[ 22 151]
 [ 71 356]]
              precision    recall  f1-score   support

           0       0.24      0.13      0.17       173
           1       0.70      0.83      0.76       427

    accuracy                           0.63       600
   macro avg       0.47      0.48      0.46       600
weighted avg       0.57      0.63      0.59       600



Random Forest

In [19]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [20]:
confusion_matrix(y_test, y_pred)

array([[  3, 170],
       [  3, 424]])

GradientBoostingTree

In [22]:
clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.01, max_depth=10, random_state=0).fit(X_train, y_train)
y_pred=clf.predict(X_test)

In [23]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[  9 164]
 [ 10 417]]
              precision    recall  f1-score   support

           0       0.47      0.05      0.09       173
           1       0.72      0.98      0.83       427

    accuracy                           0.71       600
   macro avg       0.60      0.51      0.46       600
weighted avg       0.65      0.71      0.62       600



### 2.Problematic vs. nonProblematic Articles

In [27]:
data_dv=data[data.annotations.apply(len)!=0]

In [None]:
data_dv.loc[:,"label"]=data_dv.annotations.apply(lambda x: "DV" if x["K"]=={"Domestic Violence"} else "Problematic" )
data_dv.label.value_counts()

In [33]:
tokens=data_dv.text.apply(lambda x: tokenizer(x,padding="max_length")["input_ids"])

In [34]:
cat = {'DV': 1,'Problematic': 0}

In [36]:
y= [cat[item] for item in data_dv.label]

In [37]:
X_train, X_test, y_train, y_test = train_test_split(tokens, y, test_size=0.20)
X_train = X_train.apply(pd.Series).to_numpy()
X_test = X_test.apply(pd.Series).to_numpy()

KNN

In [48]:
# Use the KNN classifier to fit data:
classifier = KNeighborsClassifier(n_neighbors=4)
classifier.fit(X_train, y_train)

# Predict y data with classifier:
y_predict = classifier.predict(X_test)

# Print results:
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict))

[[  2  24]
 [  9 140]]
              precision    recall  f1-score   support

           0       0.18      0.08      0.11        26
           1       0.85      0.94      0.89       149

    accuracy                           0.81       175
   macro avg       0.52      0.51      0.50       175
weighted avg       0.75      0.81      0.78       175



Random Forest

In [39]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [40]:
confusion_matrix(y_test, y_pred)

array([[  2,  24],
       [  0, 149]])

GradientBoostingTree

In [41]:
clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.01, max_depth=10, random_state=0).fit(X_train, y_train)
y_pred=clf.predict(X_test)

In [42]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[  2  24]
 [  1 148]]
              precision    recall  f1-score   support

           0       0.67      0.08      0.14        26
           1       0.86      0.99      0.92       149

    accuracy                           0.86       175
   macro avg       0.76      0.54      0.53       175
weighted avg       0.83      0.86      0.81       175



### 3. Focus Categories

In [70]:
data_cat=data_dv[data_dv.label=="Problematic"]

In [None]:
data_cat.loc[:,"label"]=data_cat.annotations.apply(lambda x: list(x["K"]))

In [None]:

data_cat["label"]=["*".join(i) for i in data_cat.label]

In [74]:
y=data_cat.label.str.get_dummies(sep="*")

In [67]:
tokens=data_cat.text.apply(lambda x: tokenizer(x,padding="max_length")["input_ids"])

In [76]:
X_train, X_test, y_train, y_test = train_test_split(tokens, y, test_size=0.20)
X_train = X_train.apply(pd.Series).to_numpy()
X_test = X_test.apply(pd.Series).to_numpy()

KNN

In [77]:
# Use the KNN classifier to fit data:
classifier = KNeighborsClassifier(n_neighbors=4)
classifier.fit(X_train, y_train)

# Predict y data with classifier:
y_predict = classifier.predict(X_test)

# Print results:
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict))

ValueError: ignored

Random Forest

In [None]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
confusion_matrix(y_test, y_pred)

array([[  2,  24],
       [  0, 149]])

GradientBoostingTree

In [None]:
clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.01, max_depth=10, random_state=0).fit(X_train, y_train)
y_pred=clf.predict(X_test)

In [None]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[  2  24]
 [  1 148]]
              precision    recall  f1-score   support

           0       0.67      0.08      0.14        26
           1       0.86      0.99      0.92       149

    accuracy                           0.86       175
   macro avg       0.76      0.54      0.53       175
weighted avg       0.83      0.86      0.81       175

