<a href="https://colab.research.google.com/github/brunokrp/ai-protectionism/blob/main/model_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CLASSIFICATION MODEL**

## **SETTING UP ENVIRONMENT**

#### **Installing libraries, importing them, connecting to Google Drive and to Hugging Face**

In [None]:
# Installing libraries
!pip install transformers datasets evaluate accelerate
!pip install mapclassify
# shap is required by the SHAP Values section at the end of the notebook
!pip install shap

In [None]:
# Importing libraries
from pathlib import Path

import pandas as pd
import string

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

import sklearn as skl
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import seaborn
import geopandas as gpd
import numpy as np

from datasets import Dataset, DatasetDict

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

from mapclassify import classify
import geopandas

In [None]:
# Logging in Hugging Face
# An authenticated session is needed because the training cell configures the
# Trainer with push_to_hub=True.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Connecting to Google Drive

from google.colab import drive
drive.mount('/content/drive')

# Replace the bracketed placeholder with the Drive folder that holds the input CSV:
# the read_csv call in the next section uses a path relative to this directory.
%cd /content/drive/MyDrive/"[Folder path here]"

## **PREPROCESSING DATA**

In [None]:
# Loading GTA dataset with descriptions
# The CSV column named "Unnamed: 0" is used as the DataFrame index.
df = pd.read_csv("interventions_with_descriptions_total.csv", index_col="Unnamed: 0")

In [None]:
# Creating copy to increase code reliability
# Later cells add columns to the copy, so the frame loaded from disk stays untouched.
df_categories = df.copy()

In [None]:
# Transforming labels from string to numbers
# Here I consider Green and Amber interventions as "not protectionist" and Red as "protectionist".
def protectionist_label(description):
  """Map a GTA evaluation value to a binary protectionism label.

  Returns 0 when the value is exactly 'Green' or 'Amber' and 1 for any other
  value (this includes 'Red', differently-cased strings and missing values).
  """
  return 0 if description in ('Green', 'Amber') else 1

# Deriving the binary label element-wise from the GTA evaluation column
df_categories['label'] = df_categories['Gta Evaluation'].map(protectionist_label)

In [None]:
# Checking distribution of labels
# 0 = not protectionist (Green/Amber), 1 = protectionist (any other evaluation value)
df_categories['label'].value_counts()

## **MODEL TRAINING**

#### **Splitting dataset into test and training data**

In [None]:
# Keeping only the description and label columns of the base dataset
df_categories_clean = df_categories.loc[:, ['Description', 'label']]

In [None]:
# Splitting dataset into train and test (33% of the rows held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    df_categories_clean['Description'],
    df_categories_clean['label'],
    test_size=0.33,
    random_state=42,
)

In [None]:
# Creating a unified training dataset, with descriptions and label values
# X_train and y_train come out of train_test_split in the same row order, so they
# are paired by position on a fresh 0..n-1 index. Joining on the original index
# instead would multiply rows whenever the index read from the CSV contains
# repeated values, pairing descriptions with labels of other rows.
df_train_x = pd.DataFrame(X_train).reset_index(drop=True)
df_train_y = pd.DataFrame(y_train).reset_index(drop=True)
df_train = pd.concat([df_train_x, df_train_y], axis=1)

In [None]:
# Creating a unified testing dataset, with descriptions and label values
# X_test and y_test come out of train_test_split in the same row order, so they
# are paired by position on a fresh 0..n-1 index. Joining on the original index
# instead would multiply rows whenever the index read from the CSV contains
# repeated values, pairing descriptions with labels of other rows.
df_test_x = pd.DataFrame(X_test).reset_index(drop=True)
df_test_y = pd.DataFrame(y_test).reset_index(drop=True)
df_test = pd.concat([df_test_x, df_test_y], axis=1)

In [None]:
# Wrapping both DataFrames as Hugging Face Datasets, which is necessary for training
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_test)
)

# Bundling the two splits into a DatasetDict, which is also necessary for training
splits = {'train': train_dataset, 'test': test_dataset}
dataset_dict = DatasetDict(splits)

#### **Training**
The following training code is based on https://huggingface.co/docs/transformers/en/tasks/sequence_classification

*The classification model cannot be trained in the basic version of Google Colab due to RAM restrictions. I ran it using the PRO version, with a T4 GPU and high-RAM enabled*.

In [None]:
# Creating preprocessing function
def preprocess_function(examples):
    """Tokenize the 'Description' field of a batch with truncation enabled."""
    descriptions = examples['Description']
    return tokenizer(descriptions, truncation=True)

# Applying the tokenizer to both splits in batches
tokenized_df = dataset_dict.map(preprocess_function, batched=True)

In [None]:
# Importing data collator and creating a batch of examples
# The collator is built around the same tokenizer and is handed to the Trainer below.
# NOTE(review): presumably this pads each batch at collation time — see the
# DataCollatorWithPadding documentation.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Importing F1 scoring evaluator
# The loaded metric object is called by compute_metrics at every evaluation.
import evaluate
f1_score = evaluate.load("f1")

In [None]:
# Creating function to compute metrics during training
def compute_metrics(eval_pred):
    """Compute the F1 metric for one evaluation pass.

    eval_pred unpacks into (model outputs, reference labels); the predicted
    class of each row is the position of its largest output along axis 1.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return f1_score.compute(predictions=predicted_ids, references=references)

In [None]:
# Mapping ID to label and label to ID. POSITIVE is mapped to 1, which refers to the protectionist class
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
# The reverse lookup is derived from id2label so the two mappings cannot drift apart
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Importing pretrained DistilBERT model
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Loading the same checkpoint as the tokenizer, configured for two output labels
# and carrying the NEGATIVE/POSITIVE name mappings defined in the previous cell.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
    )

In [None]:
# Defining training parameters and running trainer
# Evaluation and checkpoint saving both run once per epoch, and the best
# checkpoint is reloaded when training ends (load_best_model_at_end=True).
# NOTE(review): no metric_for_best_model is given, so "best" presumably falls back
# to the library default rather than the F1 computed by compute_metrics — confirm.
training_args = TrainingArguments(
    output_dir="classification_model_protectionism",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Requires the Hugging Face login performed in the setup section
    push_to_hub=True
)

# NOTE(review): the "test" split doubles as the evaluation set used for checkpoint
# selection, so scores reported on it are not from data unseen during model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_df["train"],
    eval_dataset=tokenized_df["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

## **INFERENCE AND RESULTS**

#### **Loading and preprocessing DPA dataset**

In [None]:
# Loading dataset with digital policies (DPA)
%cd /content/drive/MyDrive/"[Folder path with DPA descriptions]"
digital_policies_df = pd.read_csv("digital_policies.csv")

# Dropping duplicate interventions
digital_policies_df.drop_duplicates(subset=['Latest Event Description'], inplace=True)

# Creating dataframe only with AI-related interventions
ai_policies_df = digital_policies_df[(digital_policies_df['Economic Activities'].str.contains('ML and AI development|Semiconductors|cloud computing') == True)].reset_index()

# Exploding dataset to contain observations on the intervention-country level
ai_policies_df_per_country = ai_policies_df.assign(countries=ai_policies_df['Implementing Countries'].str.split(', ')).explode('countries').reset_index(drop=True)

# Loading dataset countries and regions
cc_df = pd.read_csv('country_continent.csv')
cc_df = cc_df.rename(columns={'country':'countries'})

# Getting back to folder with model
%cd /content/drive/MyDrive/"AI INDUSTRIAL POLICY"/"SIPA_TEXT"/"GTA-MINING"

#### **Inference**

In [None]:
# Loading trained model
# NOTE(review): presumably the checkpoint pushed by the training section — the
# repository name matches its output_dir; confirm the namespace.
from transformers import pipeline
classifier = pipeline("text-classification", model="brunokrp/classification_model_protectionism")

In [None]:
# Making predictions for each intervention description
# truncation=True caps every input at the model's maximum sequence length,
# mirroring the truncation applied when tokenizing the training data. Without it,
# a description longer than that limit makes the model call fail.
predictions = [
    classifier(text, truncation=True)
    for text in ai_policies_df_per_country["Latest Event Description"]
]

# Creating a dataframe with all predictions (contains the label and the probability)
# Each pipeline call on a single text returns a one-element list, hence item[0].
prediction_df = pd.DataFrame([item[0] for item in predictions])

In [None]:
# Joining base dataframe with prediction dataframe, then merging it with country/region dataframe
ai_policies_df_per_country_with_labels = ai_policies_df_per_country.join(prediction_df)
ai_policies_df_per_country_with_labels = pd.merge(ai_policies_df_per_country_with_labels,cc_df,on='countries').drop_duplicates()

# Replacing string labels with int labels
# Series.map builds the numeric column directly. Series.replace only yields a
# numeric column through implicit downcasting of the object column, which pandas
# has deprecated; without it the column stays object-typed and the mean below fails.
ai_policies_df_per_country_with_labels["label"] = ai_policies_df_per_country_with_labels["label"].map({'NEGATIVE': 0, 'POSITIVE': 1})

# Because each intervention affects more than one country in each region,
# Dropping duplicates to keep only unique intervention-region observations
ai_policies_df_per_country_with_labels = ai_policies_df_per_country_with_labels.drop_duplicates(subset=["Policy Change ID", "region_1"])

# Keeping just two columns to facilitate visualization
policies_per_region_label = ai_policies_df_per_country_with_labels[["region_1", "label"]]

# Grouping dataframe by region
# Getting average of label values (share of interventions labelled 1)
# Getting count of interventions
policies_per_region_label = policies_per_region_label.groupby(['region_1']).agg(['count','mean']).reset_index()

# Removing Multi-Index created by the groupby function using agg
# Joining each (top, sub) pair with "_" gives 'label_count' and 'label_mean'; the
# region column has an empty sub-level, so it becomes 'region_1_' (trailing underscore).
policies_per_region_label.columns = ['_'.join(col) for col in policies_per_region_label.columns]

#### **Plotting on a map**

In [None]:
# Getting shapefile from world regions [data extracted from https://hub.arcgis.com/datasets/a79a3e4dc55343b08543b1b6133bfb90/explore]
%cd /content/drive/MyDrive/"[Folder with world regions data]"
gdf = gpd.read_file("World_Regions.shp")

# Renaming columns to facilitate join and replacing two regions values to ensure compatibility between dataframes
gdf = gdf.rename(columns={'REGION':'region_1_'})
gdf.region_1_.replace("Australia/New Zealand", "Australia and New Zealand", inplace=True)
gdf.region_1_.replace("Southeastern Asia", "South-eastern Asia", inplace=True)

In [None]:
# Attaching the region geometries to the per-region prediction summary
policies_per_region_shapefile = policies_per_region_label.merge(gdf, on='region_1_')

# Shifting every regional mean by 0.001 to differentiate between regions without policies and regions with no protectionist policies
policies_per_region_shapefile["label_mean"] += 0.001

# Wrapping the merged frame as a GeoDataFrame so it can be drawn on a map
gdf_policies = geopandas.GeoDataFrame(policies_per_region_shapefile)

In [None]:
# Defining map parameters
# Regions are coloured by 'label_mean' (the per-region average of the 0/1 labels,
# offset by 0.001 in the previous cell); the tooltip also shows the region name
# and the number of interventions counted for it.

m = gdf_policies.explore(
  column = 'label_mean',
  tooltip = ['region_1_', 'label_mean', 'label_count'],
  cmap = 'YlOrRd',
  legend = True,
  popup = True
)

In [None]:
# Plotting map
# A bare expression on the last line of a cell makes the notebook render the map object.
m

#### **SHAP Values**
Based on SHAP documentation: https://shap.readthedocs.io/en/latest/example_notebooks/text_examples/sentiment_analysis/Positive%20vs.%20Negative%20Sentiment%20Classification.html

In [None]:
# Creating dataframe only with interventions classified as protectionist with high probability
is_confident = ai_policies_df_per_country_with_labels["score"] > 0.999
is_protectionist = ai_policies_df_per_country_with_labels["label"] == 1
# One row per distinct description, so the same text is not explained twice
positive_policies = (
    ai_policies_df_per_country_with_labels[is_confident & is_protectionist]
    .drop_duplicates(subset=["Latest Event Description"])
)

In [None]:
# shap is not imported anywhere else in the notebook; it must be installed first
# (see the library installation cell at the top).
import shap

# Loading classification model
classifier = pipeline("text-classification", model="brunokrp/classification_model_protectionism")

# Loading explainer model
explainer = shap.Explainer(classifier)

# Getting shap values from a random sample of up to 50 interventions
# Chose a sample because this process is time and computing intensive
# The sample size is capped at the number of available rows (sampling more rows
# than exist raises an error) and the seed is fixed so re-runs explain the same texts.
descriptions = positive_policies["Latest Event Description"]
sample_size = min(50, len(descriptions))
shap_values = explainer(descriptions.sample(n=sample_size, random_state=42).to_list())

In [None]:
# Plotting text explainer for the sample interventions
# shap_values holds the explanations computed for the sampled descriptions in the previous cell.
shap.plots.text(shap_values)