<a href="https://colab.research.google.com/github/brunokrp/ai-protectionism/blob/main/model_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Identifying protectionism through a text classification model**

## **SETTING UP ENVIRONMENT**

In [None]:
# Installing libraries
!pip install transformers datasets evaluate accelerate
!pip install mapclassify

In [None]:
# Importing libraries
from pathlib import Path

import pandas as pd
import string

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

import sklearn as skl
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import seaborn
import geopandas as gpd
import numpy as np

from datasets import Dataset, DatasetDict

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

import folium
from shapely.geometry import Point, Polygon
from folium.plugins import HeatMap
from mapclassify import classify
import geopandas

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# The training arguments further down set push_to_hub=True, which presumably
# needs these credentials.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Mount Google Drive at /content/drive; the data files are read from there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch to the GTA-MINING folder; the relative file names below are resolved against it.
%cd /content/drive/MyDrive/"AI INDUSTRIAL POLICY"/"SIPA_TEXT"/"GTA-MINING"

## **PREPROCESSING DATA**

In [None]:
# Load the interventions table; the CSV's "Unnamed: 0" column becomes the index.
df = pd.read_csv("interventions_with_descriptions_total.csv", index_col="Unnamed: 0")

In [None]:
# Work on a copy so the frame loaded from disk is left untouched.
df_categories = df.copy()

In [None]:
def protectionist_label(description):
    """Turn a 'Gta Evaluation' value into a binary label.

    Args:
        description: The evaluation value of one intervention.

    Returns:
        0 when the value is exactly 'Green' or 'Amber'; 1 for anything else
        (other strings, differently-cased strings, and missing values).
    """
    non_protectionist = ('Green', 'Amber')
    return 0 if description in non_protectionist else 1

In [None]:
# Derive the binary target column from 'Gta Evaluation', one value at a time.
df_categories['label'] = df_categories['Gta Evaluation'].apply(protectionist_label)

In [None]:
# Class balance of the derived binary label.
df_categories['label'].value_counts()

In [None]:
# Distribution of the original 'Gta Evaluation' values, to compare against the binary label.
df_categories['Gta Evaluation'].value_counts()

## **MODEL TRAINING**

### Test and training data

In [None]:
# Keep only the model input text ('Description') and the target ('label').
df_categories_clean = df_categories[['Description', 'label']]

In [None]:
# Hold out 33% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified on 'label' — consider `stratify=` if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(df_categories_clean['Description'], df_categories_clean['label'], test_size=0.33, random_state=42)

In [None]:
# Rebuild a single training frame from the text and label halves of the split.
# X_train and y_train come from the same split call and are row-aligned, so they
# are glued together by position. An index-based join would duplicate rows if
# the original CSV index contained repeated values.
df_train_x = pd.DataFrame(X_train)
df_train_y = pd.DataFrame(y_train)
df_train = pd.concat(
    [df_train_x.reset_index(drop=True), df_train_y.reset_index(drop=True)],
    axis=1,
)

In [None]:
# Rebuild a single test frame from the text and label halves of the split.
# X_test and y_test come from the same split call and are row-aligned, so they
# are glued together by position. An index-based join would duplicate rows if
# the original CSV index contained repeated values.
df_test_x = pd.DataFrame(X_test)
df_test_y = pd.DataFrame(y_test)
df_test = pd.concat(
    [df_test_x.reset_index(drop=True), df_test_y.reset_index(drop=True)],
    axis=1,
)

In [None]:
# Wrap each pandas split in a Hugging Face Dataset.
train_dataset, test_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (df_train, df_test)
)

# Bundle both splits under the keys used later on ('train' and 'test').
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

### Training

In [None]:
def preprocess_function(examples):
    """Tokenize the 'Description' field of a batch of examples.

    Uses the module-level `tokenizer` with truncation enabled.

    Args:
        examples: Mapping that exposes the texts under the 'Description' key.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    descriptions = examples['Description']
    return tokenizer(descriptions, truncation=True)

In [None]:
# Apply preprocess_function to every split of the DatasetDict, in batches.
tokenized_df = dataset_dict.map(preprocess_function, batched=True)

In [None]:
# Collator built around the tokenizer; it is handed to the Trainer below as `data_collator`.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Load the "f1" metric from the `evaluate` library; compute_metrics calls it after each evaluation.
import evaluate
f1_score = evaluate.load("f1")

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `f1_score` metric.

    Args:
        eval_pred: A pair that unpacks into (predictions, labels). The
            predictions are reduced with argmax over axis 1, so they are
            presumably per-class scores with one row per example.

    Returns:
        The result of `f1_score.compute` for the arg-max predictions against
        the reference labels.
    """
    class_scores, references = eval_pred
    predicted_classes = np.argmax(class_scores, axis=1)
    return f1_score.compute(predictions=predicted_classes, references=references)

In [None]:
# Class index -> display name, and its inverse. Both are passed to the model config below.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Load the same DistilBERT checkpoint as the tokenizer, configured for
# two-class sequence classification with the label mappings defined above.
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

In [None]:
# Hyper-parameters plus checkpointing / Hub settings for fine-tuning.
training_args = TrainingArguments(
    output_dir="classification_model_protectionism",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same per-epoch schedule.
    eval_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): no metric_for_best_model is given; confirm which metric selects
    # the "best" checkpoint (compute_metrics only reports F1).
    load_best_model_at_end=True,
    push_to_hub=True
)

# NOTE(review): the 'test' split is used as the evaluation set, so it also drives
# checkpoint selection; there is no separate validation split in this notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_df["train"],
    eval_dataset=tokenized_df["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop.
trainer.train()

### Loading GPA dataset

In [None]:
%cd /content/drive/MyDrive/"AI INDUSTRIAL POLICY"/"SIPA_TEXT"

digital_policies_df = pd.read_csv("digital_policies.csv")
digital_policies_df.drop_duplicates(subset=['Latest Event Description'], inplace=True)
ai_policies_df = digital_policies_df[(digital_policies_df['Economic Activities'].str.contains('ML and AI development|Semiconductors|cloud computing') == True)].reset_index()
ai_policies_df_per_country = ai_policies_df.assign(countries=ai_policies_df['Implementing Countries'].str.split(', ')).explode('countries').reset_index(drop=True)

cc_df = pd.read_csv('country_continent.csv')
cc_df = cc_df.rename(columns={'country':'countries'})

%cd /content/drive/MyDrive/"AI INDUSTRIAL POLICY"/"SIPA_TEXT"/"GTA-MINING"

## **INFERENCE AND RESULTS**

### Inference

In [None]:
# Build a text-classification pipeline from the model repository on the Hugging Face Hub.
from transformers import pipeline

classifier = pipeline("text-classification", model="brunokrp/classification_model_protectionism")

In [None]:
# Classify every event description, one call per text, so each entry of
# `predictions` stays a one-element list holding a {'label', 'score'} dict.
# truncation=True mirrors the tokenization used for training: without it, a
# description longer than the model's maximum sequence length makes the call fail.
predictions = [
    classifier(text, truncation=True)
    for text in ai_policies_df_per_country["Latest Event Description"]
]

In [None]:
# One row per prediction; rows line up with ai_policies_df_per_country by position.
prediction_df = pd.DataFrame([item[0] for item in predictions])
ai_policies_df_per_country_with_labels = ai_policies_df_per_country.join(prediction_df)

# Attach the country lookup columns (inner merge on 'countries'), then drop exact duplicate rows.
ai_policies_df_per_country_with_labels = pd.merge(ai_policies_df_per_country_with_labels, cc_df, on='countries').drop_duplicates()

# Turn the string labels into 0/1 with `map`, which yields a numeric column directly.
# `replace` with a str->int dict relies on implicit object downcasting, which
# recent pandas versions deprecate, and the mean below needs a numeric dtype.
ai_policies_df_per_country_with_labels["label"] = ai_policies_df_per_country_with_labels["label"].map({'NEGATIVE': 0, 'POSITIVE': 1})

# Count each policy change once per region.
ai_policies_df_per_country_with_labels = ai_policies_df_per_country_with_labels.drop_duplicates(subset=["Policy Change ID", "region_1"])

# Per region: number of policies and share labelled 1.
policies_per_region_label = ai_policies_df_per_country_with_labels[["region_1", "label"]]
policies_per_region_label = policies_per_region_label.groupby(['region_1']).agg(['count','mean']).reset_index()
# Flatten the two-level column index: ('region_1', '') -> 'region_1_', ('label', 'mean') -> 'label_mean', ...
policies_per_region_label.columns = ['_'.join(col) for col in policies_per_region_label.columns]

### Plotting on a map

In [None]:
%cd /content/drive/MyDrive/"AI INDUSTRIAL POLICY"/"SIPA_TEXT"/"GTA-MINING"/"World_Regions"
gdf = gpd.read_file("World_Regions.shp")
gdf = gdf.rename(columns={'REGION':'region_1_'})

gdf.region_1_.replace("Australia/New Zealand", "Australia and New Zealand", inplace=True)
gdf.region_1_.replace("Southeastern Asia", "South-eastern Asia", inplace=True)

In [None]:
# Attach the region geometries to the per-region statistics (inner merge on the region name).
policies_per_region_shapefile = policies_per_region_label.merge(gdf, on='region_1_')
# Shift every mean by a small constant.
# NOTE(review): presumably so that regions with a mean of exactly 0 still get a colour — confirm.
policies_per_region_shapefile["label_mean"] += 0.001
# The merge result is wrapped in a GeoDataFrame so it can be mapped.
gdf_policies = gpd.GeoDataFrame(policies_per_region_shapefile)

In [None]:
# Interactive choropleth: regions are coloured by 'label_mean', and the tooltip shows
# the region name, the mean and the number of policies behind it.
m = gdf_policies.explore(
  column = 'label_mean',
  tooltip = ['region_1_', 'label_mean', 'label_count'],
  cmap = 'YlOrRd',
  legend = True,
  popup = True
)

In [None]:
# Display the map.
m

In [None]:
# Show the per-region table behind the map.
policies_per_region_label

### SHAP values

In [None]:
# Peek at the first three event descriptions.
ai_policies_df_per_country["Latest Event Description"].iloc[0:3].to_list()

In [None]:
# Re-create the same text-classification pipeline as in the Inference section.
# NOTE(review): SHAP's text-classification examples build the pipeline so that it returns
# scores for every class; confirm whether that is needed for the "POSITIVE" plots below.
classifier = pipeline("text-classification", model="brunokrp/classification_model_protectionism")

In [None]:
# `shap` is not imported in the notebook's import cell, so it is imported here.
import shap

# High-confidence positive predictions, one row per distinct description.
# Defined here because this cell runs before the later cell that assigns
# `positive_policies`; without it a fresh top-to-bottom run stops with a NameError.
positive_policies = ai_policies_df_per_country_with_labels[
    (ai_policies_df_per_country_with_labels["score"] > 0.999)
    & (ai_policies_df_per_country_with_labels["label"] == 1)
].drop_duplicates(subset=["Latest Event Description"])

explainer = shap.Explainer(classifier)

# Explain at most 50 descriptions. The sample size is capped at the number of
# available rows and the draw is seeded, so the plots can be reproduced.
shap_sample = (
    positive_policies["Latest Event Description"]
    .sample(n=min(50, len(positive_policies)), random_state=42)
    .to_list()
)
shap_values = explainer(shap_sample)

In [None]:
# Text plot of the SHAP values for every explained description.
shap.plots.text(shap_values)

In [None]:
# Bar plot of the "POSITIVE" SHAP values averaged over axis 0 (presumably the explained samples).
shap.plots.bar(shap_values[:, :, "POSITIVE"].mean(0))

In [None]:
# Same bar plot, with the bar order taken from shap.Explanation.argsort.
shap.plots.bar(shap_values[:, :, "POSITIVE"].mean(0), order=shap.Explanation.argsort)

In [None]:
# NOTE(review): identical to the call in the previous cell — presumably a leftover duplicate.
shap.plots.bar(shap_values[:, :, "POSITIVE"].mean(0), order=shap.Explanation.argsort)

In [None]:
# Text plot for a single explained description (position 1 of the sample).
shap.plots.text(shap_values[1])

### Getting examples of each label

In [None]:
# Descriptions of every policy whose label is 1.
ai_policies_df_per_country_with_labels.loc[
    ai_policies_df_per_country_with_labels["label"] == 1,
    "Latest Event Description",
]

In [None]:
# Full text of one example, looked up by index label.
# NOTE(review): 7596 is a hard-coded label; it only exists if the data and the
# de-duplication steps above are unchanged.
ai_policies_df_per_country_with_labels.loc[7596, "Latest Event Description"]

In [None]:
# Full text of another example, looked up by index label.
# NOTE(review): 679 is a hard-coded label; it only exists if the data and the
# de-duplication steps above are unchanged.
ai_policies_df_per_country_with_labels.loc[679, "Latest Event Description"]

In [None]:
# Policies labelled 1 with a score above 0.999, one row per distinct description.
positive_policies = (
    ai_policies_df_per_country_with_labels
    .loc[lambda frame: (frame["score"] > 0.999) & (frame["label"] == 1)]
    .drop_duplicates(subset=["Latest Event Description"])
)

### Testing

In [None]:
# Rebuild the raw prediction join step by step.
# NOTE(review): this re-assigns ai_policies_df_per_country_with_labels, replacing the
# merged and de-duplicated frame built in the Inference section with the plain join.
prediction_df = pd.DataFrame([item[0] for item in predictions])
ai_policies_df_per_country_with_labels = ai_policies_df_per_country.join(prediction_df)

In [None]:
# Attach the country lookup columns (inner merge on 'countries') and drop exact duplicate rows.
ai_policies_df_per_country_with_labels_2 = (
    ai_policies_df_per_country_with_labels
    .merge(cc_df, on='countries')
    .drop_duplicates()
)

In [None]:
# Turn the string labels into 0/1 with `map`, which yields a numeric column directly.
# `replace` with a str->int dict relies on implicit object downcasting, which
# recent pandas versions deprecate.
ai_policies_df_per_country_with_labels_2["label"] = ai_policies_df_per_country_with_labels_2["label"].map({'NEGATIVE': 0, 'POSITIVE': 1})

In [None]:
# Preview the frame with each policy change counted once per region (the result is not stored).
ai_policies_df_per_country_with_labels_2.drop_duplicates(subset=["Policy Change ID", "region_1"]).head()

In [None]:
# Aggregate from the `_2` frame. In this section the un-suffixed frame is only the raw
# prediction join: its 'label' column still holds the strings 'NEGATIVE'/'POSITIVE'
# (the numeric conversion above was applied to `_2`), and it has not been merged with
# cc_df, so it presumably has no 'region_1' column either.
# The same per-region de-duplication as in the Inference section is applied, and the
# label is cast to int so that the mean is computed on a numeric column.
policies_per_region_label = (
    ai_policies_df_per_country_with_labels_2
    .drop_duplicates(subset=["Policy Change ID", "region_1"])
    [["region_1", "label"]]
    .astype({"label": int})
)
policies_per_region_label = policies_per_region_label.groupby(['region_1']).agg(['count','mean']).reset_index()
# Flatten the two-level column index: ('region_1', '') -> 'region_1_', ('label', 'mean') -> 'label_mean', ...
policies_per_region_label.columns = ['_'.join(col) for col in policies_per_region_label.columns]