In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
df = pd.read_json('ML-ESG-2_English_Train.json')
df.head()

Unnamed: 0,URL,news_title,news_content,impact_type
0,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,ESG-focused financial technology company Arabe...,Opportunity
1,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,The company also announced the appointment of ...,Opportunity
2,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,Wong said: \n“Personalised portfolios demand ...,Opportunity
3,https://www.esgtoday.com/ukraine-war-inflation...,"Ukraine War, Inflation Reduction Act Driving F...",One of the key themes of the report is the imp...,Opportunity
4,https://www.esgtoday.com/eu-regulators-welcome...,"EU Regulators Welcome, Critique New European S...",Europe’s three primary financial regulatory ag...,Opportunity


In [3]:
from newspaper import Article

def get_article_text(url):
    article = Article(url)
    article.download()
    article.parse()
    return article.text

In [4]:
# texts = df['URL'].apply(get_article_text)

In [5]:
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

scores = []
X = df['news_content']
y = df['impact_type'].map({'Opportunity': 0, 'Risk': 1}).values
for c in np.arange(2, 7):
    mdl = make_pipeline(
        TfidfVectorizer(max_features=1000, stop_words='english'),
        LogisticRegression(C=10**c)
    )

    scores.append(
        cross_validate(mdl, X, y, cv=5, n_jobs=5, scoring=['precision', 'recall', 'f1'])
    )

In [6]:
scores = pd.DataFrame(scores)
scores['precision'] = scores['test_precision'].apply(np.mean)
scores['recall'] = scores['test_recall'].apply(np.mean)
scores['f1'] = scores['test_f1'].apply(np.mean)

In [7]:
scores[['precision', 'recall', 'f1']].assign(c=np.arange(2, 7))

Unnamed: 0,precision,recall,f1,c
0,0.619102,0.324901,0.42003,2
1,0.575238,0.350988,0.432651,3
2,0.578808,0.368775,0.446323,4
3,0.55976,0.368775,0.441381,5
4,0.575934,0.368775,0.446446,6


# ChatGPT

In [14]:
import openai
import os

openai.api_key = os.getenv("OPENAI_API_KEY")

In [16]:
_TRAINING_SAMPLE_PROMPT_TEMPLATE = """
Sample input:
```{x}```

Sample target: {y}
"""
_TEST_SAMPLE_PROMPT_TEMPLATE = """
Input:
```{x}```

Target: 
"""

class GPTClassifier:
    def __init__(self, model="gpt-3.5-turbo", system_prompt="You are text classifier."):
        self.model = model
        self.system_prompt = system_prompt

    def fit(self, X, y):
        prompts = []
        for xt, yt in zip(X, y):
            prompts.append(
                _TRAINING_SAMPLE_PROMPT_TEMPLATE.format(x=xt, y=yt)
            )
        self.prompt_ = "\n".join(prompts)
        return self

    def _predict_single(self, x):
        prompt = self.prompt_ + _TEST_SAMPLE_PROMPT_TEMPLATE.format(x=x)

        response = openai.ChatCompletion.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt}
            ],
            max_tokens=64,
            temperature=0,
        )

        return response["choices"][0]["message"]["content"]
    
    def predict(self, X):
        return [self._predict_single(xt) for xt in X]

In [41]:
train = pd.concat([
    df.query('impact_type == "Risk"').sample(5, random_state=42),
    df.query('impact_type == "Opportunity"').sample(5, random_state=42)
]).sample(frac=1, replace=False)

test = pd.concat([
    df.query('impact_type == "Risk"').sample(10, random_state=1),
    df.query('impact_type == "Opportunity"').sample(40, random_state=1)
]).sample(frac=1, replace=False)

In [43]:
X_train, y_train = train['news_content'].values, train['impact_type'].values
X_test, y_test = test['news_content'].values, test['impact_type'].values

In [47]:
clf = GPTClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [48]:
from sklearn.metrics import classification_report

print(classification_report(
    pd.Series(y_test).map({'Opportunity': 0, 'Risk': 1}),
    pd.Series(y_pred).map({'Opportunity': 0, 'Risk': 1}),
))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89        40
           1       0.56      0.50      0.53        10

    accuracy                           0.82        50
   macro avg       0.72      0.70      0.71        50
weighted avg       0.81      0.82      0.82        50



# SBERT

In [83]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/sentence-t5-base')

In [84]:
content_embeddings = model.encode(df['news_content'].values)

In [85]:
df_en = pd.read_csv('fr_en.csv')

In [86]:
fr_embeddings = model.encode(df_en['news_content'].values)

In [127]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold 

X = content_embeddings
y = df['impact_type'].map({'Opportunity': 0, 'Risk': 1}).values
X_fr = fr_embeddings
y_fr = df_en['impact_type'].map({'Opportunity': 0, 'Risk': 1}).values

kfold = KFold(shuffle=True, random_state=42)
f1s = []
for i, (train_index, test_index) in enumerate(kfold.split(X)):
    clf = LogisticRegression(C=15, class_weight={1: 5})
    # clf.fit(
    #     np.concatenate([X[train_index], X_fr]),
    #     np.concatenate([y[train_index], y_fr])
    # )
    clf.fit(X[train_index], y[train_index])
    y_pred = clf.predict(X[test_index])
    f1s.append(f1_score(y[test_index], y_pred))
f1s = np.array(f1s)

In [128]:
print(f"{f1s.mean():.2%} ({f1s.std():.2%})")

78.95% (6.54%)
