In [1]:
#@title Installs
# Dependency setup cell: each line shells out to pip; --quiet reduces pip's console output.
# NOTE(review): no versions are pinned, so a fresh run may install different releases
# than the ones that produced the outputs saved in this notebook.

!pip install pydot --quiet
!pip install gensim --quiet
!pip install tensorflow-datasets --quiet
# -U asks pip to upgrade tensorflow-text if a copy is already installed.
!pip install -U tensorflow-text --quiet
!pip install transformers --quiet

In [2]:
#!pip install tensorflow_text

import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
#import tensorflow_text as tf_text
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, TFBertModel
from transformers import logging
logging.set_verbosity_error()

from sklearn.feature_extraction.text import TfidfVectorizer

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import re

import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

In [3]:
import pandas as pd

# Input CSV files, read in this order.
file_paths = [
    '/content/AM.csv',
    '/content/CF.csv',
    '/content/DB.csv',
    '/content/IB.csv',
]

# NOTE: despite its name, `df` is a list holding one DataFrame per entry in `file_paths`.
df = list(map(pd.read_csv, file_paths))

# Stack the frames row-wise and renumber the index from 0.
merged = pd.concat(df, ignore_index=True)

print(merged)

       Label                                      Risk Sentence
0          0  risk factor risk factor normal course business...
1          0  following discussion material risk factor appl...
2          0  additional information risk management framewo...
3          0  additional risk beyond described management di...
4          0  strategic risk consummation planned acquisitio...
...      ...                                                ...
25532      2  dilution reduction delay accretion schwabs ear...
25533      2  pending merger td ameritrade completed stockho...
25534      2  proposed merger completed issue td ameritrade ...
25535      2  result issuance share common stock stockholder...
25536      2        addition td bank become largest stockholder

[25537 rows x 2 columns]


In [4]:
# Cast the label column to int, then tally how many rows fall in each class.
label_as_int = merged['Label'].astype(int)
merged['Label'] = label_as_int
label_counts = label_as_int.value_counts()
print(label_counts)

2    7020
3    6610
0    6325
1    5582
Name: Label, dtype: int64


In [5]:
# Download NLTK's 'word2vec_sample' data package.
nltk.download('word2vec_sample')

# Locate the pruned word2vec text file inside the NLTK data directory and keep its path as a string.
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))

# Load the vectors from the text-format file (binary=False) as gensim KeyedVectors.
# NOTE(review): `model` is rebound to a FinBERT classifier in a later cell, after which
# these word vectors are no longer reachable under this name.
# NOTE(review): datapath() comes from gensim.test.utils; presumably it passes an absolute
# path through unchanged, in which case the wrapper is unnecessary — confirm.
model = KeyedVectors.load_word2vec_format(datapath(word2vec_sample), binary=False)

[nltk_data] Downloading package word2vec_sample to /root/nltk_data...
[nltk_data]   Unzipping models/word2vec_sample.zip.


In [5]:
# Select model inputs (risk sentences) and targets (class labels); the actual
# train/test split happens in the next cell.
X, y = merged['Risk Sentence'], merged['Label']



In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# TF-IDF text features, with the vocabulary capped at 1000 terms (max_features).
vectorizer = TfidfVectorizer(max_features=1000)
# Fit on the training split only, so nothing is learned from the test sentences.
X_train_tfidf = vectorizer.fit_transform(X_train)
# Encode the test split with the already-fitted vectorizer.
X_test_tfidf = vectorizer.transform(X_test)

In [8]:
# Baseline 1: support vector classifier trained on the TF-IDF training matrix.
svm_model = SVC(kernel='linear', C=1.0)  # Linear kernel, adjust C as needed
# Last expression of the cell: fits the model in place on the training split.
svm_model.fit(X_train_tfidf, y_train)

In [18]:
# Score the held-out TF-IDF features with the fitted SVM.
y_pred = svm_model.predict(X_test_tfidf)

# Compute both metrics first, then print the per-class report followed by overall accuracy.
svm_report = classification_report(y_test, y_pred)
svm_accuracy = accuracy_score(y_test, y_pred)
print(svm_report)
print("Accuracy:", svm_accuracy)

              precision    recall  f1-score   support

           0       0.70      0.71      0.70      1279
           1       0.81      0.73      0.77      1046
           2       0.72      0.75      0.74      1412
           3       0.79      0.81      0.80      1371

    accuracy                           0.75      5108
   macro avg       0.76      0.75      0.75      5108
weighted avg       0.75      0.75      0.75      5108

Accuracy: 0.7521534847298356


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [10]:
# Bag-of-words count features, with the vocabulary capped at 1000 terms.
# NOTE(review): this rebinds `vectorizer`, replacing the TF-IDF vectorizer fitted in an earlier cell.
vectorizer = CountVectorizer(max_features=1000)  # You can adjust max_features as needed
# Fit on the training split only, then encode the test split with the same vocabulary.
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [21]:
# Baseline 2: logistic regression on the bag-of-words counts.
# fit() returns the fitted estimator, so construction and training are chained.
logreg_model = LogisticRegression(
    max_iter=1000,  # You can adjust max_iter as needed
    random_state=42,
).fit(X_train_bow, y_train)

# Predict on the held-out split (this rebinds `y_pred` from the SVM cell).
y_pred = logreg_model.predict(X_test_bow)

# Per-class precision / recall / F1.
logreg_report = classification_report(y_test, y_pred)
print(logreg_report)

              precision    recall  f1-score   support

           0       0.73      0.69      0.71      1279
           1       0.81      0.75      0.78      1046
           2       0.72      0.75      0.74      1412
           3       0.78      0.82      0.80      1371

    accuracy                           0.76      5108
   macro avg       0.76      0.75      0.76      5108
weighted avg       0.76      0.76      0.76      5108



In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import classification_report

# Load the pre-trained "finbert-tone" sequence-classification checkpoint and its matching tokenizer.
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): this rebinds `model`, which an earlier cell assigned to the word2vec KeyedVectors.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [None]:
# Evaluate the loaded sequence-classification checkpoint on every row of `merged`,
# without any fine-tuning.
#
# NOTE(review): `merged['Label']` holds four classes (0-3), while the checkpoint
# loaded above is a tone classifier. If its output classes do not correspond to
# these labels (the model card describes a 3-way tone head — confirm), the report
# printed below is not a meaningful score for this task.
risk_sentences = merged['Risk Sentence'].tolist()
true_label = merged['Label'].tolist()

# Tokenise the whole corpus once; padding=True pads to the longest sentence overall.
encoded_inputs = tokenizer(risk_sentences, padding=True, truncation=True, return_tensors='pt')
input_ids = encoded_inputs['input_ids']
attention_mask = encoded_inputs['attention_mask']

# Perform inference in mini-batches: one forward pass over the full corpus would
# materialise activations for every sentence at once and can exhaust memory.
BATCH_SIZE = 32
device = next(model.parameters()).device  # feed inputs on whichever device the model lives on
trim_padding = tokenizer.padding_side == 'right'
model.eval()  # inference mode: disables dropout

logit_chunks = []
with torch.no_grad():
    for start in range(0, input_ids.shape[0], BATCH_SIZE):
        batch_ids = input_ids[start:start + BATCH_SIZE]
        batch_mask = attention_mask[start:start + BATCH_SIZE]
        if trim_padding:
            # Columns past the longest real sequence in this batch are pure padding;
            # dropping them saves compute without touching any real token.
            longest = int(batch_mask.sum(dim=1).max())
            batch_ids = batch_ids[:, :longest]
            batch_mask = batch_mask[:, :longest]
        batch_logits = model(batch_ids.to(device), attention_mask=batch_mask.to(device))[0]
        logit_chunks.append(batch_logits.cpu())

# One row of logits per sentence, in the original row order.
logits = torch.cat(logit_chunks, dim=0)

# Obtain predicted labels
predicted_labels = torch.argmax(logits, dim=1)

# Calculate evaluation metrics (hand scikit-learn a plain list rather than a torch tensor)
classification_rep = classification_report(true_label, predicted_labels.tolist())
print(classification_rep)