<a href="https://colab.research.google.com/github/YounSooKimTech/self_study/blob/main/202309_UN_Risk_Labeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training the model with BOS_RISK

In [44]:
import pandas as pd

df_train = pd.read_excel("/content/BOS_RISKS (1).xlsx")
df_train.head()

df_train = df_train[df_train["Language"] == "English"]
df_train['RISK_DESCRIPTION'] = df_train['RISK_DESCRIPTION'].fillna("")

In [47]:
search_word = "funding"
df_train[df_train["RISK_DESCRIPTION"].str.contains(search_word)][["RISK_DESCRIPTION", "Standard_risk_L2"]]


Unnamed: 0,RISK_DESCRIPTION,Standard_risk_L2
404,Budget cuts and/or availability of funding for...,Funding
415,Reduced funding for activity implementation.,Funding
473,Lack of provision of funding for this activity,Funding


In [4]:
df_train["Standard_risk_L2"].value_counts()

df_train['RISK_DESCRIPTION'].fillna("", inplace=True)
df_train["Standard_risk_L2"].fillna("", inplace=True)

df_train.head()

Unnamed: 0,COUNTRY,Max of REVIEW_YEAR,RISK_DESCRIPTION,Standard_risk_L2,Standard_risk_L1,LIKEHOOD,IMPACT,OVERALL_RISK,RISK_TREATMENT,MITIGATION_MEASURES,Language
2,Algeria,2023,Staff Turnover or change in Profile for key po...,Operational capacity,"2. Operational factors (funding, capacity and ...",Highly Possible,Significant Concern,Very High,Control,RC OFFICE ensure backup and follow up,English
4,Algeria,2023,Delays in implementing joint premises projects...,Agencies' commitment and participation,"1. Agencies alignment, commitment and particip...",Possible,Minor,Medium,Accept,monitor the case at UNCT level,English
5,Algeria,2023,Market service disruptions due to the impact o...,COVID-19,"3. External factors (COVID-19, economic, legis...",Possible,Slight Concern,Medium,Monitor,proactive action thru LTA and planning with Su...,English
7,Algeria,2023,Insufficient supplier capacity to handle incre...,Operational capacity,"2. Operational factors (funding, capacity and ...",Highly Possible,Slight Concern,High,Monitor,sensitize UNCT on capacity reinforcement,English
8,Angola,2023,Absence of common sharepoint would lead to ina...,Interoperability and data availability,"2. Operational factors (funding, capacity and ...",Negligible,Slight Concern,Medium,Control,Continue the maintenance of the Common Sharepo...,English


In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")

def tokenize_lemma_and_remove_stopwords_and_punctuation(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if not token.is_punct and not token.is_stop and not token.like_num]
    tokens = [token for token in tokens if token.strip()]
    return " ".join(tokens)

df_train["text"] = df_train["RISK_DESCRIPTION"].apply(tokenize_lemma_and_remove_stopwords_and_punctuation)

df_train.head()

Unnamed: 0,COUNTRY,Max of REVIEW_YEAR,RISK_DESCRIPTION,Standard_risk_L2,Standard_risk_L1,LIKEHOOD,IMPACT,OVERALL_RISK,RISK_TREATMENT,MITIGATION_MEASURES,Language,text
2,Algeria,2023,Staff Turnover or change in Profile for key po...,Operational capacity,"2. Operational factors (funding, capacity and ...",Highly Possible,Significant Concern,Very High,Control,RC OFFICE ensure backup and follow up,English,staff turnover change profile key position omt...
4,Algeria,2023,Delays in implementing joint premises projects...,Agencies' commitment and participation,"1. Agencies alignment, commitment and particip...",Possible,Minor,Medium,Accept,monitor the case at UNCT level,English,delay implement joint premise project un houses
5,Algeria,2023,Market service disruptions due to the impact o...,COVID-19,"3. External factors (COVID-19, economic, legis...",Possible,Slight Concern,Medium,Monitor,proactive action thru LTA and planning with Su...,English,market service disruption impact covid19
7,Algeria,2023,Insufficient supplier capacity to handle incre...,Operational capacity,"2. Operational factors (funding, capacity and ...",Highly Possible,Slight Concern,High,Monitor,sensitize UNCT on capacity reinforcement,English,insufficient supplier capacity handle increase...
8,Angola,2023,Absence of common sharepoint would lead to ina...,Interoperability and data availability,"2. Operational factors (funding, capacity and ...",Negligible,Slight Concern,Medium,Control,Continue the maintenance of the Common Sharepo...,English,absence common sharepoint lead inability effec...


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split


# Load your dataset (X_train, y_train, X_val, y_val, X_test, y_test)
X = df_train['text']
y = df_train['Standard_risk_L2']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the number of features as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

# Evaluate on the validation set
y_test_pred = clf.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Model Accuracy: {accuracy}")


Model Accuracy: 0.46153846153846156


In [7]:
# Random forest model
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_tfidf, y_train)

y_test_pred = rf_classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.6615384615384615


In [8]:
from sklearn.svm import SVC

svm_classifier = SVC(kernel='linear', C=1.0)
svm_classifier.fit(X_train_tfidf, y_train)

y_test_pred = svm_classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.6


# Apply the model from previous training

In [51]:
df_test = pd.read_excel('/content/SERVICES_RISKS.xlsx')
s_df_test = df_test[["COUNTRY","RISK_DESCRIPTION", "Standard_risk_L2"]]

In [52]:
!pip install langdetect

from langdetect import detect

def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"  # In case language detection fails

s_df_test["language"] = s_df_test["RISK_DESCRIPTION"].apply(detect_language)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s_df_test["language"] = s_df_test["RISK_DESCRIPTION"].apply(detect_language)


In [53]:
s_df_test.language.value_counts()/len(s_df_test)

en         0.849894
es         0.067653
fr         0.050740
no         0.013742
de         0.005285
tl         0.003171
it         0.003171
da         0.002114
ro         0.001057
ca         0.001057
af         0.001057
unknown    0.001057
Name: language, dtype: float64

In [55]:


s_df_test[s_df_test.language != "en"].to_excel('lang_labeled.xlsx', index=False)

from google.colab import files
files.download('lang_labeled.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split


# Load your dataset (X_train, y_train, X_val, y_val, X_test, y_test)
X = df_train['text']
y = df_train['Standard_risk_L2']

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the number of features as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X)

# random forest classifier
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_tfidf, y)

# preprocess the test dataset
s_en_df_test = s_df_test[s_df_test["language"] == "en"]
X_en_test = s_en_df_test["RISK_DESCRIPTION"]
X_en_test_tfidf = tfidf_vectorizer.transform(X_en_test)

predicted_labels = rf_classifier.predict(X_en_test_tfidf)
s_en_df_test["Standard_risk_L2"] = predicted_labels


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s_en_df_test["Standard_risk_L2"] = predicted_labels


In [87]:
# Input text
input_text = ["Service disruption due to natural disasters. Agencies not participating."]

# Vectorize the input text using the pre-trained TF-IDF vectorizer
X_sample_text = tfidf_vectorizer.transform(input_text)

# Predict the label using the trained Random Forest classifier
predicted_label = rf_classifier.predict(X_sample_text)

# Print the predicted label
print(f"Predicted Label: {predicted_label[0]}")


Predicted Label: Underutilization


In [36]:
s_en_df_test.head()

s_en_df_test.to_excel('en_labels.xlsx', index=False)

from google.colab import files
files.download('en_labels.xlsx')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Bert Classifier

In [1]:
import gensim.downloader as api
model_name = "word2vec-google-news-300"
model = api.load(model_name)




In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec  # Example using Word2Vec

# Load your dataset (X_train, y_train, X_test, y_test)
X = df_train['text']
y = df_train['Standard_risk_L2']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Function to convert a list of words into a fixed-length vector
def text_to_vector(text, model, num_features):
    text_vector = np.zeros((num_features,), dtype="float32")
    num_words = 0
    for word in text:
        if word in model:
            text_vector = np.add(text_vector, model[word])
            num_words += 1
    if num_words > 0:
        text_vector = np.divide(text_vector, num_words)
    return text_vector

# Convert text data to word vectors
num_features = word2vec_model.vector_size
X_train_word_vectors = [text_to_vector(text.split(), word2vec_model, num_features) for text in X_train]
X_test_word_vectors = [text_to_vector(text.split(), word2vec_model, num_features) for text in X_test]

# Train a Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_word_vectors, y_train)

# Evaluate on the test set
y_test_pred = clf.predict(X_test_word_vectors)
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Model Accuracy: {accuracy}")


