In [10]:
import re
import string
import nltk
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Ensure necessary NLTK data is downloaded
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# Custom Kazakh conjunctions and stopwords
kazakh_conjunctions = {"мен", "да", "де", "та", "те", "және", "біресе", "бірде", "бірақ", "алайда", "себебі"}

# Custom list of Kazakh finance-related terms
kazakh_finance_terms = {
    "фискалдық", "инвестициялар", "капитал", "салық", "табыс", "қор", "экономика", "нарық", "тәуекел"
}

# Predefined list of finance-related terms in English
finance_terms = {
    "stock", "bond", "investment", "capital", "market", "equity", "portfolio", "dividend", "asset", "liability",
    "revenue", "profit", "interest", "currency", "economy", "inflation", "deflation", "fiscal", "debt", "loan", 
    "tax", "fund", "risk", "yield", "share", "trading", "broker", "capitalization", "finance", "depreciation", "credit"
}

def check_kaznlp_structure(kaznlp_path):
    """Check if the kaznlp package has an __init__.py file and create one if missing."""
    init_file = os.path.join(kaznlp_path, "__init__.py")
    if not os.path.exists(init_file):
        with open(init_file, "w", encoding="utf-8") as f:
            f.write("# KazNLP package initialization\n")
        print("__init__.py file was missing and has been created.")
    else:
        print("__init__.py file exists.")

def load_text(file_name):
    """Load text from a file."""
    if not os.path.exists(file_name):
        return ""  # Return empty string instead of printing an error
    with open(file_name, 'r', encoding='utf-8') as file:
        return file.read()

def preprocess_text(text, language="en"):
    """Preprocess text for the specified language."""
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = re.sub(r"\b\d+%\b", "", text)  # Remove numbers with percentage signs as a whole
    text = remove_numbers(text)  # Remove numbers
    tokens = word_tokenize(text)
    
    # Stop words and custom conjunctions handling
    removed_stop_words = []
    if language == "kk":
        stop_words = set(stopwords.words("russian"))  # Use Russian stopwords as a close alternative
        tokens = [word for word in tokens if word not in stop_words and word not in kazakh_conjunctions]
        removed_stop_words = [word for word in tokens if word in stop_words or word in kazakh_conjunctions]
    
    elif language == "en":
        stop_words = set(stopwords.words("english"))
        tokens = [word for word in tokens if word not in stop_words]
        removed_stop_words = [word for word in tokens if word in stop_words]
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Annotate finance terms in the tokens
    annotated_text = annotate_finance_terms(tokens, language)
    
    return annotated_text, removed_stop_words

def remove_numbers(text):
    """Remove numbers from the text."""
    return re.sub(r'\d+', '', text)

def annotate_finance_terms(tokens, language):
    """Annotate finance-related terms in the text."""
    annotated_tokens = []
    if language == "kk":
        # Use Kazakh-specific finance terms for annotation
        for token in tokens:
            if token in kazakh_finance_terms:
                annotated_tokens.append(f"<қаржы>{token}</қаржы>")
            else:
                annotated_tokens.append(token)
    else:
        # Use English finance terms for annotation
        for token in tokens:
            if token in finance_terms:
                annotated_tokens.append(f"<finance>{token}</finance>")
            else:
                annotated_tokens.append(token)
    return annotated_tokens

def extract_annotated_terms(tokens):
    """Extract the annotated finance terms."""
    annotated_terms = [token for token in tokens if "<қаржы>" in token or "<finance>" in token]
    return annotated_terms

def save_annotated_terms(file_name, annotated_terms):
    """Save annotated finance terms to a file."""
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(" ".join(annotated_terms))

# Example Usage
kazakh_text = load_text("kazakh_language.txt")
english_text = load_text("english_language.txt")

# Process the texts
processed_kazakh_tokens, kazakh_removed_stop_words = preprocess_text(kazakh_text, language="kk")
processed_english_tokens, english_removed_stop_words = preprocess_text(english_text, language="en")

# Extract the annotated finance terms
kazakh_finance_terms = extract_annotated_terms(processed_kazakh_tokens)
english_finance_terms = extract_annotated_terms(processed_english_tokens)

# Print the annotated finance terms separately
print("\nAnnotated Finance Terms in Kazakh Text:")
print(" ".join(kazakh_finance_terms))

print("\nAnnotated Finance Terms in English Text:")
print(" ".join(english_finance_terms))

# Print the preprocessed text
print("\nPreprocessed Kazakh Text:")
print(" ".join(processed_kazakh_tokens))

print("\nPreprocessed English Text:")
print(" ".join(processed_english_tokens))

# Print removed stop words
print("\nRemoved Stop Words from Kazakh Text:")
print(" ".join(kazakh_removed_stop_words))

print("\nRemoved Stop Words from English Text:")
print(" ".join(english_removed_stop_words))

# Save the annotated terms into separate files
save_annotated_terms("annotated_kazakh_terms.txt", kazakh_finance_terms)
save_annotated_terms("annotated_english_terms.txt", english_finance_terms)

# Save the processed text
with open("processed_kazakh.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(processed_kazakh_tokens))

with open("processed_english.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(processed_english_tokens))

print("\nPreprocessing complete. Processed files saved.")



Annotated Finance Terms in Kazakh Text:
<қаржы>экономика</қаржы> <қаржы>фискалдық</қаржы> <қаржы>фискалдық</қаржы> <қаржы>экономика</қаржы> <қаржы>салық</қаржы> <қаржы>капитал</қаржы> <қаржы>салық</қаржы> <қаржы>салық</қаржы> <қаржы>салық</қаржы> <қаржы>салық</қаржы> <қаржы>қор</қаржы> <қаржы>салық</қаржы> <қаржы>қор</қаржы> <қаржы>нарық</қаржы> <қаржы>инвестициялар</қаржы> <қаржы>инвестициялар</қаржы> <қаржы>капитал</қаржы> <қаржы>инвестициялар</қаржы> <қаржы>экономика</қаржы> <қаржы>экономика</қаржы> <қаржы>қор</қаржы> <қаржы>салық</қаржы> <қаржы>салық</қаржы> <қаржы>салық</қаржы> <қаржы>салық</қаржы>

Annotated Finance Terms in English Text:
<finance>fiscal</finance> <finance>investment</finance> <finance>capital</finance> <finance>fiscal</finance> <finance>tax</finance> <finance>investment</finance> <finance>tax</finance> <finance>fund</finance> <finance>tax</finance> <finance>tax</finance> <finance>tax</finance> <finance>tax</finance> <finance>revenue</finance> <finance>fund</fin

[nltk_data] Downloading package punkt to /Users/alisherka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alisherka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alisherka/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
import pandas as pd
import re
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, mean_squared_error
import xgboost as xgb
from sklearn.linear_model import LogisticRegression

# Load processed English and Kazakh files
with open("processed_english.txt", "r", encoding="utf-8") as f:
    english_text = f.read()

with open("processed_kazakh.txt", "r", encoding="utf-8") as f:
    kazakh_text = f.read()

# Function to extract tagged financial terms
def extract_terms(text, tag):
    pattern = f"<{tag}>(.*?)</{tag}>"
    return re.findall(pattern, text)

# Extract financial terms from both languages
english_financial_terms = extract_terms(english_text, "finance")
kazakh_financial_terms = extract_terms(kazakh_text, "қаржы")

# Count occurrences
english_counts = Counter(english_financial_terms)
kazakh_counts = Counter(kazakh_financial_terms)

# Convert to DataFrame
df_english = pd.DataFrame(english_counts.items(), columns=["Term", "Frequency"])
df_kazakh = pd.DataFrame(kazakh_counts.items(), columns=["Term", "Frequency"])

# Sort by frequency
df_english = df_english.sort_values(by="Frequency", ascending=False)
df_kazakh = df_kazakh.sort_values(by="Frequency", ascending=False)

# Prepare dataset for ML modeling
all_terms = english_financial_terms + kazakh_financial_terms
labels = [1] * len(english_financial_terms) + [0] * len(kazakh_financial_terms)

# Convert text data to numerical representation
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(all_terms)
y = np.array(labels)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an XGBoost classifier
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Train a HAMLET (Logistic Regression as a baseline alternative)
hamlet_model = LogisticRegression()
hamlet_model.fit(X_train, y_train)

# Perform cross-validation
cv_scores_xgb = cross_val_score(xgb_model, X, y, cv=5)
cv_scores_hamlet = cross_val_score(hamlet_model, X, y, cv=5)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)
y_pred_hamlet = hamlet_model.predict(X_test)

# Evaluate model performance
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb, average='weighted')
recall_xgb = recall_score(y_test, y_pred_xgb, average='weighted')
f1_xgb = f1_score(y_test, y_pred_xgb, average='weighted')
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)

accuracy_hamlet = accuracy_score(y_test, y_pred_hamlet)
precision_hamlet = precision_score(y_test, y_pred_hamlet, average='weighted')
recall_hamlet = recall_score(y_test, y_pred_hamlet, average='weighted')
f1_hamlet = f1_score(y_test, y_pred_hamlet, average='weighted')
rmse_hamlet = np.sqrt(mean_squared_error(y_test, y_pred_hamlet))
conf_matrix_hamlet = confusion_matrix(y_test, y_pred_hamlet)

# Save results to CSV
df_english.to_csv("english_financial_terms.csv", index=False)
df_kazakh.to_csv("kazakh_financial_terms.csv", index=False)

# Display evaluation metrics
print("XGBoost Model Evaluation:")
print(f"Accuracy: {accuracy_xgb:.4f}")
print(f"Precision: {precision_xgb:.4f}")
print(f"Recall: {recall_xgb:.4f}")
print(f"F1-score: {f1_xgb:.4f}")
print(f"RMSE: {rmse_xgb:.4f}")
print("Confusion Matrix:")
print(conf_matrix_xgb)

print("\nHAMLET Model Evaluation (Logistic Regression):")
print(f"Accuracy: {accuracy_hamlet:.4f}")
print(f"Precision: {precision_hamlet:.4f}")
print(f"Recall: {recall_hamlet:.4f}")
print(f"F1-score: {f1_hamlet:.4f}")
print(f"RMSE: {rmse_hamlet:.4f}")
print("Confusion Matrix:")
print(conf_matrix_hamlet)

# Display cross-validation results
print("\nCross-Validation Scores for XGBoost:", cv_scores_xgb)
print("Mean CV Accuracy (XGBoost):", np.mean(cv_scores_xgb))
print("\nCross-Validation Scores for HAMLET:", cv_scores_hamlet)
print("Mean CV Accuracy (HAMLET):", np.mean(cv_scores_hamlet))

# Display top 10 frequent terms in both languages
print("\nTop 10 English Financial Terms:")
print(df_english.head(10))

print("\nTop 10 Kazakh Financial Terms:")
print(df_kazakh.head(10))


XGBoost Model Evaluation:
Accuracy: 0.8947
Precision: 0.9071
Recall: 0.8947
F1-score: 0.8805
RMSE: 0.3244
Confusion Matrix:
[[ 2  2]
 [ 0 15]]

HAMLET Model Evaluation (Logistic Regression):
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-score: 1.0000
RMSE: 0.0000
Confusion Matrix:
[[ 4  0]
 [ 0 15]]

Cross-Validation Scores for XGBoost: [0.78947368 0.94444444 0.77777778 0.72222222 0.94444444]
Mean CV Accuracy (XGBoost): 0.835672514619883

Cross-Validation Scores for HAMLET: [0.89473684 0.94444444 0.83333333 0.83333333 1.        ]
Mean CV Accuracy (HAMLET): 0.9011695906432748

Top 10 English Financial Terms:
          Term  Frequency
1   investment         29
6      economy         11
3          tax          6
10     finance          4
7       market          3
0       fiscal          2
2      capital          2
4         fund          2
12       share          2
5      revenue          1

Top 10 Kazakh Financial Terms:
            Term  Frequency
2          салық         10
0   

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

