In [None]:
# nltk must be imported in this cell: it runs before the notebook's main
# import cell, so on a fresh kernel the name would otherwise be undefined.
import nltk

# Fetch the stop-word corpus that stopwords.words('english') reads later on.
nltk.download('stopwords')

In [None]:
import os
import re

import pandas as pd
import matplotlib.pyplot as plt
import seaborn

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:

class DataLoader:
    """Reads the regular files of a directory into memory as UTF-8 text."""

    def load_documents(self, directory):
        """Load every regular file located directly inside ``directory``.

        Args:
            directory: Path of the folder that holds one document per file.

        Returns:
            A list of ``(filename, text)`` tuples, ordered by filename.

        Raises:
            OSError: If ``directory`` cannot be listed or a file cannot be opened.
            UnicodeDecodeError: If a file is not valid UTF-8.
        """
        documents = []
        # os.listdir returns entries in arbitrary order; sorting makes the
        # document order (and everything derived from it) reproducible.
        for filename in sorted(os.listdir(directory)):
            path = os.path.join(directory, filename)
            # Sub-directories (e.g. .ipynb_checkpoints) cannot be opened as
            # text, so anything that is not a regular file is skipped.
            if not os.path.isfile(path):
                continue
            with open(path, 'r', encoding='utf-8') as file:
                documents.append((filename, file.read()))
        return documents
        


In [None]:
class TextPreprocessor:
    """Turns raw text into lower-cased, stop-word-free, stemmed tokens."""

    def __init__(self):
        """Prepare the English stop-word set and a Porter stemmer."""
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def clean_text(self, text):
        """Replace every non-word character with a space, then lower-case."""
        return re.sub(r'\W', ' ', text).lower()

    def tokenize(self, text):
        """Split on whitespace, drop stop words, and stem the remaining words."""
        stemmed = []
        for word in text.split():
            if word in self.stop_words:
                continue
            stemmed.append(self.stem(word))
        return stemmed

    def stem(self, word):
        """Return the stem of ``word`` as produced by the configured stemmer."""
        return self.stemmer.stem(word)

    def preprocess_documents(self, documents):
        """Clean and tokenize each ``(filename, text)`` pair.

        Returns a list of ``(filename, tokens)`` tuples in the input order.
        """
        return [
            (name, self.tokenize(self.clean_text(body)))
            for name, body in documents
        ]


In [None]:
class Vectorizer:
    """Thin wrapper that owns a single TfidfVectorizer instance."""

    def __init__(self):
        """Create the wrapped TF-IDF vectorizer with its default settings."""
        self.vectorizer = TfidfVectorizer()

    def fit_transform(self, texts):
        """Delegate to the wrapped vectorizer's ``fit_transform`` and return its result."""
        matrix = self.vectorizer.fit_transform(texts)
        return matrix

    def transform(self, texts):
        """Delegate to the wrapped vectorizer's ``transform`` and return its result."""
        matrix = self.vectorizer.transform(texts)
        return matrix


In [None]:
class Model:
    """Logistic-regression classifier that keeps its own hold-out split."""

    def __init__(self):
        """Create the estimator, allowing up to 1000 solver iterations."""
        self.model = LogisticRegression(max_iter=1000)

    def train(self, X, y):
        """Split ``X``/``y`` 80/20 with a fixed seed and fit on the training part.

        The four pieces of the split are stored on the instance as
        ``X_train``, ``X_test``, ``y_train`` and ``y_test``; ``evaluate`` and
        ``tune_hyperparameters`` read them.
        """
        split = train_test_split(X, y, test_size=0.2, random_state=42)
        self.X_train, self.X_test, self.y_train, self.y_test = split
        self.model.fit(self.X_train, self.y_train)

    def evaluate(self):
        """Print a classification report for the test split stored by ``train``."""
        predicted = self.model.predict(self.X_test)
        report = classification_report(self.y_test, predicted)
        print(report)

    def tune_hyperparameters(self, param_grid):
        """Grid-search ``param_grid`` with 5-fold CV on the stored training split.

        Prints the best parameter combination and returns the best estimator;
        ``self.model`` itself is not replaced.
        """
        search = GridSearchCV(self.model, param_grid, cv=5)
        search.fit(self.X_train, self.y_train)
        print("Best parameters found: ", search.best_params_)
        return search.best_estimator_


In [None]:
class EDA:
    """Quick exploratory views over a DataFrame of articles."""

    def __init__(self, data):
        """Keep a reference to ``data``; the methods read its 'category' and 'text' columns."""
        self.data = data

    def visualize_data_distribution(self):
        """Show a count plot with the 'category' column on the y-axis."""
        seaborn.countplot(data=self.data, y='category')
        plt.title('Distribution of Article Categories')
        plt.show()

    def calculate_statistics(self):
        """Print the mean and median whitespace-delimited word count of 'text'."""
        def word_count(article):
            return len(article.split())

        lengths = self.data['text'].apply(word_count)
        mean_length = lengths.mean()
        print(f'Mean length of articles: {mean_length}')
        median_length = lengths.median()
        print(f'Median length of articles: {median_length}')


In [None]:
class Bootstrapping:
    """Refits a model on the samples it currently misclassifies."""

    def __init__(self, model):
        """Store ``model``; it must expose ``predict(X)`` and ``fit(X, y)``."""
        self.model = model

    @staticmethod
    def _take_rows(container, positions):
        """Select rows of ``container`` by integer position."""
        # pandas objects: plain [] with a list of ints looks up *labels* (or
        # columns, for a DataFrame), which is wrong for a shuffled index such
        # as the one a train/test split leaves behind. .iloc is positional.
        if hasattr(container, 'iloc'):
            return container.iloc[positions]
        # Built-in sequences do not support indexing with a list.
        if isinstance(container, (list, tuple)):
            return [container[i] for i in positions]
        # Remaining containers (e.g. numpy arrays) are indexed directly with
        # the list of positions, as the original code did.
        return container[positions]

    def retrain_on_misclassified(self, X_test, y_test):
        """Refit the model using only the samples it predicts incorrectly.

        Args:
            X_test: Feature rows; anything ``model.predict`` accepts.
            y_test: True labels aligned positionally with ``X_test``.

        Returns:
            None. When at least one sample is misclassified the model is
            refitted and a confirmation line is printed; otherwise nothing
            happens.
        """
        predictions = self.model.predict(X_test)
        misclassified_indices = [
            i for i, (pred, true) in enumerate(zip(predictions, y_test))
            if pred != true
        ]

        if misclassified_indices:
            X_misclassified = self._take_rows(X_test, misclassified_indices)
            y_misclassified = self._take_rows(y_test, misclassified_indices)
            # NOTE(review): for scikit-learn estimators fit() presumably starts
            # from scratch, so this replaces rather than augments the earlier
            # training - confirm that is the intent.
            self.model.fit(X_misclassified, y_misclassified)
            print("Model retrained on misclassified articles.")


In [None]:
def main(documents_dir='data/your_financial_articles_directory/',
         categories_path='data/categories.csv'):
    """Run the whole pipeline: load, preprocess, train, predict, explore, evaluate.

    Args:
        documents_dir: Folder with the unlabeled article files (one per file).
        categories_path: CSV of labeled examples; its 'text' and 'category'
            columns are used for training.
    """
    # Load the unlabeled articles.
    loader = DataLoader()
    documents = loader.load_documents(documents_dir)

    # Clean, tokenize and stem them.
    preprocessor = TextPreprocessor()
    processed_documents = preprocessor.preprocess_documents(documents)

    # DataFrame for the unlabeled data; tokens are joined back into a string
    # because the vectorizer consumes text, not token lists.
    data = pd.DataFrame(processed_documents, columns=['filename', 'tokens'])
    data['text'] = data['tokens'].apply(' '.join)

    # Load the labeled examples.
    categories_df = pd.read_csv(categories_path)

    # The labeled text must pass through the same cleaning / stop-word removal /
    # stemming as the unlabeled text. Otherwise the vocabulary learned from raw
    # words would not match the stemmed tokens seen at prediction time.
    labeled_text = categories_df['text'].apply(
        lambda raw: ' '.join(preprocessor.tokenize(preprocessor.clean_text(raw)))
    )

    # Vectorization.
    vectorizer = Vectorizer()
    X_labeled = vectorizer.fit_transform(labeled_text)
    y_labeled = categories_df['category']

    # Train the model on the labeled data.
    model = Model()
    model.train(X_labeled, y_labeled)

    # Predict categories for the unlabeled dataset.
    X_unlabeled = vectorizer.transform(data['text'])
    predictions = model.model.predict(X_unlabeled)

    # Add predictions to the DataFrame.
    data['category'] = predictions

    # Display predictions to check results.
    print(data[['filename', 'category']])

    # Check distribution of predicted categories.
    print("Predicted category distribution:")
    print(data['category'].value_counts())

    # Exploratory data analysis on the predicted data.
    eda = EDA(data)
    eda.visualize_data_distribution()
    eda.calculate_statistics()

    # Evaluate model performance on the held-out part of the labeled data.
    model.evaluate()


In [None]:
# Run the full pipeline: load, preprocess, train, predict, explore, evaluate.
main()