In [100]:
# Pseudo code for the text classification task
import os

import pandas as pd

import re

from sklearn.feature_extraction.text import TfidfVectorizer


import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/adamerik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adamerik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [101]:
def txt_to_list(folder_path):
    temp_list = []
    txt_files = os.listdir(folder_path)
    for txt_file in txt_files:
        with open(os.path.join(folder_path, txt_file), 'r', encoding='utf-8') as file:
            text_data = file.read()
            temp_list.append(text_data)
    return temp_list


def preprocess_text(text):
    # Tokenize the text (you can replace this with your tokenizer)
    tokens = nltk.word_tokenize(text)

    # Convert to lowercase
    tokens = [token.lower() for token in tokens]

    # Remove stopwords
    nltk_stopwords = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in nltk_stopwords]

    # Stem the tokens
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Remove special characters and numbers
    tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens]

    # Remove empty tokens
    tokens = [token for token in tokens if token != '']

    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [102]:
### Data load
banks_folder_path = "./train/banks"
oilgas_folder_path = "./train/oilgas"
pharma_folder_path = "./train/pharma"

banks_df = pd.DataFrame(txt_to_list(banks_folder_path))
oilgas_df = pd.DataFrame(txt_to_list(oilgas_folder_path))
pharma_df = pd.DataFrame(txt_to_list(pharma_folder_path))

banks_df["class"] = "banks"
oilgas_df["class"] = "oilgas"
pharma_df["class"] = "pharma"

train_df = pd.concat([banks_df, oilgas_df, pharma_df])
train_df = train_df.rename(columns={0: "text", 1: "class",})

In [103]:
train_df['word_count'] = train_df['text'].apply(lambda x: len(str(x).split()))
print(train_df[train_df['class']=="banks"]['word_count'].mean()) #banks text
print(train_df[train_df['class']=="oilgas"]['word_count'].mean()) #oilgas text
print(train_df[train_df['class']=="pharma"]['word_count'].mean()) #pharma text

62154.94117647059
51593.17857142857
53114.64705882353


In [104]:
train_df['text'] = train_df['text'].str.lower()

In [105]:
train_df['preprocessed_text'] = train_df['text'].apply(preprocess_text)

In [106]:
train_df

Unnamed: 0,text,class,word_count,preprocessed_text
0,united states securities and exchange commiss...,banks,62149,unit state secur exchang commiss washington dc...
1,united states securities and exchange commiss...,banks,45035,unit state secur exchang commiss washington dc...
2,united states securities and exchange commiss...,banks,47117,unit state secur exchang commiss washington dc...
3,\t \t \t\t \t\t\t table of contents \t\t \t\t...,banks,60182,tabl content unit state ecur exchang commiss w...
4,\t \t united states securities and exchange c...,banks,86034,unit state secur exchang commiss washington dc...
...,...,...,...,...
29,united states securities and exchange commiss...,pharma,16682,unit state secur exchang commiss washington dc...
30,united states securities and exchange commiss...,pharma,40623,unit state secur exchang commiss washington dc...
31,united states securities and exchange commiss...,pharma,72965,unit state secur exchang commiss washington dc...
32,united states securities and exchange commiss...,pharma,36244,unit state secur exchang commiss washington dc...


In [None]:
def preprocess_text(data_path):
    # Implement text preprocessing, such as tokenization, vectorization, etc.
    pass

def train_linear_classifier(X_train, y_train):
    # Implement training of linear classifier
    pass

def validate_classifier(model, X_val, y_val):
    # Test the classifier on validation set
    pass

def test_classifier(model, X_test):
    # Make predictions with the trained model on test set
    pass

def save_predictions(predictions, output_path):
    # Save the predictions to a file
    pass

def main():
    # Step 1: Preprocessing data
    X_train, y_train = preprocess_text("path/to/train/your_chosen_category/")
    X_val, y_val = preprocess_text("path/to/validation/your_chosen_category/")
    X_test = preprocess_text("path/to/test/")

    # Step 2: Training a linear classifier
    model = train_linear_classifier(X_train, y_train)

    # Step 3: Validate the model performance
    validate_classifier(model, X_val, y_val)

    # Optional: Tune the model if validation results are not satisfactory

    # Step 4: Test the model
    predictions = test_classifier(model, X_test)

    # Step 5: Save the predictions
    save_predictions(predictions, "path/to/output/predictions.txt")

    # Additional: Write a report documenting your process and findings

# Run the main function
if __name__ == "__main__":
    main()