In [None]:
# Importing Libraries

import os
import re

import numpy as np
import pandas as pd
import sklearn.linear_model
import sklearn.metrics
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
def remove_words_from_string(input_string, bad_words):
    """
    Removes every word of a string that appears in a collection of unwanted words.

    The string is tokenized with ``str.split()`` (any run of whitespace is a
    separator), so matching is exact and case-sensitive on whole tokens.

    Args:
      input_string: text to filter (e.g. one x_train text element).
      bad_words: collection of words to drop (e.g. stop words).

    Returns:
      A new string holding the surviving words in their original order,
      joined by single spaces. Empty input yields an empty string.
    """
    # A list/tuple is rescanned for every single word; a frozenset answers the
    # same membership question in constant time. Any other container (set,
    # dict, str, ...) is used as given so its own ``in`` semantics are kept.
    if isinstance(bad_words, (list, tuple)):
        try:
            bad_words = frozenset(bad_words)
        except TypeError:
            # Unhashable entries: fall back to the plain sequence lookup.
            pass

    return " ".join(word for word in input_string.split() if word not in bad_words)
        

In [95]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split

def tune_hyperparameters_and_fit(x_train_dF, y_train_d, test_size=0.2,
                                 random_state=42, cv=2, param_grid=None):
    """
    Holds out part of the data, then grid-searches a logistic-regression
    pipeline on the remainder.

    Args:
      x_train_dF: feature matrix, one row per example.
      y_train_d: labels aligned with the rows of x_train_dF.
      test_size: forwarded to train_test_split (default 0.2).
      random_state: seed forwarded to train_test_split (default 42).
      cv: cross-validation setting forwarded to GridSearchCV (default 2).
      param_grid: hyperparameter grid for GridSearchCV, keyed by
        '<step>__<param>'. None selects the small built-in grid below.

    Returns:
      A tuple (lr_model_trained, x_test_VF, y_test_V): the fitted
      GridSearchCV object plus the held-out features and labels, which were
      not used for fitting.
    """
    x_train_UF, x_test_VF, y_train_U, y_test_V = train_test_split(
        x_train_dF, y_train_d, test_size=test_size, random_state=random_state)

    # The pipeline is the blueprint every grid point is fitted with. All four
    # steps always run; 'poly' and 'pca' only become meaningful when the grid
    # sets their parameters (degree, n_components, ...).
    pipeline = Pipeline([
        ('scaler', StandardScaler()),                    # Feature scaling
        ('poly', PolynomialFeatures()),                  # Polynomial feature expansion
        ('pca', PCA()),                                  # Dimensionality reduction
        ('logistic', LogisticRegression(max_iter=1000))  # Logistic regression classifier
    ])

    if param_grid is None:
        # Default grid: deliberately a single point, because wider searches
        # were very slow. Alternatives that were tried are left as comments.
        param_grid = {
            'poly__degree': [1],
            # restrict polynomial features to interaction terms only
            #'poly__interaction_only': [False, True],
            # PCA: no reduction, or keep enough components to explain 95% variance
            #'pca__n_components': [None, 0.95],
            # inverse regularization strength
            'logistic__C': [0.01], #0.1, 1, 10
            # lasso (l1) or ridge (l2) penalty
            'logistic__penalty': ['l2'], #'l1',
            # 'liblinear' supports both 'l1' and 'l2'; 'lbfgs' does not work with l1
            'logistic__solver': ['liblinear'], #'saga'
            # 'balanced' would reweight classes inversely to their frequency
            #'logistic__class_weight': [None, 'balanced']
        }

    # Grid search scored by accuracy with `cv`-fold cross-validation
    # (2 folds by default), using all available cores.
    lr_model_trained = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)

    # Fit on the training split only; the held-out split stays unseen.
    lr_model_trained.fit(x_train_UF, y_train_U)

    return lr_model_trained, x_test_VF, y_test_V


In [75]:

def contains_strings(arr):
    """
    Reports whether a NumPy array holds string data.

    Arrays with a unicode ('U') or byte-string ('S') dtype count as strings
    outright. Object arrays are inspected element by element for ``str``
    instances. Every other dtype yields False.
    """
    dtype_kind = arr.dtype.kind

    if dtype_kind == 'O':
        # Object arrays can mix types, so look at the actual values.
        for item in arr.flatten():
            if isinstance(item, str):
                return True
        return False

    # Fixed-width string dtypes are strings by definition.
    return dtype_kind in {'U', 'S'}

In [97]:
if __name__ == '__main__':
    # Load the raw features and labels.
    data_dir = 'data_readinglevel'
    x_unprocessed_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
    y_unprocessed_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))

    N, n_cols = x_unprocessed_df.shape
    print("Shape of x_unprocessed_df: (%d, %d)" % (N, n_cols))
    print("Shape of y_unprocessed_df: %s" % str(y_unprocessed_df.shape))

    tr_text_list = x_unprocessed_df['text'].values.tolist()

    # Stop words to filter the text with, one per line in stopwords.txt.
    # Kept as a set (not converted back to a list) so that each membership
    # test during filtering is O(1) instead of a scan of the whole list.
    with open("stopwords.txt", "r") as file:
        stopwords_list = file.read()
    stopwords = set(stopwords_list.splitlines())

    vectorizer = CountVectorizer()    # for bag of words

    # Clean every text: letters only, lowercase, stop words removed.
    element_list = []
    for element in tr_text_list:
        element = re.sub('[^A-Za-z]', ' ', element) # keep only letters and spaces
        element = element.lower()                   # convert to lowercase
        element = remove_words_from_string(element, stopwords)
        element_list.append(element)

    bag_of_words = vectorizer.fit_transform(element_list)

    # Sanity check of bag_of_words (uncomment to inspect):
    # the first vocabulary entry is "aaron", the column sums report 6
    # occurrences of it across all texts, and searching x_train.csv for
    # "aaron" confirms exactly 6.
    #print("Vocabulary:", vectorizer.get_feature_names_out())
    #print(np.asarray(bag_of_words.sum(axis=0)))

    # One bag-of-words row per input row.
    assert bag_of_words.shape[0] == x_unprocessed_df.shape[0]
    print(x_unprocessed_df.shape)

    # bag_of_words.toarray() turns the sparse matrix into a dense numpy array,
    # which is slow and memory hungry. TODO: keep it sparse once the pipeline
    # steps downstream are confirmed to accept sparse input.
    # Columns 4 onward of the raw frame are stacked next to the word counts.
    x_train_dF = np.column_stack((x_unprocessed_df.iloc[:, 4:].values, bag_of_words.toarray()))
    print("x_train_df shape: ")
    print(x_train_dF.shape)

    print("Contains strings: " + str(contains_strings(x_train_dF)))
    # Binarize the label held in column 3 of the label frame:
    # 1 for "Key Stage 4-5", 0 for everything else.
    y_train_d = (y_unprocessed_df.iloc[:, 3].values.flatten() == "Key Stage 4-5").astype(int)
    print("Contains strings: " + str(contains_strings(y_train_d)))
    lr_model_trained, x_test_VF, y_test_V = tune_hyperparameters_and_fit(x_train_dF, y_train_d)

    accuracy = lr_model_trained.score(x_test_VF, y_test_V)
    print("Test Accuracy:", accuracy)

    # Dump the held-out feature rows, one per line, for inspection.
    # NOTE(review): assumes the held-out features (x_test_VF) are what
    # xtest.txt is meant to contain -- confirm. str(row) uses NumPy's
    # abbreviated repr for long rows, so this is a preview, not a full export.
    with open("xtest.txt", "w") as file:
        for element in x_test_VF:
            file.write(str(element) + "\n")


        

# Step 1:

# Get Data (Done)
# Use bag of words to "clean" and transform / vectorize the text so that
# logistic regression can actually be utilized
# Readability metrics look useful
# Use data?

Shape of x_unprocessed_df: (5557, 32)
Shape of y_unprocessed_df: (5557, 5)
(5557, 32)
x_train_df shape: 
(5557, 25262)
Contains strings: False
Contains strings: False
Test Accuracy: 0.7733812949640287


In [None]:
    # print("tr_text_list\n", tr_text_list)
    # print("element_list\n", element_list)
    
    # Print out 8 random entries
#     prng = np.random.RandomState(101)
#     rows = prng.permutation(np.arange(y_unprocessed_df.shape[0]))
#     for row_id in rows[:8]:
#         text = tr_text_list[row_id]
#         print("row %5d | %s BY %s | y = %s" % (
#             row_id,
#             y_unprocessed_df['title'].values[row_id],
#             y_unprocessed_df['author'].values[row_id],
#             y_unprocessed_df['Coarse Label'].values[row_id],
#             ))
#         # Pretty print text via textwrap library
#         line_list = textwrap.wrap(tr_text_list[row_id],
#             width=70,
#             initial_indent='  ',
#             subsequent_indent='  ')
#         print('\n'.join(line_list))
#         print("")