Implementation of a file with 4 ".txt" type of files, and analysis through its contect and feature prediction by using semmantics, embedding and vectorization methods.

In [1]:
import re

# Path to your text file
file_path = r"C:/Users/u651278/Downloads/text files/pc_jabberwocky.txt"  # Update with the correct path

# Initialize counters
comma_count = 0
capital_letter_count = 0

# Read the file and count commas and capital letters
try:
    with open(file_path, 'r') as file:
        content = file.read()
        
        # Count commas
        comma_count = content.count(',')
        
        # Count capital letters using regular expressions
        capital_letter_count = len(re.findall(r'[A-Z]', content))
        
    print(f"Number of commas: {comma_count}")
    print(f"Number of capital letters: {capital_letter_count}")
    
except FileNotFoundError:
    print(f"Error: {file_path} not found.")


Number of commas: 16
Number of capital letters: 49


We see the output of commas and capitals for one of the files, which we care about to see if the system can work with them for prediction.

In [2]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Path to your text file
file_path = r"C:\Users\YourName\Documents\text files\your_file.txt"  # Update with the correct path

# Read the content of the file
try:
    with open(file_path, 'r') as file:
        content = file.read()
except FileNotFoundError:
    print(f"Error: {file_path} not found.")
    exit()

# Prepare the features and labels
window_size = 5  # Number of previous characters used to predict the next one

X = []  # Features (text snippets)
y = []  # Labels (comma or capital letter)

# Function to check if a character is a comma or a capital letter
def is_comma_or_capital(char):
    if char == ',':
        return 1  # Comma
    elif char.isupper():
        return 2  # Capital letter
    return 0  # Neither comma nor capital letter

# Loop through the content and prepare data
for i in range(len(content) - window_size):
    # Create a feature vector for the previous 'window_size' characters
    window = content[i:i + window_size]
    
    # Create the target variable: what comes next (comma or capital letter)
    next_char = content[i + window_size]
    
    # Add the feature (text snippet)
    X.append(window)  # Use text snippets as features
    
    # Add the label for the next character (comma or capital letter)
    y.append(is_comma_or_capital(next_char))

# Convert to pandas DataFrame for easier manipulation
X = np.array(X)
y = np.array(y)

# Use TF-IDF to vectorize the text snippets (features)
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))  # Using character bigrams and trigrams
X_tfidf = vectorizer.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train a Logistic Regression classifier
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model using accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report for detailed metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))



Error: C:\Users\YourName\Documents\text files\your_file.txt not found.
Accuracy: 0.9369
Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       208
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00        10

    accuracy                           0.94       222
   macro avg       0.31      0.33      0.32       222
weighted avg       0.88      0.94      0.91       222



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


: 

By embbeding and vectorizing our text file, the system was propperly trained and achieved a .9369 accuracy for commas and capital letters prediction. 

In [2]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, classification_report

# Path to your text file
file_path = r"C:/Users/u651278/Downloads/text files/pc_jabberwocky.txt"  # Update with the correct path

# Read the content of the file
try:
    with open(file_path, 'r') as file:
        content = file.read()
except FileNotFoundError:
    print(f"Error: {file_path} not found.")
    exit()

# Prepare the features and labels
window_size = 5  # Number of previous characters used to predict the next one

X = []  # Features (text snippets)
y = []  # Labels (comma or capital letter)

# Function to check if a character is a comma or a capital letter
def is_comma_or_capital(char):
    if char == ',':
        return 1  # Comma
    elif char.isupper():
        return 2  # Capital letter
    return 0  # Neither comma nor capital letter

# Loop through the content and prepare data
for i in range(len(content) - window_size):
    # Create a feature vector for the previous 'window_size' characters
    window = content[i:i + window_size]
    
    # Create the target variable: what comes next (comma or capital letter)
    next_char = content[i + window_size]
    
    # Add the feature (text snippet)
    X.append(window)  # Use text snippets as features
    
    # Add the label for the next character (comma or capital letter)
    y.append(is_comma_or_capital(next_char))

# Convert to pandas DataFrame for easier manipulation
X = np.array(X)
y = np.array(y)

# Use TF-IDF to vectorize the text snippets (features)
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))  # Using character bigrams and trigrams
X_tfidf = vectorizer.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train a Logistic Regression classifier
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', labels=[1, 2])  # Comma (1) and Capital letter (2)
print(f"Accuracy: {accuracy:.4f}")


# Print classification report for detailed metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.9369
Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       208
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00        10

    accuracy                           0.94       222
   macro avg       0.31      0.33      0.32       222
weighted avg       0.88      0.94      0.91       222



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Folder is already open in VSCode, so use the relative path
folder_path = "C:/Users/u651278/Downloads/text files"  # Use relative path if folder is within the workspace

def read_text_files_from_folder(folder_path):
    texts = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):  # Only consider .txt files
            file_path = os.path.join(folder_path, filename)
            try:
                # Use 'ignore' or 'replace' to handle invalid characters
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                    texts.append(file.read())  # Add the content of each file
            except FileNotFoundError:
                print(f"Error: {file_path} not found.")
    return texts


# Read the content of all text files in the folder
texts = read_text_files_from_folder(folder_path)

# Prepare the features and labels
window_size = 5  # Number of previous characters used to predict the next one

X = []  # Features (text snippets)
y = []  # Labels (comma or capital letter)

# Function to check if a character is a comma or a capital letter
def is_comma_or_capital(char):
    if char == ',':
        return 1  # Comma
    elif char.isupper():
        return 2  # Capital letter
    return 0  # Neither comma nor capital letter

# Loop through the content of all files and prepare data
for text in texts:
    for i in range(len(text) - window_size):
        # Create a feature vector for the previous 'window_size' characters
        window = text[i:i + window_size]
        
        # Create the target variable: what comes next (comma or capital letter)
        next_char = text[i + window_size]
        
        # Add the feature (text snippet)
        X.append(window)  # Use text snippets as features
        
        # Add the label for the next character (comma or capital letter)
        y.append(is_comma_or_capital(next_char))

# Convert to numpy arrays for model compatibility
X = np.array(X)
y = np.array(y)

# Use TF-IDF to vectorize the text snippets (features)
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))  # Using character bigrams and trigrams
X_tfidf = vectorizer.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train a Logistic Regression classifier
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model using accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report for detailed metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.9476
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       578
           1       0.00      0.00      0.00         7
           2       1.00      0.04      0.07        26

    accuracy                           0.95       611
   macro avg       0.65      0.35      0.35       611
weighted avg       0.94      0.95      0.92       611



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


We used the same idea for processing languages in the system and the accuracy raises from .9369 to .9476 meaning we are heading in the correct direction to improve our model.

One fo the files has manny special characters, cleaning and post rerunning of the model will be done.

In [1]:
pip install clean-text


Defaulting to user installation because normal site-packages is not writeable
Collecting clean-text
  Downloading clean_text-0.6.0-py3-none-any.whl.metadata (6.6 kB)
Collecting emoji<2.0.0,>=1.0.0 (from clean-text)
  Downloading emoji-1.7.0.tar.gz (175 kB)
     ---------------------------------------- 0.0/175.4 kB ? eta -:--:--
     -- ------------------------------------- 10.2/175.4 kB ? eta -:--:--
     -------------------- ------------------ 92.2/175.4 kB 1.7 MB/s eta 0:00:01
     -------------------------------------- 175.4/175.4 kB 3.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting ftfy<7.0,>=6.0 (from clean-text)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading clean_text-0.6.0-py3-none-any.whl (11 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
   ---------------------------------------- 0.0/44.8 kB ? eta -:--:--
   ---------------------------------------- 44.8/44.8 kB ?



In [7]:
# Try reading the file with a different encoding (ISO-8859-1 or Windows-1252)
file_path = r"C:/Users/u651278/Downloads/text files/pc_erlkonig.txt"

# Attempt to open the file using 'ISO-8859-1' encoding
with open(file_path, 'r', encoding='ISO-8859-1') as file:
    text = file.read()

print(text)


Erlkönig
J.W. Goethe

Wer reitet so spät durch Nacht und Wind?
Es ist der Vater mit seinem Kind;
Er hat den Knaben wohl in dem Arm,
Er faßt ihn sicher, er hält ihn warm.
  	 
«Mein Sohn, was birgst du so bang dein Gesicht?» 
Siehst, Vater, du den Erlkönig nicht?
Den Erlenkönig mit Kron und Schweif? 
«Mein Sohn, es ist ein Nebelstreif.» 
  	 
«Du liebes Kind, komm, geh mit mir!
Gar schöne Spiele spiel' ich mit dir;
Manch bunte Blumen sind an dem Strand,
Meine Mutter hat manch gülden Gewand.»
  	 
Mein Vater, mein Vater, und hörest du nicht,
Was Erlenkönig mir leise verspricht? 
«Sei ruhig, bleibe ruhig, mein Kind;
In dürren Blättern säuselt der Wind.»
  	 
«Willst, feiner Knabe, du mit mir gehn?
Meine Töchter sollen dich warten schön;
Meine Töchter führen den nächtlichen Reihn,
Und wiegen und tanzen und singen dich ein.»
  	 
Mein Vater, mein Vater, und siehst du nicht dort
Erlkönigs Töchter am düstern Ort? 
«Mein Sohn, mein Sohn, ich seh es genau:
Es scheinen die alten Weiden so g

In [8]:
# Path to your text file
file_path = r"C:/Users/u651278/Downloads/text files/pc_erlkonig.txt"

# Read the file with the correct encoding (if necessary)
with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
    text = file.read()

# Remove the unwanted character "" from the text
cleaned_text = text.replace('', '')  # Replace the character with an empty string

# Print the cleaned text (optional)
print(cleaned_text)

# Optionally, save the cleaned content back to a new file
cleaned_file_path = "text files/erlkonig_cleaned.txt"
with open(cleaned_file_path, 'w', encoding='utf-8') as file:
    file.write(cleaned_text)


Erlknig
J.W. Goethe

Wer reitet so spt durch Nacht und Wind?
Es ist der Vater mit seinem Kind;
Er hat den Knaben wohl in dem Arm,
Er fat ihn sicher, er hlt ihn warm.
  	 
Mein Sohn, was birgst du so bang dein Gesicht? 
Siehst, Vater, du den Erlknig nicht?
Den Erlenknig mit Kron und Schweif? 
Mein Sohn, es ist ein Nebelstreif. 
  	 
Du liebes Kind, komm, geh mit mir!
Gar schne Spiele spiel' ich mit dir;
Manch bunte Blumen sind an dem Strand,
Meine Mutter hat manch glden Gewand.
  	 
Mein Vater, mein Vater, und hrest du nicht,
Was Erlenknig mir leise verspricht? 
Sei ruhig, bleibe ruhig, mein Kind;
In drren Blttern suselt der Wind.
  	 
Willst, feiner Knabe, du mit mir gehn?
Meine Tchter sollen dich warten schn;
Meine Tchter fhren den nchtlichen Reihn,
Und wiegen und tanzen und singen dich ein.
  	 
Mein Vater, mein Vater, und siehst du nicht dort
Erlknigs Tchter am dstern Ort? 
Mein Sohn, mein Sohn, ich seh es genau:
Es scheinen die alten Weiden so grau.
  	 
Ich liebe dich, mich reizt 

In [9]:
# Path to your text file
file_path = r"C:/Users/u651278/Downloads/text files/pc_erlkonig.txt"

# Read the file with the correct encoding (use 'utf-8' or the encoding that works)
with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
    text = file.read()

# Remove the unwanted character "" from the text
cleaned_text = text.replace('', '')  # Replace the character with an empty string

# Write the cleaned text back into the same file
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

print(f"Cleaned text has been written back to {file_path}")


Cleaned text has been written back to C:/Users/u651278/Downloads/text files/pc_erlkonig.txt


We succesfully corected the file with many typos and special characters. Now we are going to re run our prediction.

In [14]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import SGDClassifier  # Import SGDClassifier for gradient descent optimization

# Folder is already open in VSCode, so use the relative path
folder_path = r"C:/Users/u651278/Downloads/text files"  # Use relative path if folder is within the workspace

def read_text_files_from_folder(folder_path):
    texts = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):  # Only consider .txt files
            file_path = os.path.join(folder_path, filename)
            try:
                # Use 'ignore' or 'replace' to handle invalid characters
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                    texts.append(file.read())  # Add the content of each file
            except FileNotFoundError:
                print(f"Error: {file_path} not found.")
    return texts


# Read the content of all text files in the folder
texts = read_text_files_from_folder(folder_path)

# Prepare the features and labels
window_size = 5  # Number of previous characters used to predict the next one

X = []  # Features (text snippets)
y = []  # Labels (comma or capital letter)

# Function to check if a character is a comma or a capital letter
def is_comma_or_capital(char):
    if char == ',':
        return 1  # Comma
    elif char.isupper():
        return 2  # Capital letter
    return 0  # Neither comma nor capital letter

# Loop through the content of all files and prepare data
for text in texts:
    for i in range(len(text) - window_size):
        # Create a feature vector for the previous 'window_size' characters
        window = text[i:i + window_size]
        
        # Create the target variable: what comes next (comma or capital letter)
        next_char = text[i + window_size]
        
        # Add the feature (text snippet)
        X.append(window)  # Use text snippets as features
        
        # Add the label for the next character (comma or capital letter)
        y.append(is_comma_or_capital(next_char))

# Convert to numpy arrays for model compatibility
X = np.array(X)
y = np.array(y)

# Use TF-IDF to vectorize the text snippets (features)
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))  # Using character bigrams and trigrams
X_tfidf = vectorizer.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Use SGDClassifier for gradient descent optimization
# We can specify loss='log' for logistic regression loss function
model = SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model using accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report for detailed metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.9390
Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       809
           1       0.67      0.15      0.25        13
           2       1.00      0.13      0.23        47

    accuracy                           0.94       869
   macro avg       0.87      0.43      0.48       869
weighted avg       0.94      0.94      0.92       869



In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Use CountVectorizer instead of TF-IDF
vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3))  # Bigrams and trigrams
X_count = vectorizer.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_count, y, test_size=0.2, random_state=42)

# Use SGDClassifier for gradient descent optimization
model = SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model using accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report for detailed metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.9494
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       809
           1       0.70      0.54      0.61        13
           2       0.70      0.40      0.51        47

    accuracy                           0.95       869
   macro avg       0.79      0.64      0.70       869
weighted avg       0.94      0.95      0.94       869



In [16]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_iter': [1000, 2000],
    'eta0': [0.001, 0.01, 0.1],  # Learning rates
    'alpha': [0.0001, 0.001, 0.01],  # Regularization strength
}

grid_search = GridSearchCV(SGDClassifier(loss='log_loss'), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters and accuracy
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-validation Accuracy: {grid_search.best_score_:.4f}")

# Use the best estimator to predict and evaluate
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")


Best Parameters: {'alpha': 0.0001, 'eta0': 0.01, 'max_iter': 2000}
Best Cross-validation Accuracy: 0.9494
Test Accuracy: 0.9540


Lastly, implementing 2 different models after cleaning an training through our 4 .txt files and we aquired 2 higher accuracy results, .9494 and .9540. 
The second model showed the higher score for natural languge processing by implementing stochastic gradient descent models for optimization and the closest result to 1!

If you made it until here, thank you reading me :)