**This is part 2/2 for the notebooks.**

This section is for the complete code that will be used in the final product. For additional information, please visit 1_data_training.ipynb

This model was derived from all exploration of EDA and models, along with testing different approaches (including BoW, linear regression, and Naive Bayes). The approach of Linear Regression was deemed best, and used in the following code. TF-IDF (Term Frequency-Inverse Document Frequency) was also used to figure out how many times a word was used within a document, and weighted appropriately. This assisted with making a higher accuracy rating.

In [8]:
import csv
import re
import os

# These texts in the CEFRTexts folder are supplemented with additional texts.
def extract_metadata(text):
    # Define patterns for each metadata field. This is altered after the original attempt.
    patterns = {
        "Overall CEFR rating": r"Overall CEFR rating: (.+)",
        "Learner text": r"(?i)Learner text:\s+([\s\S]+?)$"  # Match all text after "Learner text:"
    }
    
    extracted_data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.MULTILINE)
        extracted_data[key] = match.group(1).strip() if match else ""
    return extracted_data

def process_folder(input_folder, output_file):
    # List all .txt files in the folder
    files = [f for f in os.listdir(input_folder) if f.endswith('.txt')]
    if not files:
        print("No .txt files found in the specified folder.")
        return

    # Process each file and collect data
    all_data = []
    for file in files:
        file_path = os.path.join(input_folder, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        metadata = extract_metadata(text)
        metadata['Filename'] = file  # Add filename for reference
        all_data.append(metadata)
    
    # Write all data to a single CSV
    with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile:
        fieldnames = list(all_data[0].keys())  # Use keys from the first entry
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_data)

    print(f"Data from {len(files)} files extracted and written to {output_file}")

def main():
    input_folder = "CEFRTexts"  # Containing .txt files
    output_file = "CEFRRaw.csv"     # Desired output CSV file

    process_folder(input_folder, output_file)

if __name__ == "__main__":
    main()


Data from 1323 files extracted and written to CEFRRaw.csv


In [9]:
import pandas as pd

df = pd.read_csv('CEFRRaw.csv')
df.head()

df.groupby('Overall CEFR rating').size()

Overall CEFR rating
A1    207
A2    306
B1    331
B2    293
C1     82
C2    104
dtype: int64

In [14]:
# Check for missing values in the entire dataframe
print(df.isnull().sum())

# Focus on the 'Learner text' column
print(df['Learner text'].isnull().sum())
print(df['Overall CEFR rating'].isnull().sum())

Overall CEFR rating    0
Learner text           0
Filename               0
cleaned_text           0
dtype: int64
0
0


In [11]:
# Check for empty or whitespace-only strings
empty_text = df[df['Learner text'].str.strip() == '']
print("Empty text rows:")
print(empty_text)

# Check for unusually short strings (e.g., less than 3 characters)
short_text = df[df['Learner text'].str.len() < 3]
print("Short text rows:")
print(short_text)

Empty text rows:
Empty DataFrame
Columns: [Overall CEFR rating, Learner text, Filename]
Index: []
Short text rows:
Empty DataFrame
Columns: [Overall CEFR rating, Learner text, Filename]
Index: []


In [12]:
# Reload the file with a specific encoding
df = pd.read_csv('CEFRRaw.csv', encoding='utf-8') 

Data is now checked. It will now be cleaned.

In [13]:
import spacy
import re

# Load German model
nlp = spacy.load("de_core_news_sm")

# Load DataFrame
df = pd.read_csv('CEFRRaw.csv')

# Function for text cleaning
def clean_text(text, apply_stemming=False, apply_lemmatization=True):
    if not isinstance(text, str):
        return text 

    # Lowercase the text
    text = text.lower()

    # Remove special characters, numbers, and extra spaces
    text = re.sub(r'[^a-zäöüß\s]', '', text)  # Retain only letters and German-specific characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace

    # Process the text using SpaCy
    doc = nlp(text)

    # Tokenize, remove stopwords, and optionally lemmatize -- these come within SpaCy
    tokens = []
    for token in doc:
        if not token.is_stop:  # Remove stopwords
            if apply_lemmatization:
                tokens.append(token.lemma_)  # Use lemmatized form
            else:
                tokens.append(token.text)  # Use raw token text

    # Join tokens back into a single string
    return ' '.join(tokens)

# Add a new column for cleaned text
df['cleaned_text'] = df['Learner text'].apply(lambda x: clean_text(x, apply_stemming=False, apply_lemmatization=True))

# Preview the original and cleaned text
print(df[['Learner text', 'cleaned_text']].head())


                                        Learner text  \
0  Das ist ein Beispiel für einen C1-Satz, der ei...   
1  Ich begrüße alle, der sich für das Thema „Länd...   
2  Sehr geehrt Frau Schmidt, ich bin ein paar Tag...   
3                                        Liebe Julia   
4  Meine Meinung nach ist sinnlos, auch in Auslan...   

                                        cleaned_text  
0  Csatz spezifisch Thema behandeln Csatz spezifi...  
1  begrüßen Thema Land Tradition interessieren Me...  
2                      ehren Frau Schmidt paar Hause  
3                                        Liebe Julia  
4  Meinung sinnlos Ausland Tradition heimatland f...  


In [18]:
# TF-IDF (Term Frequency-Inverse Document Frequency) will be used for this model. This was explored in the previous notebook.

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer
# ** This code was generated using ChatGPT ** 
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,  # Limit to 5000 most important features
    ngram_range=(1, 2),  # Include unigrams and bigrams
    stop_words=None      # Stopword removal already handled in preprocessing
)

# Transform cleaned text to TF-IDF representation
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_text'])

# Check the shape of the resulting TF-IDF matrix
print(f"TF-IDF Shape: {X_tfidf.shape}")  # Rows: samples, Columns: features


TF-IDF Shape: (1323, 5000)


In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['Overall CEFR rating'], test_size=0.2, random_state=42)

print(f"Train Shape: {X_train.shape}, Test Shape: {X_test.shape}")

Train Shape: (1058, 5000), Test Shape: (265, 5000)


In [20]:
# Attempting Logistic Regression with this!

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize Logistic Regression
log_reg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)

# Train the model
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred_log_reg = log_reg.predict(X_test)

# Evaluate the model
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("Classification Report:\n", classification_report(y_test, y_pred_log_reg))

Logistic Regression Accuracy: 0.6377358490566037
Classification Report:
               precision    recall  f1-score   support

          A1       0.93      0.70      0.80        37
          A2       0.57      0.44      0.50        54
          B1       0.53      0.55      0.54        71
          B2       0.56      0.82      0.67        61
          C1       0.75      0.45      0.56        20
          C2       1.00      0.95      0.98        22

    accuracy                           0.64       265
   macro avg       0.72      0.65      0.67       265
weighted avg       0.66      0.64      0.64       265



In [21]:
# Saving the pretrained model so it can be accessed for streamlit_app.py
# PLEASE NOTE: if adjusting input data this entire notebook will need to be run again!

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,  # Limit to 5000 most important features
    ngram_range=(1, 2),  # Include unigrams and bigrams
    stop_words=None      # Stopword removal already handled in preprocessing
)

# Transform cleaned text to TF-IDF representation
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_text'])

# Save the trained TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")
print("TF-IDF vectorizer saved as tfidf_vectorizer.pkl")

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['Overall CEFR rating'], test_size=0.2, random_state=42)

# Train Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
log_reg.fit(X_train, y_train)

# Save the trained logistic regression model
joblib.dump(log_reg, "logistic_regression_model.pkl")
print("Logistic Regression model saved as logistic_regression_model.pkl")

# Evaluate the model
y_pred_log_reg = log_reg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("Classification Report:\n", classification_report(y_test, y_pred_log_reg))


TF-IDF vectorizer saved as tfidf_vectorizer.pkl
Logistic Regression model saved as logistic_regression_model.pkl
Logistic Regression Accuracy: 0.6377358490566037
Classification Report:
               precision    recall  f1-score   support

          A1       0.93      0.70      0.80        37
          A2       0.57      0.44      0.50        54
          B1       0.53      0.55      0.54        71
          B2       0.56      0.82      0.67        61
          C1       0.75      0.45      0.56        20
          C2       1.00      0.95      0.98        22

    accuracy                           0.64       265
   macro avg       0.72      0.65      0.67       265
weighted avg       0.66      0.64      0.64       265

