This is the background work and documentation of the "Language Recognition" project for CS3120. This serves mainly as an explanation as well as a document for me to look back on.
Special thanks to the following for collecting datasets used within this project.
- https://github.com/adbar/German-NLP/blob/master/README.md
- https://www.merlin-platform.eu/
This project looks at words in the German language and will give an estimate on fluency.

# Part 1: Dataset Preparation.
There are multiple datasets that will be used in this project. They are located in /TrainingData. They will each need to be extracted in order to get the appropriate information. 

In [1]:
import csv
import re
import os

def extract_metadata(text):
    # Define patterns for each metadata field
    patterns = {
        "Test language": r"Test language: (.+)",
        "CEFR level of test": r"CEFR level of test: (.+)",
        "Mother tongue": r"Mother tongue: (.+)",
        "Overall CEFR rating": r"Overall CEFR rating: (.+)",
        "Grammatical accuracy": r"Grammatical accuracy: (.+)",
        "Orthography": r"Orthography: (.+)",
        "Vocabulary range": r"Vocabulary range: (.+)",
        "Vocabulary control": r"Vocabulary control: (.+)",
        "Coherence/Cohesion": r"Coherence/Cohesion: (.+)",
        "Sociolinguistic appropriateness": r"Sociolinguistic appropriateness: (.+)",
        "Learner text": r"Learner text:\s+([\s\S]+?)$"  # Match all text after "Learner text:"
    }
    
    extracted_data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.MULTILINE)
        extracted_data[key] = match.group(1).strip() if match else ""
    return extracted_data

def process_folder(input_folder, output_file):
    # List all .txt files in the folder
    files = [f for f in os.listdir(input_folder) if f.endswith('.txt')]
    if not files:
        print("No .txt files found in the specified folder.")
        return

    # Process each file and collect data
    all_data = []
    for file in files:
        file_path = os.path.join(input_folder, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        metadata = extract_metadata(text)
        metadata['Filename'] = file  # Add filename for reference
        all_data.append(metadata)
    
    # Write all data to a single CSV
    with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile:
        fieldnames = list(all_data[0].keys())  # Use keys from the first entry
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_data)

    print(f"Data from {len(files)} files extracted and written to {output_file}")

def main():
    input_folder = "TrainingData"  # Containing .txt files
    output_file = "trainingParsed.csv"     # Desired output CSV file

    process_folder(input_folder, output_file)

if __name__ == "__main__":
    main()


Data from 1033 files extracted and written to trainingParsed.csv


In [2]:
import pandas as pd

df = pd.read_csv('trainingParsed.csv')
df.head()

Unnamed: 0,Test language,CEFR level of test,Mother tongue,Overall CEFR rating,Grammatical accuracy,Orthography,Vocabulary range,Vocabulary control,Coherence/Cohesion,Sociolinguistic appropriateness,Learner text,Filename
0,German,C1,Russian,B2,B2,C1,C1,C1,C1,B2,"Ich begrüße alle, der sich für das Thema „Länd...",1031_0001950.txt
1,German,A2,Russian,A2,B1,B2,B1,B1,B2,B2,"Sehr geehrt Frau Schmidt, ich bin ein paar Tag...",1091_0000062.txt
2,German,A2,not reported,B1,B1,B2,B1,B1,B1,B1,Liebe Julia,1091_0000002.txt
3,German,C1,Spanish,B2,B1,B2,B2,B2,B2,B2,"Meine Meinung nach ist sinnlos, auch in Auslan...",1031_0002083.txt
4,German,B1,Portuguese,A2,A2,B1,B1,A2,A2,A2,"Stadt X, 24.02.2012",1061_0120323.txt


In [3]:
df.describe()

Unnamed: 0,Test language,CEFR level of test,Mother tongue,Overall CEFR rating,Grammatical accuracy,Orthography,Vocabulary range,Vocabulary control,Coherence/Cohesion,Sociolinguistic appropriateness,Learner text,Filename
count,1033,1033,1033,1033,1033,1033,1033,1033,1033,1033,1033,1033
unique,1,5,15,6,7,7,7,7,7,7,744,1033
top,German,B1,not reported,B1,B1,B2,B1,B1,B1,B2,"Lieber Jens,",1091_0000050.txt
freq,1033,210,275,331,352,338,349,321,356,347,20,1


# 1.1 Reading and understanding the data

In [5]:
import seaborn as sns

sns.scatterplot(data=df, x='CEFR_level_of_test', y='Overall_CEFR_rating')

ModuleNotFoundError: No module named 'seaborn'