Compile Features

In [11]:
text = "昨天在公园里，一位年轻的母亲带着她的孩子在草地上野餐，因为天气很好，他们用餐布铺在地上，吃着美味的食物。"

Vocabulary Richness

In [12]:
#!pip install jieba

In [13]:
from collections import Counter
import numpy as np
import jieba

# 分词结果
words = jieba.lcut(text)

# 总词数
total_words = len(words)

# 独特词汇数
unique_words = set(words)
num_unique_words = len(unique_words)

# 计算Type-Token Ratio (TTR)
ttr = num_unique_words / total_words

# 计算Root Type-Token Ratio (RTTR)
rttr = num_unique_words / np.sqrt(total_words)

# 计算Hapax Legomena Ratio
hapax_legomena = [word for word, count in Counter(words).items() if count == 1]
hapax_legomena_ratio = len(hapax_legomena) / total_words

# 计算Shannon Entropy
frequencies = Counter(words).values()
word_probs = [freq / total_words for freq in frequencies]
shannon_entropy = -sum(p * np.log2(p) for p in word_probs)

print("总词数:", total_words)
print("独特词汇数:", num_unique_words)
print("Type-Token Ratio (TTR):", ttr)
print("Root Type-Token Ratio (RTTR):", rttr)
print("Hapax Legomena Ratio:", hapax_legomena_ratio)
print("Shannon Entropy:", shannon_entropy)

# Normalize the metrics
normalized_ttr = ttr
normalized_rttr = rttr / np.sqrt(total_words)
normalized_hapax = hapax_legomena_ratio

# Normalize Shannon Entropy
max_entropy = np.log2(num_unique_words) if num_unique_words > 0 else 1
normalized_entropy = shannon_entropy / max_entropy if max_entropy > 0 else 0

# Combine normalized metrics with equal weights
vocabulary_richness_score = (
    normalized_ttr + 
    normalized_rttr + 
    normalized_hapax + 
    normalized_entropy
) / 4

# Convert to percentage
vocabulary_richness_percentage = vocabulary_richness_score * 100

# Print final score
print(f"Vocabulary Richness Score: {vocabulary_richness_score:.4f}")
print()
print(f"Vocabulary Richness Percentage: {vocabulary_richness_percentage:.2f}")


总词数: 36
独特词汇数: 28
Type-Token Ratio (TTR): 0.7777777777777778
Root Type-Token Ratio (RTTR): 4.666666666666667
Hapax Legomena Ratio: 0.6666666666666666
Shannon Entropy: 4.627986806877673
Vocabulary Richness Score: 0.7962

Vocabulary Richness Percentage: 79.62


5W1H

In [14]:
from gpt4all import GPT4All

# Initialize the model
model = GPT4All("mistral-7b-instruct-v0.1.Q4_0.gguf")

# Generate 5W1H response
prompt = (
    text +
    "based on the text, identify what are the 5W1H in Chinese"
)

# Generate response with specific parameters for consistency
#temp=0 reduce randomness
output = model.generate(prompt, temp=0)

# Print the output
print(output)

.

Who: 年轻的母亲和她的孩子
What: 在草地上野餐
Where: 公园里
When: 昨天
Why: 因为天气很好
How: 他们用餐布铺在地上，吃着美味的食物。


In [15]:
def parse_input(input_str):
    # Split the input string into lines
    lines = input_str.strip().split('\n')
    
    # Create a dictionary from the lines
    details = {}
    for line in lines:
        if ': ' in line:
            key, value = line.split(': ', 1)  # Split only on the first occurrence of ': '
            details[key] = value
        else:
            print(f"Skipping malformed line: {line}")
    
    return details

def evaluate_details(details):
    # Ensure details is a dictionary
    if not isinstance(details, dict):
        raise TypeError("Details should be a dictionary.")
    
    # Assign 1 for known, 0 for unknown ("不明")
    evaluation = {
        "Who": 1 if details.get("Who") not in ["不明", "不知道"] else 0,
        "What": 1 if details.get("What") not in ["不明", "不知道"] else 0,
        "Where": 1 if details.get("Where") not in ["不明", "不知道"] else 0,
        "When": 1 if details.get("When") not in ["不明", "不知道"] else 0,
        "Why": 1 if details.get("Why") not in ["不明", "不知道"] else 0,
        "How": 1 if details.get("How") not in ["不明", "不知道"] else 0
    }
    
    # Calculate the total score
    total_score = sum(evaluation.values())
    
    return evaluation, total_score

# Provided input string
input_str = output

# Parse the input string into a dictionary
details = parse_input(input_str)

# Evaluate the parsed details
evaluation, total_score = evaluate_details(details)

whscore = (total_score/6)*100
print(f"Evaluation: {evaluation}")
print(f"Total Score: {total_score}")
print()
print(f"5W1H Score: {whscore}")

Skipping malformed line: .
Skipping malformed line: 
Evaluation: {'Who': 1, 'What': 1, 'Where': 1, 'When': 1, 'Why': 1, 'How': 1}
Total Score: 6

5W1H Score: 100.0


Content Relevance

In [16]:
# Install necessary packages
#!pip install jieba transformers sentence-transformers

In [17]:
import jieba
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Initialize jieba for Chinese segmentation
def segment(text):
    return ' '.join(jieba.cut(text))

# Sample Chinese sentences
# let sentence1 be transcribed text from student and sentence2 be reference answer

reference_text = """图片中 是 家里 的 饭厅。现在 应该 是 早餐时间。爸爸、妈妈、姐姐 和 弟弟 一家人 正在 用 早餐。
爸爸 的 早餐 是 粥/面汤，妈妈 的 是 面包、因为 他 边 倒 边 看 平板电脑。妈妈 看了 很生气。  
姐姐 帮 弟弟 抹/擦掉 倒/溢在 桌子上 的 牛奶/鲜奶/果汁。爸爸 看到了， 
竖起 大拇指 称赞/夸奖 姐姐"""

sentence2 = """
图片中 是 家里 的 饭厅。现在 应该 是 早餐时间。爸爸、妈妈、姐姐 和 弟弟 一家人 正在 用 早餐。
爸爸 的 早餐 是 粥/面汤，妈妈 的 是 面包、煎蛋 和 咖啡/茶，姐姐 和 弟弟 的 是 麦片 和 牛奶/鲜奶/果汁。 
弟弟 倒 牛奶/果汁时，溢/倒  出来 了，因为 他 边 倒 边 看 平板电脑。妈妈 看了 很生气。  
姐姐 帮 弟弟 抹/擦掉 倒/溢在 桌子上 的 牛奶/鲜奶/果汁。爸爸 看到了， 
竖起 大拇指 称赞/夸奖 姐姐。有一次，我弟弟  吃东西时 不小心 打翻 了 食物， 
我 有 帮他 清理/抹/擦 桌子。  我 认为 弟弟、妹妹 不小心 做错事时， 作为 哥哥、姐姐的，应该 帮 他们。
"""

# Segment the sentences
segmented_sentence1 = segment(text)
segmented_sentence2 = segment(reference_text)

# Load a pre-trained Chinese model from Sentence Transformers
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Encode the segmented sentences to get their embeddings
embedding1 = model.encode(segmented_sentence1)
embedding2 = model.encode(segmented_sentence2)

# Calculate cosine similarity between the two embeddings
similarity = cosine_similarity([embedding1], [embedding2])[0][0]
similarity_score = similarity * 100
print(f"Semantic Similarity: {similarity:.4f}")
print()
print(f"Semantic Similarity Score: {similarity_score:.2f}")



Semantic Similarity: 0.4107

Semantic Similarity Score: 41.07


Grammar

In [18]:
text1 = "昨天在公园里，一位年轻孩子在草地上野餐，因为天气很好，他们餐布铺在地上，吃着美味的食物。"

In [19]:
model = GPT4All("mistral-7b-instruct-v0.1.Q4_0.gguf")
grammar_prompt = (
    text1 + "Based on the sentence, evaluate the grammar by giving a score out of 10"
)

# Generate response with specific parameters for consistency
#temp=0 reduce randomness
grammar_output = model.generate(grammar_prompt, temp=0)
print(grammar_output)

.
A: 9


In [20]:
import re
score_match = re.search(r"\d+", grammar_output)
if score_match:
    score = int(score_match.group())
    print(f"Score: {score}")
else:
    print("No score found in the output.")

Score: 9


Fluency

In [21]:
#provided text
text_file = "data/reading-passage.txt"
recording_file = "data/recordings/chinese/chinese_b2/0dc73844-8d4f-2b00-75f6-c6bc3d267377Text_002_Line_1.wav"
model = '0'
lang = 'chinese'

In [22]:
from joblib import load
import modules.prepare_data as prepare_data
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import keras
import librosa

with open(text_file, 'r') as file:
        provided_text = file.read()

print(provided_text)

#prepare the new audio and extract features
audio_array, sampling_rate = librosa.load(recording_file, sr=None)
audio_data = {'array': audio_array, 'sampling_rate': sampling_rate}

print(audio_data['array'])
print(audio_data['sampling_rate'])

data = prepare_data.load_audio(lang,provided_text,audio_data)

#use previous scaler to scale the new prediction to fit into the model
data = pd.DataFrame([data])
data['mfcc'] = data['mfcc'].apply(lambda x: x.flatten())
mfcc_length = data['mfcc'].apply(len).max()
data['mfcc'] = data['mfcc'].apply(lambda x: np.pad(x, (0, mfcc_length - len(x)), mode='constant'))

# Convert mfcc column into multiple columns
mfcc_features = np.stack(data['mfcc'].values)
df_mfcc = pd.DataFrame(mfcc_features, index=data.index)
X = pd.concat([data[['speech_rate', 'pause_rate', 'pronunciation_accuracy']], df_mfcc], axis=1)
X.columns = X.columns.astype(str)

#Load scalar
scaler = StandardScaler()
X_train = pd.read_pickle("data/pickles/"+lang+"_X_train.pkl")
scaler.fit(X_train)

#Normalise new data
new_data_scaled = scaler.transform(X)

# Load the model from the file
# 0 for XGBoost, 1 for Random Forest
print(model)
if model == '0':
    loaded_model = keras.models.load_model('models/model_'+lang+'.keras')
elif model == '1':
    loaded_model = load('models/random_forest_model.joblib')
else:
    exit

y_pred = loaded_model.predict(new_data_scaled)
y_pred_class = np.argmax(y_pred, axis=1)
fluency_score = int((y_pred_class[0]/4)*100)
print("Fluency Score: " + str(fluency_score))

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.

树枝上有一个小鸟窝


  audio_array, sampling_rate = librosa.load(recording_file, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: 'data/recordings/chinese/chinese_b2/0dc73844-8d4f-2b00-75f6-c6bc3d267377Text_002_Line_1.wav'

Compiled

In [None]:
print(f"Vocabulary Richness: {vocabulary_richness_percentage:.2f}")
print(f"5W1H: {whscore}")
print(f"Content Relevance: {similarity_score:.2f}")
print(f"Grammar: {score}")
print(f"Fluency: {fluency_score}")

Vocabulary Richness: 79.62
5W1H: 100.0


NameError: name 'similarity_score' is not defined