This notebook is used to analyze the text features from the t-scan model.
The output of the anaysis is a machine learning model that predicts the AVI score of a text based on the text features.

In [1649]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from scipy import stats
from sklearn.linear_model import LinearRegression


In [1650]:
# Load the data into a pandas dataframe
data = pd.read_csv('14-06-2024.csv', index_col=False)
data.head()

Unnamed: 0,Inputfile,Par_per_doc,Zin_per_doc,Word_per_doc,Alpino_status,wrd_freq_log_zn_corr,wrd_freq_zn_log,Conc_nw_ruim_p,Conc_nw_strikt_p,Alg_nw_d,...,Log_prob_bwd_inhwrd_zn,Entropie_bwd,Entropie_bwd_norm,Perplexiteit_bwd,Perplexiteit_bwd_norm,Eigen_classificatie,LiNT_score1,LiNT_niveau1,LiNT_score2,LiNT_niveau2
0,input/AVI_4_vijfer.txt,1,8,105,0,4.85383,4.76046,0.8,0.2,0.0,...,,,,,,,26.1652,1,27.974,1
1,input/AVI_4_jongen.txt,1,4,107,0,4.87906,4.8056,0.894737,0.789474,0.0,...,,,,,,,33.3231,1,32.6028,1
2,input/AVI_8_frans.txt,1,13,138,0,4.62739,4.61028,0.764706,0.558824,0.0,...,,,,,,,35.1191,2,33.8752,1
3,input/AVI_7_cijfer.txt,1,10,93,0,4.16929,4.04726,0.84,0.72,0.0,...,,,,,,,43.4656,2,41.3559,2
4,input/AVI_1_beer.txt,1,9,72,0,5.22533,5.22533,0.692308,0.692308,0.0,...,,,,,,,16.1034,1,12.5536,1


In [1651]:
# Place the AVI score in a separate variable
# Currently it is in the title column and we need to extract it
first_column = data['Inputfile']
# Extract AVI scores from strings
avi_scores = [s.split('_')[1] for s in first_column]

# Add the AVI scores to the dataframe
data.insert(1, 'AVI', avi_scores)
# Make the avi the second column

In [1652]:
# Save the data to a new csv file
data.to_csv('results_with_avi.csv', index=False)

In [1653]:
# Define test and train data
X = data.drop(columns=['AVI', 'Inputfile'])
y = data['AVI']
y = y.astype(int)

In [1654]:
# Keep these columns and drop the rest
# columns_to_keep = ['Wrd_per_zin','Pers_vnw_d', "AL_max", "Let_per_wrd_corr"]
# X = X[columns_to_keep]

In [1655]:
X.head

<bound method NDFrame.head of      Par_per_doc  Zin_per_doc  Word_per_doc  Alpino_status  \
0              1            8           105              0   
1              1            4           107              0   
2              1           13           138              0   
3              1           10            93              0   
4              1            9            72              0   
..           ...          ...           ...            ...   
182            1           10            84              0   
183            1            5            23              0   
184            1           28           152              0   
185            1           11            95              0   
186            1           13           133              0   

     wrd_freq_log_zn_corr  wrd_freq_zn_log  Conc_nw_ruim_p  Conc_nw_strikt_p  \
0                 4.85383          4.76046        0.800000          0.200000   
1                 4.87906          4.80560        0.894737       

In [1656]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [1657]:
model = tree.DecisionTreeClassifier()
model.fit(X_train, y_train)

In [1658]:

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.2894736842105263
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         3
           3       0.28      0.62      0.38         8
           4       0.00      0.00      0.00         1
           5       0.33      0.33      0.33         3
           6       0.50      0.20      0.29         5
           7       0.50      0.40      0.44         5
           8       0.50      0.20      0.29         5
           9       0.33      1.00      0.50         1

    accuracy                           0.29        38
   macro avg       0.27      0.31      0.25        38
weighted avg       0.29      0.29      0.25        38



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [1659]:
feature_importances = model.feature_importances_
feature_names = X.columns

feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

             Feature  Importance
106     Dzin_per_wrd    0.167384
417             Ww_d    0.045447
128            Nom_d    0.042261
333  Gelabeld_bvnw_p    0.034303
331      Subj_bvnw_d    0.022371
..               ...         ...
250    Concr_ov_nw_d         NaN
290     Kleur_bvnw_p         NaN
338       Abstr_ww_d         NaN
397        Vd_vrij_d         NaN
412           Lidw_d         NaN

[472 rows x 2 columns]


In [1660]:

# Step 1: Fit a Linear Regression model
model = LinearRegression()
X.dropna(axis=1, inplace=True)
model.fit(X, y)

# Step 2: Predict continuous values
y_pred_continuous = model.predict(X)

# Map continuous predictions to nearest integer
y_pred_class = np.round(y_pred_continuous).astype(int)

# Ensure predictions are within the range of 1 to 9
y_pred_class = np.clip(y_pred_class, 1, 9)


# Generate classification report
print("Classification Report:")
print(classification_report(y, y_pred_class))

Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        22
           2       1.00      1.00      1.00        11
           3       1.00      1.00      1.00        48
           4       1.00      1.00      1.00        16
           5       1.00      0.95      0.97        20
           6       1.00      1.00      1.00        19
           7       0.91      1.00      0.95        21
           8       1.00      0.95      0.98        22
           9       0.88      0.88      0.88         8

    accuracy                           0.98       187
   macro avg       0.98      0.98      0.98       187
weighted avg       0.98      0.98      0.98       187



In [1662]:
# Step 1: Fit a Linear Regression model
model = LinearRegression()
X.dropna(axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

# Step 2: Predict continuous values
y_pred_continuous = model.predict(X_test)

# Map continuous predictions to nearest integer
y_pred_class = np.round(y_pred_continuous).astype(int)

# Ensure predictions are within the range of 1 to 9
y_pred_class = np.clip(y_pred_class, 1, 9)


# Generate classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_class))


Classification Report:
              precision    recall  f1-score   support

           1       0.33      0.57      0.42         7
           2       0.50      0.33      0.40         3
           3       0.50      0.12      0.20         8
           4       0.00      0.00      0.00         1
           5       0.33      0.33      0.33         3
           6       0.00      0.00      0.00         5
           7       0.33      0.20      0.25         5
           8       0.00      0.00      0.00         5
           9       0.08      1.00      0.14         1

    accuracy                           0.24        38
   macro avg       0.23      0.28      0.19        38
weighted avg       0.28      0.24      0.21        38



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
