In [7]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

! pip install kaggle
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json
! kaggle competitions download -c detecting-french-texts-difficulty-level-2023

from zipfile import ZipFile
with ZipFile('detecting-french-texts-difficulty-level-2023.zip','r') as zip:
  zip.extractall(path="")

import pandas as pd
import numpy as np

Mounted at /content/drive
Downloading detecting-french-texts-difficulty-level-2023.zip to /content
100% 303k/303k [00:00<00:00, 891kB/s]
100% 303k/303k [00:00<00:00, 890kB/s]


In [8]:
df = pd.read_csv('training_data.csv', index_col = 'id')
df.head()

Unnamed: 0_level_0,sentence,difficulty
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,Le test de niveau en français est sur le site ...,A1
3,Est-ce que ton mari est aussi de Boston?,A1
4,"Dans les écoles de commerce, dans les couloirs...",B1


In [9]:
df_pred = pd.read_csv('unlabelled_test_data.csv', index_col = 'id')
df_pred.head()

Unnamed: 0_level_0,sentence
id,Unnamed: 1_level_1
0,Nous dûmes nous excuser des propos que nous eû...
1,Vous ne pouvez pas savoir le plaisir que j'ai ...
2,"Et, paradoxalement, boire froid n'est pas la b..."
3,"Ce n'est pas étonnant, car c'est une saison my..."
4,"Le corps de Golo lui-même, d'une essence aussi..."


In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline


from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


### Calcul de difficulté du texte

In [11]:
# Is the character a vowel or not
def isVowel(ch):
    return (ch == 'a' or ch == 'e' or
            ch == 'i' or ch == 'o' or
            ch == 'u')

# Function to calculate difficulty
def calcDiff(str):
    str = str.lower()
    count_vowels = 0
    count_conso = 0
    consec_conso = 0
    hard_words = 0
    easy_words = 0

    # Start traversing the string
    for i in range(0, len(str)):

        # Check if current character is vowel or consonant
        if(str[i]!= " " and isVowel(str[i])):
            # Increment
            count_vowels += 1
            consec_conso = 0
        elif(str[i] != " "):
            count_conso += 1
            consec_conso += 1
        if(consec_conso == 4): #hard word
            hard_words += 1

            while(i < len(str) and str[i] != " "):
                i += 1
            count_conso = 0
            count_vowels = 0
            consec_conso = 0

        elif(i < len(str) and (str[i] == ' ' or
                          i == len(str) - 1)):
            if(count_conso > count_vowels):
                hard_words += 1
            else:
                easy_words += 1
            count_conso = 0
            count_vowels = 0
            consec_conso = 0

    return (5 * hard_words + 3 * easy_words) #basic function to calculate the difficuly



In [16]:
df['calcul'] = df['sentence'].apply(calcDiff)

display(df)

Unnamed: 0_level_0,sentence,difficulty,calcul
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1,165
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,51
2,Le test de niveau en français est sur le site ...,A1,54
3,Est-ce que ton mari est aussi de Boston?,A1,35
4,"Dans les écoles de commerce, dans les couloirs...",B1,151
...,...,...,...
4795,"C'est pourquoi, il décida de remplacer les hab...",B2,118
4796,Il avait une de ces pâleurs splendides qui don...,C1,85
4797,"Et le premier samedi de chaque mois, venez ren...",A2,58
4798,Les coûts liés à la journalisation n'étant pas...,C2,170


In [17]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Calcul médiane
median_by_difficulty = df.groupby('difficulty')['calcul'].median()
print(median_by_difficulty)

# Calcul q1
q1_by_difficulty = df.groupby('difficulty')['calcul'].quantile(0.25)
print(q1_by_difficulty)

# Calcul q3
q3_by_difficulty = df.groupby('difficulty')['calcul'].quantile(0.75)
print(q3_by_difficulty)

difficulty
A1     26.0
A2     42.0
B1     56.0
B2     75.0
C1    102.0
C2    111.0
Name: calcul, dtype: float64
difficulty
A1    18.00
A2    31.00
B1    42.00
B2    52.75
C1    68.00
C2    71.00
Name: calcul, dtype: float64
difficulty
A1     38.00
A2     61.00
B1     79.00
B2    110.25
C1    140.75
C2    159.00
Name: calcul, dtype: float64


### Essai sur les données

In [26]:
df['calcul'] = df['sentence'].apply(calcDiff)

df['pred'] = df.apply(lambda row: 'A1' if row['calcul'] < 26 else ('A2' if 26 <= row['calcul'] < 42 else ('B1' if 42 <= row['calcul'] < 56 else ('B2' if 56 <= row['calcul'] < 75 else ('C1' if 75 <= row['calcul'] < 111 else 'C2')))), axis=1)

actual_labels = df['difficulty']
predicted_labels = df['pred']

# calcul of caracteristics
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')
accuracy = accuracy_score(actual_labels, predicted_labels)

# Print the results
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'Accuracy: {accuracy:.4f}')

Precision: 0.3547
Recall: 0.3435
F1 Score: 0.3454
Accuracy: 0.3435


In [27]:
X = df_pred['sentence']
df_pred['calcul'] = df_pred['sentence'].apply(calcDiff)

df_pred['difficulty'] = ''
df_pred['difficulty'] = df_pred.apply(lambda row: 'A1' if row['calcul'] < 26 else ('A2' if 26 <= row['calcul'] < 42 else ('B1' if 42 <= row['calcul'] < 56 else ('B2' if 56 <= row['calcul'] < 75 else ('C1' if 75 <= row['calcul'] < 111 else 'C2')))), axis=1)


to_delete = ['sentence', 'calcul']
rendu = df_pred.drop(to_delete, axis=1)
rendu.to_csv('submission.csv')
rendu

Unnamed: 0_level_0,difficulty
id,Unnamed: 1_level_1
0,B1
1,B1
2,A2
3,B1
4,C2
...,...
1195,C1
1196,B1
1197,C2
1198,A2


In [None]:
! kaggle competitions submit -c detecting-french-texts-difficulty-level-2023 -f submission.csv -m "UNIL_Rolex"

100% 8.30k/8.30k [00:00<00:00, 10.1kB/s]
Successfully submitted to Detecting the difficulty level of French texts