# Guidelines

In [None]:
# Read in the competition data via the Kaggle API.

# Mount Google Drive under /content/drive; a later cell copies the Kaggle API
# token (kaggle.json) from /content/drive/MyDrive. This import only exists on Google Colab.
from google.colab import drive
# NOTE(review): force_remount=True presumably re-mounts even when the drive is already mounted — confirm against Colab docs.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# install Kaggle
! pip install kaggle



In [None]:
!mkdir ~/.kaggle

In [None]:

!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# Download the competition archive (a .zip file) with the Kaggle CLI;
# it is saved into the current working directory.
! kaggle competitions download -c detecting-french-texts-difficulty-level-2023

Downloading detecting-french-texts-difficulty-level-2023.zip to /content
  0% 0.00/303k [00:00<?, ?B/s]
100% 303k/303k [00:00<00:00, 84.0MB/s]


In [None]:


from zipfile import ZipFile

# Unpack the downloaded competition archive next to the notebook.
# The handle is called `archive` rather than `zip`: in a notebook the `with`
# target becomes a global, so naming it `zip` would shadow the built-in zip()
# for every cell executed afterwards.
with ZipFile('detecting-french-texts-difficulty-level-2023.zip', 'r') as archive:
    archive.extractall(path="")

In [None]:
import pandas as pd
import numpy as np

# Load the labelled training set, using the `id` column as the row index.
df = pd.read_csv("training_data.csv", index_col="id")

In [None]:
# Preview the first rows of the training data (columns: sentence, difficulty).
df.head()

Unnamed: 0_level_0,sentence,difficulty
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,Le test de niveau en français est sur le site ...,A1
3,Est-ce que ton mari est aussi de Boston?,A1
4,"Dans les écoles de commerce, dans les couloirs...",B1


In [None]:
# Load the unlabelled sentences that must be scored for the submission,
# indexed by `id`, and preview the first rows.
df_pred = pd.read_csv("unlabelled_test_data.csv", index_col="id")
df_pred.head()

Unnamed: 0_level_0,sentence
id,Unnamed: 1_level_1
0,Nous dûmes nous excuser des propos que nous eû...
1,Vous ne pouvez pas savoir le plaisir que j'ai ...
2,"Et, paradoxalement, boire froid n'est pas la b..."
3,"Ce n'est pas étonnant, car c'est une saison my..."
4,"Le corps de Golo lui-même, d'une essence aussi..."


In [None]:
# Load the sample submission to see the expected output format
# (an `id` index and a single `difficulty` column), then preview it.
df_example_submission = pd.read_csv("sample_submission.csv", index_col="id")
df_example_submission.head()

Unnamed: 0_level_0,difficulty
id,Unnamed: 1_level_1
0,A1
1,A1
2,A1
3,A1
4,A1


# Now let's code

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


# Hold out 20% of the labelled sentences for validation; the fixed seed keeps
# the split reproducible across runs.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Turn the difficulty labels into integer class ids. The encoder is fitted on
# the training split and reused for the held-out split.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(df_train['difficulty'])
y_test_encoded = label_encoder.transform(df_test['difficulty'])

# TF-IDF features (sub-linear term frequency, vocabulary capped at 5000 terms).
# The vocabulary is learned from the training split only.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_features=5000)
X_train_tfidf = vectorizer.fit_transform(df_train['sentence'])
X_test_tfidf = vectorizer.transform(df_test['sentence'])

# Sweep k = 1..10 and remember the first k that reaches the highest
# held-out accuracy (strict `>` means ties keep the smaller k).
best_neighbors, best_accuracy = 0, 0

for neighbors in range(1, 11):
    model_knn = KNeighborsClassifier(n_neighbors=neighbors)
    model_knn.fit(X_train_tfidf, y_train_encoded)

    predictions_knn = model_knn.predict(X_test_tfidf)
    accuracy_knn = accuracy_score(y_test_encoded, predictions_knn)
    print(f"Accuracy (k-NN with {neighbors} neighbors): {accuracy_knn}")

    if accuracy_knn > best_accuracy:
        best_neighbors, best_accuracy = neighbors, accuracy_knn

print(f"\nBest number of neighbors: {best_neighbors} with accuracy: {best_accuracy}")


Accuracy (k-NN with 1 neighbors): 0.265625
Accuracy (k-NN with 2 neighbors): 0.35833333333333334
Accuracy (k-NN with 3 neighbors): 0.31145833333333334
Accuracy (k-NN with 4 neighbors): 0.325
Accuracy (k-NN with 5 neighbors): 0.3489583333333333
Accuracy (k-NN with 6 neighbors): 0.33125
Accuracy (k-NN with 7 neighbors): 0.3145833333333333
Accuracy (k-NN with 8 neighbors): 0.321875
Accuracy (k-NN with 9 neighbors): 0.32708333333333334
Accuracy (k-NN with 10 neighbors): 0.3385416666666667

Best number of neighbors: 2 with accuracy: 0.35833333333333334


In [None]:
# Vectorise the unlabelled sentences with the TF-IDF vocabulary that was
# fitted on the training split.
X_pred_tfidf = vectorizer.transform(df_pred['sentence'])

# Fit the final classifier with the k chosen by the validation sweep
# (`best_neighbors`) instead of a hard-coded value, so the submitted model
# matches the configuration reported as best.
model_knn_final = KNeighborsClassifier(n_neighbors=best_neighbors)
model_knn_final.fit(X_train_tfidf, y_train_encoded)

predictions_knn_final = model_knn_final.predict(X_pred_tfidf)

# Map the integer class ids back to the original difficulty labels.
predicted_difficulties = label_encoder.inverse_transform(predictions_knn_final)

# Attach the predictions to the unlabelled frame (adds a `difficulty` column).
df_pred['difficulty'] = predicted_difficulties

print(df_pred[['sentence', 'difficulty']])


                                               sentence difficulty
id                                                                
0     Nous dûmes nous excuser des propos que nous eû...         B1
1     Vous ne pouvez pas savoir le plaisir que j'ai ...         A2
2     Et, paradoxalement, boire froid n'est pas la b...         A1
3     Ce n'est pas étonnant, car c'est une saison my...         A1
4     Le corps de Golo lui-même, d'une essence aussi...         C1
...                                                 ...        ...
1195  C'est un phénomène qui trouve une accélération...         A1
1196  Je vais parler au serveur et voir si on peut d...         A1
1197  Il n'était pas comme tant de gens qui par pare...         C1
1198      Ils deviennent dangereux pour notre économie.         C1
1199  Son succès a généré beaucoup de réactions néga...         C1

[1200 rows x 2 columns]


In [None]:
# Keep only the `difficulty` column (plus the `id` index) for the submission.
# Rebinding with errors='ignore' instead of an in-place drop makes the cell
# idempotent: re-running it once `sentence` is already gone no longer raises
# a KeyError.
df_pred = df_pred.drop(columns='sentence', errors='ignore')

df_pred

Unnamed: 0_level_0,difficulty
id,Unnamed: 1_level_1
0,B1
1,A2
2,A1
3,A1
4,C1
...,...
1195,A1
1196,A1
1197,C1
1198,C1


In [None]:
# Write the predictions to disk; with pandas' default settings the `id` index
# is written as the first column of the CSV.
df_pred.to_csv('submission.csv')

# Upload the file to the competition with the Kaggle CLI
# (-f: file to submit, -m: submission message).
! kaggle competitions submit -c detecting-french-texts-difficulty-level-2023 -f submission.csv -m "UNIL_Rolex"

100% 8.30k/8.30k [00:00<00:00, 31.7kB/s]
Successfully submitted to Detecting the difficulty level of French texts