<a href="https://colab.research.google.com/github/alexandrastna/French-text-using-AI/blob/main/model_experiments_minimal_preprocessing_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict the difficulty of French text using AI

Initial models with minimal data preprocessing and no hyper-parameter tuning.

# Data processing ###

In [18]:
#Import necessary libraries

import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [2]:
#Load the data

df_training_data = pd.read_csv('https://raw.githubusercontent.com/alexandrastna/French-text-using-AI/main/training_data.csv')

df_sample_submission = pd.read_csv('https://raw.githubusercontent.com/alexandrastna/French-text-using-AI/main/sample_submission.csv')

df_unlabelled_test_data = pd.read_csv('https://raw.githubusercontent.com/alexandrastna/French-text-using-AI/main/unlabelled_test_data.csv')

Data exploration:

In [3]:
df_training_data.head()
df_training_data.describe()
df_training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4800 entries, 0 to 4799
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          4800 non-null   int64 
 1   sentence    4800 non-null   object
 2   difficulty  4800 non-null   object
dtypes: int64(1), object(2)
memory usage: 112.6+ KB


Data Cleaning:

In [4]:
# Check for missing values
print(df_training_data.isnull().sum())

# If any, decide on the strategy; for example, dropping:
df_training_data.dropna(inplace=True)

id            0
sentence      0
difficulty    0
dtype: int64


Data pre-processing:

In [9]:
# Encode the labels
label_encoder = LabelEncoder()
df_training_data['difficulty_encoded'] = label_encoder.fit_transform(df_training_data['difficulty'])

# Separate features and labels
X = df_training_data['sentence']
y = df_training_data['difficulty_encoded']

# Vectorize the sentences
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Split the data into training, validation, and test sets
X_train, X_val, y_train, y_val = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)



## Model Training ###

### Logistic Regression ###

In [23]:

# Initialize the Logistic Regression model
logreg = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence if necessary

# Train the model
logreg.fit(X_train, y_train)

# Validate the model
y_pred_val_log = logreg.predict(X_val)

# Calculate metrics
accuracy = accuracy_score(y_val, y_pred_val_log)
precision = precision_score(y_val, y_pred_val_log, average='weighted')  # 'weighted' accounts for label imbalance
recall = recall_score(y_val, y_pred_val_log, average='weighted')
f1 = f1_score(y_val, y_pred_val_log, average='weighted')

# Print the metrics
print(f"Logistic Regression Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

# Print the classification report and confusion matrix
print("Classification Report:\n", classification_report(y_val, y_pred_val_log))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_val_log))

Logistic Regression Accuracy: 0.44895833333333335
Precision: 0.4409901928071059
Recall: 0.44895833333333335
F1-Score: 0.4400315184878664
Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.64      0.56       166
           1       0.34      0.30      0.32       158
           2       0.41      0.27      0.33       166
           3       0.44      0.41      0.43       153
           4       0.45      0.48      0.47       152
           5       0.51      0.58      0.54       165

    accuracy                           0.45       960
   macro avg       0.44      0.45      0.44       960
weighted avg       0.44      0.45      0.44       960

Confusion Matrix:
 [[107  32  12  10   3   2]
 [ 55  48  27   9   9  10]
 [ 34  50  45  10   7  20]
 [  9   5  10  63  39  27]
 [  8   3   7  27  73  34]
 [  5   3   9  23  30  95]]


### Support Vector Machines (SVM) ###

In [24]:
from sklearn.svm import SVC

# Initialize the SVM model
svm = SVC(kernel='linear')  # The kernel can be 'linear', 'poly', 'rbf', 'sigmoid', etc.

# Train the model
svm.fit(X_train, y_train)

# Validate the model
y_pred_val_svm = svm.predict(X_val)

# Calculate metrics
accuracy_svm = accuracy_score(y_val, y_pred_val_svm)
precision_svm = precision_score(y_val, y_pred_val_svm, average='weighted')  # 'weighted' accounts for label imbalance
recall_svm = recall_score(y_val, y_pred_val_svm, average='weighted')
f1_svm = f1_score(y_val, y_pred_val_svm, average='weighted')

# Print the metrics
print(f"SVM Accuracy: {accuracy_svm}")
print(f"Precision: {precision_svm}")
print(f"Recall: {recall_svm}")
print(f"F1-Score: {f1_svm}")

# Print the classification report and confusion matrix
print("Classification Report:\n", classification_report(y_val, y_pred_val_svm))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_val_svm))


SVM Accuracy: 0.45416666666666666
Precision: 0.4494598616869399
Recall: 0.45416666666666666
F1-Score: 0.44985008847754787
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.64      0.58       166
           1       0.34      0.35      0.35       158
           2       0.40      0.31      0.35       166
           3       0.44      0.43      0.44       153
           4       0.44      0.45      0.44       152
           5       0.54      0.55      0.54       165

    accuracy                           0.45       960
   macro avg       0.45      0.45      0.45       960
weighted avg       0.45      0.45      0.45       960

Confusion Matrix:
 [[106  38   9   7   6   0]
 [ 51  55  37   7   5   3]
 [ 28  55  51  11   6  15]
 [  7   6  12  66  38  24]
 [  6   3  11  30  68  34]
 [  3   3   8  28  33  90]]


### Random Forests ###

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Validate the model
y_pred_val_rf = rf.predict(X_val)

# Calculate metrics
accuracy_rf = accuracy_score(y_val, y_pred_val_rf)
precision_rf = precision_score(y_val, y_pred_val_rf, average='weighted')  # 'weighted' accounts for label imbalance
recall_rf = recall_score(y_val, y_pred_val_rf, average='weighted')
f1_rf = f1_score(y_val, y_pred_val_rf, average='weighted')

# Print the metrics
print(f"Random Forest Accuracy: {accuracy_rf}")
print(f"Precision: {precision_rf}")
print(f"Recall: {recall_rf}")
print(f"F1-Score: {f1_rf}")

# Print the classification report and confusion matrix
print("Classification Report:\n", classification_report(y_val, y_pred_val_rf))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_val_rf))

Random Forest Accuracy: 0.40520833333333334
Precision: 0.4019891511153694
Recall: 0.40520833333333334
F1-Score: 0.3903587582946592
Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.78      0.57       166
           1       0.33      0.27      0.30       158
           2       0.36      0.23      0.29       166
           3       0.37      0.39      0.38       153
           4       0.37      0.38      0.37       152
           5       0.53      0.37      0.44       165

    accuracy                           0.41       960
   macro avg       0.40      0.40      0.39       960
weighted avg       0.40      0.41      0.39       960

Confusion Matrix:
 [[130  18   7   8   2   1]
 [ 77  43  26   5   4   3]
 [ 45  46  39  15  10  11]
 [ 15   8  18  59  39  14]
 [ 11   5   8  46  57  25]
 [ 12  11   9  28  44  61]]


### Decision Trees ###

In [26]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree model
dt = DecisionTreeClassifier(random_state=42)

# Train the model
dt.fit(X_train, y_train)

# Validate the model
y_pred_val_dt = dt.predict(X_val)

# Calculate metrics
accuracy_dt = accuracy_score(y_val, y_pred_val_dt)
precision_dt = precision_score(y_val, y_pred_val_dt, average='weighted')  # 'weighted' accounts for label imbalance
recall_dt = recall_score(y_val, y_pred_val_dt, average='weighted')
f1_dt = f1_score(y_val, y_pred_val_dt, average='weighted')

# Print the metrics
print(f"Decision Tree Accuracy: {accuracy_dt}")
print(f"Precision: {precision_dt}")
print(f"Recall: {recall_dt}")
print(f"F1-Score: {f1_dt}")

# Print the classification report and confusion matrix
print("Classification Report:\n", classification_report(y_val, y_pred_val_dt))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_val_dt))

Decision Tree Accuracy: 0.3260416666666667
Precision: 0.32244918645079756
Recall: 0.3260416666666667
F1-Score: 0.3229336657109519
Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.55      0.52       166
           1       0.31      0.32      0.31       158
           2       0.30      0.23      0.26       166
           3       0.26      0.31      0.28       153
           4       0.24      0.22      0.23       152
           5       0.32      0.31      0.31       165

    accuracy                           0.33       960
   macro avg       0.32      0.32      0.32       960
weighted avg       0.32      0.33      0.32       960

Confusion Matrix:
 [[92 43  9 12  5  5]
 [44 50 32 14  6 12]
 [28 34 39 27 17 21]
 [11 12 14 47 39 30]
 [ 6  8 18 44 34 42]
 [ 5 13 19 37 40 51]]


### K-Nearest Neighbors (KNN) ###

In [27]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNN model
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn.fit(X_train, y_train)

# Validate the model
y_pred_val_knn = knn.predict(X_val)

# Calculate metrics
accuracy_knn = accuracy_score(y_val, y_pred_val_knn)
precision_knn = precision_score(y_val, y_pred_val_knn, average='weighted')  # 'weighted' accounts for label imbalance
recall_knn = recall_score(y_val, y_pred_val_knn, average='weighted')
f1_knn = f1_score(y_val, y_pred_val_knn, average='weighted')

# Print the metrics
print(f"KNN Accuracy: {accuracy_knn}")
print(f"Precision: {precision_knn}")
print(f"Recall: {recall_knn}")
print(f"F1-Score: {f1_knn}")

# Print the classification report and confusion matrix
print("Classification Report:\n", classification_report(y_val, y_pred_val_knn))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_val_knn))

KNN Accuracy: 0.31979166666666664
Precision: 0.39497806995803963
Recall: 0.31979166666666664
F1-Score: 0.2955634512384226
Classification Report:
               precision    recall  f1-score   support

           0       0.30      0.85      0.44       166
           1       0.21      0.26      0.23       158
           2       0.21      0.13      0.16       166
           3       0.47      0.23      0.31       153
           4       0.57      0.17      0.26       152
           5       0.63      0.25      0.36       165

    accuracy                           0.32       960
   macro avg       0.40      0.32      0.29       960
weighted avg       0.39      0.32      0.30       960

Confusion Matrix:
 [[141  18   5   0   0   2]
 [ 99  41  16   1   0   1]
 [ 87  52  22   3   1   1]
 [ 49  32  22  35  11   4]
 [ 46  25  18  20  26  17]
 [ 53  24  23  15   8  42]]
