In [166]:
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objs as go
from xgboost import XGBClassifier,XGBRegressor
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import classification_report, cohen_kappa_score,mean_squared_error
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
import pickle
import numpy as np
import matplotlib.pyplot as plt
import textstat
import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler
from src.features import *
import pickle

In [46]:
def DATAFRAME_SCALER(df, remove_useless_columns=True, range_of_scaling=(-1, 1), display_distribution=False):
    """
    Scale specific columns in the dataframe based on conditions and optionally remove selected columns.

    Args:
        df (DataFrame): The input dataframe to process.
        remove_useless_columns (bool, optional): Flag to remove specified columns. Defaults to True.
        range_of_scaling (tuple, optional): The range used for scaling. Defaults to (-1,1).
        display_distribution (bool, optional): Flag to display distribution of scaled scores. Defaults to False.

    Returns:
        DataFrame: The processed dataframe with scaled scores and optionally removed columns.
    """
    # Dictionary to store scalers
    scalers_dict = {}

    # --- DOMAIN 1 HANDLING ---
    scaled_domain1_score_list = []
    for i in np.arange(8):
        df_temp = df[df["essay_set"] == i+1].copy()  # parsing essay set
        domain_score = np.array(df_temp["domain1_score"]).reshape(-1, 1)  # turning score column in array
        scaler = MinMaxScaler(feature_range=range_of_scaling)  # scaler
        scaler.fit(domain_score)  # fitting ...
        df_temp["scaled_domain1_score"] = scaler.transform(domain_score)  # scaling the score column
        scaled_domain1_score_list.append(df_temp[["scaled_domain1_score"]])  # add the scaled column to lists
        
        # Save the scaler with essay_set as the key
        scalers_dict[f"domain1_essay_set_{i+1}"] = scaler

    scaled_scores_df = pd.concat(scaled_domain1_score_list)
    df = df.join(scaled_scores_df)

    # --- DOMAIN 2 HANDLING ---
    scaler2 = MinMaxScaler(feature_range=range_of_scaling)
    x = np.array(df["domain2_score"]).reshape(-1, 1)
    scaler2.fit(x)
    df["scaled_domain2_score"] = scaler2.transform(x)

    # Save the scaler for domain2
    scalers_dict["domain2"] = scaler2

    if remove_useless_columns:
        df = df.drop(['rater1_domain1', 'rater2_domain1', 'rater3_domain1', 
                      'rater1_domain2', 'rater2_domain2', 'rater1_trait1',       
                      'rater1_trait2', 'rater1_trait3', 'rater1_trait4', 
                      'rater1_trait5', 'rater1_trait6', 'rater2_trait1', 
                      'rater2_trait2', 'rater2_trait3', 'rater2_trait4', 
                      'rater2_trait5', 'rater2_trait6', 'rater3_trait1', 
                      'rater3_trait2', 'rater3_trait3', 'rater3_trait4', 
                      'rater3_trait5', 'rater3_trait6'], axis=1)

    if display_distribution:
        plt.figure(figsize=(20, 6))
        plt.suptitle("With parsed scaling")

        plt.subplot(1, 2, 1)
        plt.hist(df['scaled_domain1_score'], edgecolor='black')
        plt.title('domain1_score')
        plt.xlabel('value')
        plt.ylabel('freq')

        plt.subplot(1, 2, 2)
        plt.hist(df['scaled_domain2_score'], edgecolor='black')
        plt.title('domain2_score')
        plt.xlabel('value')
        plt.ylabel('freq')

        plt.show()

    # Return the processed dataframe and the dictionary of scalers
    return df, scalers_dict

def FEATURE_ENGINEERING(df):
    df["count_characters"] = df["essay"].apply(lambda x: count_characters(x))
    df["count_syllables"] = df["essay"].apply(lambda x: count_syllables(x))
    df["count_words"] = df["essay"].apply(lambda x: count_words(x))
    df["count_sentences"] = df["essay"].apply(lambda x: count_sentences(x))
    df["flesch_reading_ease"] = df["essay"].apply(lambda x: get_flesch_reading_ease(x))
    df["gunning_fog"] = df["essay"].apply(lambda x: get_gunning_fog(x))
    df["automated_readability_index"] = df["essay"].apply(lambda x: get_automated_readability_index(x))
    df["smog_index"] = df["essay"].apply(lambda x: get_smog_index(x))
    df["flesch_kincaid_grade"] = df["essay"].apply(lambda x: get_flesch_kincaid_grade(x))
    df["coleman_liau_index"] = df["essay"].apply(lambda x: get_coleman_liau_index(x))
    df["dale_chall_readability_score"] = df["essay"].apply(lambda x: get_dale_chall_readability_score(x))
    df["automated_readability_index"] = df["essay"].apply(lambda x: get_automated_readability_index(x))
    df["dale_chall_readability_score"] = df["essay"].apply(lambda x: get_dale_chall_readability_score(x))
    df["difficult_words"] = df["essay"].apply(lambda x: get_difficult_words(x))
    df["linsear_write_formula"] = df["essay"].apply(lambda x: get_linsear_write_formula(x))
    df["count_awl_words"] = df["essay"].apply(lambda x: count_awl_words(x))
    df["calculate_lexical_diversity"] = df["essay"].apply(lambda x: calculate_lexical_diversity(x))
    df["get_average_heights"] = df["essay"].apply(lambda x: get_average_heights(x))
    df["get_average_connections_at_root"] = df["essay"].apply(lambda x: get_average_connections_at_root(x))
    df["get_length_of_clauses"] = df["essay"].apply(lambda x: get_length_of_clauses(x))
    df["calculate_misspelling_score"] = df["essay"].apply(lambda x: calculate_misspelling_score(x))
    df["detect_slur_usage"] = df["essay"].apply(lambda x: detect_slur_usage(x))
    df["calculate_overusage_of_punctuation"] = df["essay"].apply(lambda x: calculate_overusage_of_punctuation(x))
    df["count_tagged_entity"] = df["essay"].apply(lambda x: count_tagged_entity(x))
    df["count_stop_words"] = df["essay"].apply(lambda x: count_stop_words(x))
    df["count_quoted_words"] = df["essay"].apply(lambda x: count_quoted_words(x))

    tmp_df = pd.DataFrame()
    tmp_df = df["essay"].apply(lambda x: get_pos_tags(x))
    tmp_df = pd.json_normalize(tmp_df)
    df = df.join(tmp_df, how="left")

    tmp_df = pd.DataFrame()
    tmp_df = df["essay"].apply(lambda x: get_word_frequency(x))
    tmp_df = pd.json_normalize(tmp_df)
    df = df.join(tmp_df, how="left")

    tmp_df = pd.DataFrame()
    tmp_df = df["essay"].apply(lambda x: get_sentence_tree_roots(x))
    tmp_df = pd.json_normalize(tmp_df)
    tmp_df.fillna(0, inplace=True)
    df = df.join(tmp_df, how="left")
    
    with open("processed_data.pickle", "wb") as file:
        pickle.dump(df, file)
    return df

def SPLIT_AND_BALANCE(df, test_size=100, val_size=100, random_state=42):
    """
    Rééquilibre un DataFrame selon la colonne 'essay_set' et le divise en ensembles de formation, test et validation.
    
    :param df: DataFrame à rééquilibrer et diviser.
    :param test_size: Nombre d'observations dans le test set.
    :param val_size: Nombre d'observations dans le validation set.
    :param random_state: Graine pour la reproductibilité des résultats aléatoires.
    :return: Trois DataFrames équilibrés : train_set, test_set, validation_set.
    """
    # Rééquilibrage
    ros = RandomOverSampler(random_state=random_state)
    X_resampled, y_resampled = ros.fit_resample(df.drop(['essay_set'], axis=1), df['essay_set'])
    resampled_df = pd.concat([X_resampled, y_resampled], axis=1)
    
    # Division en ensembles de test et validation
    X_temp, X_test, y_temp, y_test = train_test_split(resampled_df.drop(['essay_set'], axis=1), resampled_df['essay_set'], 
                                                      test_size=test_size, stratify=resampled_df['essay_set'], random_state=random_state)
    X_train, X_validation, y_train, y_validation = train_test_split(X_temp, y_temp, 
                                                                    test_size=val_size, stratify=y_temp, random_state=random_state)
    
    # Reconstruction des DataFrame pour chaque ensemble
    train_set = pd.concat([X_train, y_train], axis=1)
    test_set = pd.concat([X_test, y_test], axis=1)
    validation_set = pd.concat([X_validation, y_validation], axis=1)
    
    return train_set, test_set, validation_set


In [31]:
# DISCLAIMER : takes a long time to execute, so pass to the next cell instead

training_data = pd.read_excel('data/training_set_rel3.xls')
scaled_df,scalers_dict = DATAFRAME_SCALER(training_data)
featured_df = FEATURE_ENGINEERING(scaled_df)
train_set,test_set,validation_set = SPLIT_AND_BALANCE(featured_df)

In [3]:
training_data = pd.read_excel('data/training_set_rel3.xls')
scaled_df,scalers_dict = DATAFRAME_SCALER(training_data)
target = "scaled_domain1_score"
train_set = pd.read_csv("data/final/train_set.csv").drop(["Unnamed: 0"],axis=1).dropna(subset=[target])
features = train_set.columns[6:]
test_set = pd.read_csv("data/final/test_set.csv").drop(["Unnamed: 0"],axis=1).dropna(subset=[target])
validation_set = pd.read_csv("data/final/validation_set.csv").drop(["Unnamed: 0"],axis=1).dropna(subset=[target])

In [49]:
fig = make_subplots(rows=1, cols=3)
fig.add_trace(go.Histogram(x=training_data["essay_set"], nbinsx=8, name='essay_set'), row=1, col=1)
fig.add_trace(go.Histogram(x=training_data["domain1_score"], nbinsx=20, name='domain1_score'), row=1, col=2)
fig.update_layout(title_text='raw data distributions')
fig.show()

In [4]:
X_train, y_train = train_set[features], train_set[target]
X_test, y_test = test_set[features], test_set[target]
X_val, y_val = validation_set[features], validation_set[target]

In [6]:
fig = make_subplots(rows=1, cols=3)
fig.add_trace(go.Histogram(x=X_train["essay_set"], nbinsx=8, name='train'), row=1, col=1)
fig.add_trace(go.Histogram(x=X_test["essay_set"], nbinsx=8, name='test'), row=1, col=2)
fig.add_trace(go.Histogram(x=X_val["essay_set"], nbinsx=8, name='validation'), row=1, col=3)
fig.update_layout(title_text='essay_set distribution for each dataset')
fig.show()

fig = make_subplots(rows=1, cols=3)
fig.add_trace(go.Histogram(x=y_train, nbinsx=20, name='train'), row=1, col=1)
fig.add_trace(go.Histogram(x=y_test, nbinsx=20, name='test'), row=1, col=2)
fig.add_trace(go.Histogram(x=y_val, nbinsx=20, name='validation'), row=1, col=3)
fig.update_layout(title_text='domain1_score (scaled) distribution for each dataset')
fig.show()

# CREATION OF THE CLASSIFICATION MODEL

In [9]:
class_X_train = X_train.drop(["essay_set"],axis=1)
class_y_train = X_train["essay_set"]-1

class_X_test = X_test.drop(["essay_set"],axis=1)
class_y_test = X_test["essay_set"]-1

class_X_val = X_val.drop(["essay_set"],axis=1)
class_y_val = X_val["essay_set"]-1

In [15]:
model = XGBClassifier(objective='multi:softprob', num_class=len(class_y_train.unique()), seed=42)
param_grid = {
    'max_depth': [4, 6],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [50, 100],
    'subsample': [0.8, 1],
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=3, verbose=1)
grid_search.fit(class_X_train, class_y_train)

print(f"Best parameters: {grid_search.best_params_}")

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END learning_rate=0.01, max_depth=4, n_estimators=50, subsample=0.8; total time=   0.6s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=50, subsample=0.8; total time=   0.6s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=50, subsample=0.8; total time=   0.6s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=50, subsample=1; total time=   0.6s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=50, subsample=1; total time=   0.6s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=50, subsample=1; total time=   0.6s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.8; total time=   1.5s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.8; total time=   1.4s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.8; total time=   1.6s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=100, subsample=1; total time=   1.5s
[CV] END lear

In [18]:
best_params = grid_search.best_params_
model_optimized = XGBClassifier(**best_params, objective='multi:softprob', num_class=len(class_y_train.unique()), seed=42)
model_optimized.fit(class_X_train, class_y_train)

# prediction on test set
pred_test = model_optimized.predict(class_X_test)
class_report_test = classification_report(class_y_test, pred_test)
print("classification report on test :\n", class_report_test)

# prediction on test validation set
pred_val = model_optimized.predict(class_X_val)
class_report_val = classification_report(class_y_val, pred_val)
print("classification report on validation :\n", class_report_val)

classification report on test :
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        12
           2       0.87      1.00      0.93        13
           3       1.00      0.85      0.92        13
           4       0.86      0.92      0.89        13
           5       1.00      1.00      1.00        12
           6       1.00      0.92      0.96        13
           7       1.00      1.00      1.00        12

    accuracy                           0.96       100
   macro avg       0.97      0.96      0.96       100
weighted avg       0.96      0.96      0.96       100

classification report on validation :
               precision    recall  f1-score   support

           0       0.93      1.00      0.96        13
           1       1.00      0.85      0.92        13
           2       0.91      0.83      0.87        12
           3       0.79      0.92      0.85        12
      

In [20]:
with open('models/essay_set_classification_model.pkl', 'wb') as file:
    pickle.dump(model_optimized, file)

# PREDICTION OF DOMAIN1_SCORE

In [22]:
from xgboost import XGBRegressor

param_grid = {
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1]
}

grid_search = GridSearchCV(estimator=XGBRegressor(seed=42), param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, verbose=1)
grid_search.fit(X_train, y_train)
print(f"Meilleurs paramètres: {grid_search.best_params_}")

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.2s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1; total time=   0.3s
[CV] EN

In [35]:
best_params = grid_search.best_params_
model_optimized = XGBRegressor(**best_params, seed=42)

model_optimized.fit(X_train, y_train)

predictions_test = model_optimized.predict(X_test)
mse_test = mean_squared_error(y_val, predictions_test)
print(f"MSE on test set: {mse_test}")

predictions_val = model_optimized.predict(X_val)
mse_val = mean_squared_error(y_val, predictions_val)
print(f"MSE on validation set: {mse_val}")

MSE on test set: 0.32263248796886346
MSE on validation set: 0.0631256422796088


In [40]:
train_result_df = train_set[["essay_set", "domain1_score", "scaled_domain1_score"]].copy()
test_result_df = test_set[["essay_set", "domain1_score", "scaled_domain1_score"]].copy()
val_result_df = validation_set[["essay_set", "domain1_score", "scaled_domain1_score"]].copy()


val_result_df.loc[:, "pred"] = predictions_val
test_result_df.loc[:, "pred"] = predictions_test

In [160]:
def reverse_scaling(row):
    scaler_key = f'domain1_essay_set_{int(row["essay_set"])}'
    scaler = scalers_dict[scaler_key]
    inversed_pred = scaler.inverse_transform([[row['pred']]])[0][0]

    rounded_pred = np.round(inversed_pred)
    return rounded_pred

val_result_df["reversed_pred"] = val_result_df.apply(reverse_scaling,axis=1)
test_result_df["reversed_pred"] = test_result_df.apply(reverse_scaling,axis=1)

print(f'Test : {cohen_kappa_score(test_result_df["domain1_score"],test_result_df["reversed_pred"], weights="quadratic")}')
print(f'Validation : {cohen_kappa_score(val_result_df["domain1_score"],val_result_df["reversed_pred"], weights="quadratic")}')

Test : 0.9877636680539252
Validation : 0.9920438350098933


In [43]:
with open('models/doman1_score_regression_model.pkl', 'wb') as file:
    pickle.dump(model_optimized, file)

# domain2_score (only for essay_set = 2)

In [175]:
target = "scaled_domain2_score"
X_train_temp_2, y_train_temp_2 = train_set[train_set["essay_set"]==2][features[:-1]], train_set[train_set["essay_set"]==2][target]
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train_temp_2,y_train_temp_2, test_size=0.2, random_state=42)
X_train_2_final, X_val_2, y_train_2_final, y_val_2 = train_test_split(X_train_2,y_train_2, test_size=0.2, random_state=42)

In [176]:
test_result_df_2 = pd.DataFrame(y_test_2).join(train_set["domain2_score"])
val_result_df_2 = pd.DataFrame(y_val_2).join(train_set["domain2_score"])

In [179]:
param_grid = {
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1]
}

grid_search = GridSearchCV(estimator=XGBRegressor(seed=42), param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, verbose=1)
grid_search.fit(X_train_2_final, y_train_2_final)
print(f"Meilleurs paramètres: {grid_search.best_params_}")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Meilleurs paramètres: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}


In [180]:
best_params = grid_search.best_params_
model_optimized = XGBRegressor(**best_params, seed=42)

model_optimized.fit(X_train_2_final, y_train_2_final)

predictions_test = model_optimized.predict(X_test_2)
mse_test = mean_squared_error(y_test_2, predictions_test)
print(f"MSE on test set: {mse_test}")

predictions_val = model_optimized.predict(X_val_2)
mse_val = mean_squared_error(y_val_2, predictions_val)
print(f"MSE on validation set: {mse_val}")

MSE on test set: 0.1155221166102705
MSE on validation set: 0.13775895851855027


In [181]:
test_result_df_2.loc[:, "pred"] = predictions_test
val_result_df_2.loc[:, "pred"] = predictions_val

In [182]:
def reverse_scaling_2(row):
    scaler_key = 'domain2'
    scaler = scalers_dict[scaler_key]
    inversed_pred = scaler.inverse_transform([[row['pred']]])[0][0]
    rounded_pred = np.round(inversed_pred)
    return rounded_pred

val_result_df_2["reversed_pred"] = val_result_df_2.apply(reverse_scaling_2,axis=1)
test_result_df_2["reversed_pred"] = test_result_df_2.apply(reverse_scaling_2,axis=1)

print(f'Test : {cohen_kappa_score(test_result_df_2["domain2_score"],test_result_df_2["reversed_pred"], weights="quadratic")}')
print(f'Validation : {cohen_kappa_score(val_result_df_2["domain2_score"],val_result_df_2["reversed_pred"], weights="quadratic")}')

Test : 0.6312690798081115
Validation : 0.6307863141092374


In [183]:
with open('models/doman2_score_regression_model.pkl', 'wb') as file:
    pickle.dump(model_optimized, file)