In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from math import sqrt
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error
from tabulate import tabulate
import seaborn as sns
import matplotlib.pyplot as plt


fractional_to_integer_mapping = {
    0.5: 0,
    1.0: 1,
    1.5: 2,
    2.0: 3,
    2.5: 4,
    3.0: 5,
    3.5: 6,
    4.0: 7,
    4.5: 8,
    5.0: 9,
    5.5: 10,
    6.0: 11
}


train_data = pd.read_csv('/kaggle/input/finaldf/final_df (3).csv')


# Apply the mapping to the 'score' column
train_data['score'] = train_data['score'].map(fractional_to_integer_mapping)

train_data.head()

Unnamed: 0,id,num_backspaces,duration_backspaces,num_cut/copy/paste,discarded_text,D/I Ratio,Proportion_of_Deletions,no_distant_revision,no_immediate_revision,major_edits_count,...,Total_Pause_Time_within_word,Mean_Pause_Time_within_word,SD_Pause_Time_within_word,Total_Pause_Time_Before_Sentences,Mean_Pause_Time_Before_Sentences,SD_Pause_Time_Before_Sentences,Total_Pause_Time_in_sentences,Mean_Pause_Time_in_sentences,SD_Pause_Time_in_sentences,score
0,001519c8,417.0,34130.0,417,489,0.243284,0.164512,2517,40.0,93,...,685007,1918.787115,7273.768138,3643,173.47619,416.665168,1181819,56277.095238,43455.913672,6
1,0022f953,260.0,23550.0,261,266,0.138287,0.097316,2354,100.0,45,...,1087710,2781.867008,16954.357992,69412,4627.466667,8278.455203,581673,38778.2,32758.88381,6
2,0042269b,439.0,32905.0,439,935,0.266003,0.221984,4037,99.0,173,...,843679,1528.403986,9793.22683,87089,4147.095238,15826.690943,1259632,59982.47619,58697.726407,11
3,0059420b,152.0,18411.0,152,163,0.134202,0.092302,1473,83.0,19,...,745392,3067.45679,13400.325753,1466851,112834.692308,365509.369147,1032626,79432.769231,48355.056728,3
4,0075873a,517.0,40199.0,517,517,0.26622,0.164806,2490,41.0,124,...,1039641,3208.768519,11181.242502,142133,6179.695652,13762.720184,1073221,46661.782609,73950.739406,7


In [2]:

X_train, X_valid, y_train, y_valid = train_test_split(
    train_data.drop(['score', 'id'], axis=1),
    train_data['score'],
    test_size=0.2,
    random_state=42
)


X_train.head()
y_train.head()

1610    8
1525    9
2366    4
859     5
497     3
Name: score, dtype: int64

In [3]:
# Create an XGBoost classifier model
model = xgb.XGBClassifier(objective='multi:softmax', num_class=12, seed=42, max_depth= 4, learning_rate= 0.05, n_estimators= 90)


# Train the model and Make predictions
model.fit(X_train, y_train)
y_pred = model.predict(X_valid)


# Inverse mapping dictionary
integer_to_fractional_mapping = {v: k for k, v in fractional_to_integer_mapping.items()}

# Convert predicted values back to fractional scores
y_pred_original = [integer_to_fractional_mapping[int(prediction)] for prediction in y_pred]

# Convert true values back to fractional scores
y_valid_original = [integer_to_fractional_mapping[int(true_value)] for true_value in y_valid]


# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_valid_original, y_pred_original))
print(f'RMSE: {rmse:.4f}')

# Display a table with expected and actual scores
score_table = pd.DataFrame({'Expected Score': y_valid_original, 'Actual Score': y_pred_original})
print(tabulate(score_table, headers='keys', tablefmt='psql'))

RMSE: 0.7286
+-----+------------------+----------------+
|     |   Expected Score |   Actual Score |
|-----+------------------+----------------|
|   0 |              5   |            4.5 |
|   1 |              1.5 |            3   |
|   2 |              3   |            3   |
|   3 |              3   |            4   |
|   4 |              3.5 |            3.5 |
|   5 |              3.5 |            4   |
|   6 |              4   |            4   |
|   7 |              2.5 |            2.5 |
|   8 |              4.5 |            4   |
|   9 |              4   |            4.5 |
|  10 |              4.5 |            4   |
|  11 |              5   |            4.5 |
|  12 |              3   |            3   |
|  13 |              4.5 |            4.5 |
|  14 |              3.5 |            4   |
|  15 |              3.5 |            4   |
|  16 |              4   |            3.5 |
|  17 |              4   |            4   |
|  18 |              1.5 |            2.5 |
|  19 |            