# Preprocessing

In [89]:
# Import necessary libraries
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_log_error

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yasmine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yasmine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Yasmine\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [62]:
# Load the CSV file into a DataFrame
df = pd.read_csv('answers.csv')
df

Unnamed: 0,id,answer,score,correct
0,1.1,High risk problems are address in the prototyp...,3.5,0.0
1,1.1,To simulate portions of the desired final prod...,5.0,1.0
2,1.1,A prototype program simulates the behaviors of...,4.0,1.0
3,1.1,Defined in the Specification phase a prototype...,5.0,1.0
4,1.1,It is used to let the users have a first idea ...,3.0,0.0
...,...,...,...,...
2437,12.1,log n,5.0,1.0
2438,12.1,minus 1 divided by 2,1.5,0.0
2439,12.1,2n-1,2.5,0.0
2440,12.1,"it takes at most h steps, where h is the heigh...",5.0,1.0


In [63]:
df.isnull().sum()

id         0
answer     0
score      0
correct    0
dtype: int64

In [65]:
df.duplicated().sum()

101

In [66]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [67]:
# Function to clean and preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    return words

# Apply the preprocessing function to the 'answer' column
df['answer'] = df['answer'].apply(preprocess_text)

# Display the the DataFrame to verify the cleaning
df

Unnamed: 0,id,answer,score,correct
0,1.1,"[high, risk, problems, address, prototype, pro...",3.5,0.0
1,1.1,"[simulate, portions, desired, final, product, ...",5.0,1.0
2,1.1,"[prototype, program, simulates, behaviors, por...",4.0,1.0
3,1.1,"[defined, specification, phase, prototype, sti...",5.0,1.0
4,1.1,"[used, let, users, first, idea, completed, pro...",3.0,0.0
...,...,...,...,...
2436,12.1,"[three, steps, visit, root, node, go, right, s...",2.5,0.0
2438,12.1,"[minus, 1, divided, 2]",1.5,0.0
2439,12.1,[2n1],2.5,0.0
2440,12.1,"[takes, h, steps, h, height, tree]",5.0,1.0


In [68]:
# Function for lemmatization
def lemmatize_words(words):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return lemmatized_words

# Apply the lemmatization function to the 'cleaned_answer' column
df['answer'] = df['answer'].apply(lemmatize_words)

# Display the DataFrame to verify the lemmatization
df

Unnamed: 0,id,answer,score,correct
0,1.1,"[high, risk, problem, address, prototype, prog...",3.5,0.0
1,1.1,"[simulate, portion, desired, final, product, q...",5.0,1.0
2,1.1,"[prototype, program, simulates, behavior, port...",4.0,1.0
3,1.1,"[defined, specification, phase, prototype, sti...",5.0,1.0
4,1.1,"[used, let, user, first, idea, completed, prog...",3.0,0.0
...,...,...,...,...
2436,12.1,"[three, step, visit, root, node, go, right, su...",2.5,0.0
2438,12.1,"[minus, 1, divided, 2]",1.5,0.0
2439,12.1,[2n1],2.5,0.0
2440,12.1,"[take, h, step, h, height, tree]",5.0,1.0


# Word Encoding

In [69]:
# Train Word2Vec model
word2vec_model = Word2Vec(df['answer'], vector_size=100, window=5, min_count=1, sg=0)

# Example: Get the vector representation of a word
vector = word2vec_model.wv['program']

# Display the vector
print(f"Vector representation of 'program':\n{vector}")


Vector representation of 'program':
[-0.04838968  0.44182175  0.34314838 -0.18598397 -0.23654175 -0.6796358
 -0.02336263  0.84969777 -0.20209682 -0.14216046  0.02737346 -0.98115516
 -0.32080314  0.3662288   0.15941283 -0.15600725  0.3699693  -1.2318718
 -0.4545691  -1.4187831  -0.24871026  0.05136437  0.4368837   0.11439198
  0.07419373  0.09113833 -0.37396464 -0.67294383  0.05283495  0.29704735
  0.552261    0.42565694  0.58837885 -0.22623849 -0.31563997  0.53485036
  0.35258475 -0.7146062  -0.3346563  -1.0922918  -0.41466123 -0.42005876
 -0.20010579 -0.32497382  0.5247831  -0.13529062 -0.66144145 -0.06646062
  0.37047827  0.727866   -0.53790164 -0.25230443 -0.37266022 -0.01862064
 -0.3249252   0.4615993   0.3486918  -0.17448154 -0.58801013 -0.05502809
  0.48337638 -0.40929538 -0.1615698  -0.28404194 -0.5691435   0.5345974
  0.0920028   0.20106657 -0.8820008   0.63046384 -0.19646302  0.6824702
  0.35508808 -0.6793402   0.4835645   0.25648785 -0.1658785  -0.0194445
 -0.4579024   0.0721

In [70]:
# Function to encode text using Word2Vec model
def encode_text(text, model):
    encoded_text = []
    for word in text:
        if word in model.wv:
            encoded_text.append(model.wv[word])
        else:
            encoded_text.append([0] * model.vector_size)  # Use zeros for out-of-vocabulary words
    
    if encoded_text:
        return np.mean(encoded_text, axis=0)  # Average the word vectors to get a single vector for the text
    else:
        return np.zeros(model.vector_size)  # Return zeros if all words are out-of-vocabulary

# Apply encoding to the 'cleaned_answer' column
df['encoded_answer'] = df['answer'].apply(lambda x: encode_text(x, word2vec_model))

In [71]:
df

Unnamed: 0,id,answer,score,correct,encoded_answer
0,1.1,"[high, risk, problem, address, prototype, prog...",3.5,0.0,"[-0.018937118, 0.17550091, 0.13739917, -0.0772..."
1,1.1,"[simulate, portion, desired, final, product, q...",5.0,1.0,"[-0.016355198, 0.13559118, 0.10280226, -0.0602..."
2,1.1,"[prototype, program, simulates, behavior, port...",4.0,1.0,"[-0.020430826, 0.15291797, 0.11684523, -0.0696..."
3,1.1,"[defined, specification, phase, prototype, sti...",5.0,1.0,"[-0.016623823, 0.13459547, 0.10541942, -0.0590..."
4,1.1,"[used, let, user, first, idea, completed, prog...",3.0,0.0,"[-0.014033516, 0.119458586, 0.09507439, -0.050..."
...,...,...,...,...,...
2436,12.1,"[three, step, visit, root, node, go, right, su...",2.5,0.0,"[-0.039760523, 0.19619799, 0.15252176, -0.0809..."
2438,12.1,"[minus, 1, divided, 2]",1.5,0.0,"[-0.024667714, 0.1625036, 0.12136579, -0.06115..."
2439,12.1,[2n1],2.5,0.0,"[0.0015983379, 0.0011841882, -0.006455945, -0...."
2440,12.1,"[take, h, step, h, height, tree]",5.0,1.0,"[-0.03376695, 0.17881703, 0.13941275, -0.08069..."


# Models training

In [72]:
# Split the data into features (X) and target (y)
X = np.array(df['encoded_answer'].tolist())
y = df['score']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Support Vector Regression (SVR)

In [73]:
# Train SVR model
svr_model = make_pipeline(StandardScaler(), SVR())
svr_model.fit(X_train, y_train)

# Predict on test data
y_pred_svr = svr_model.predict(X_test)

In [90]:
# Calculate MSE
mse_svr = mean_squared_error(y_test, y_pred_svr)
print(f"SVR MSE: {mse_svr}")

# Calculate RMSE
rmse_svr = np.sqrt(mse_svr)
print(f"SVR RMSE: {rmse_svr}")

# Calculate MAE
mae_svr = mean_absolute_error(y_test, y_pred_svr)
print(f"SVR MAE: {mae_svr}")

# Calculate R-squared
r2_svr = r2_score(y_test, y_pred_svr)
print(f"SVR R-squared: {r2_svr}")

# Calculate MSLE
msle_svr = mean_squared_log_error(y_test, y_pred_svr)
print(f"SVR MSLE: {msle_svr}")

SVR MSE: 1.2036997271148402
SVR RMSE: 1.097132502077502
SVR MAE: 0.7337374240868745
SVR R-squared: 0.008811574256558652
SVR MSLE: 0.07499626759683131


### Naive Bayes

In [82]:
from sklearn.preprocessing import KBinsDiscretizer

# Discretize the target variable 'y' into bins
n_bins = 10  # You can adjust the number of bins
discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform', subsample=None)
y_binned = discretizer.fit_transform(y.values.reshape(-1, 1)).ravel()

In [83]:
# Train Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_binned[:len(X_train)])

# Predict on test data
y_pred_nb = nb_model.predict(X_test)

# Transform predictions back to the original scale
y_pred_nb_continuous = discretizer.inverse_transform(y_pred_nb.reshape(-1, 1)).ravel()

In [91]:
# Calculate MSE
mse_nb = mean_squared_error(y_test, y_pred_nb_continuous)
print(f"Naive Bayes MSE: {mse_nb}")

# Calculate RMSE
rmse_nb = np.sqrt(mse_nb)
print(f"Naive Bayes RMSE: {rmse_nb}")

# Calculate MAE
mae_nb = mean_absolute_error(y_test, y_pred_nb_continuous)
print(f"Naive Bayes MAE: {mae_nb}")

# Calculate R-squared
r2_nb = r2_score(y_test, y_pred_nb_continuous)
print(f"Naive Bayes R-squared: {r2_nb}")

# Calculate MSLE
msle_nb = mean_squared_log_error(y_test, y_pred_nb_continuous)
print(f"Naive Bayes MSLE: {msle_nb}")

Naive Bayes MSE: 6.955656982942431
Naive Bayes RMSE: 2.6373579550266646
Naive Bayes MAE: 2.2571961620469083
Naive Bayes R-squared: -4.727646637803315
Naive Bayes MSLE: 0.6380672162425051


### Linear Regression

In [85]:
# Train Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict on test data
y_pred_lr = lr_model.predict(X_test)

In [92]:
# Calculate MSE
mse_lr = mean_squared_error(y_test, y_pred_lr)
print(f"Linear Regression MSE: {mse_lr}")

# Calculate RMSE
rmse_lr = np.sqrt(mse_lr)
print(f"Linear Regression RMSE: {rmse_lr}")

# Calculate MAE
mae_lr = mean_absolute_error(y_test, y_pred_lr)
print(f"Linear Regression MAE: {mae_lr}")

# Calculate R-squared
r2_lr = r2_score(y_test, y_pred_lr)
print(f"Linear Regression R-squared: {r2_lr}")

# Calculate MSLE
msle_lr = mean_squared_log_error(y_test, y_pred_lr)
print(f"Linear Regression MSLE: {msle_lr}")

Linear Regression MSE: 1.0405568258481217
Linear Regression RMSE: 1.020076872518989
Linear Regression MAE: 0.8006471438904936
Linear Regression R-squared: 0.14315185184835433
Linear Regression MSLE: 0.0621649660399366


### Decision Tree Algorithms

In [87]:
# Train Decision Tree model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on test data
y_pred_dt = dt_model.predict(X_test)

In [93]:
# Calculate MSE
mse_dt = mean_squared_error(y_test, y_pred_dt)
print(f"Decision Tree MSE: {mse_dt}")

# Calculate RMSE
rmse_dt = np.sqrt(mse_dt)
print(f"Decision Tree RMSE: {rmse_dt}")

# Calculate MAE
mae_dt = mean_absolute_error(y_test, y_pred_dt)
print(f"Decision Tree MAE: {mae_dt}")

# Calculate R-squared
r2_dt = r2_score(y_test, y_pred_dt)
print(f"Decision Tree R-squared: {r2_dt}")

# Calculate MSLE
msle_dt = mean_squared_log_error(y_test, y_pred_dt)
print(f"Decision Tree MSLE: {msle_dt}")

Decision Tree MSE: 1.771523804384218
Decision Tree RMSE: 1.3309860271183234
Decision Tree MAE: 0.9056163985268463
Decision Tree R-squared: -0.45876405159897526
Decision Tree MSLE: 0.10088035955379121


# Interpretation of the Obtained Results

<h3>SVR (Support Vector Regression)</h3>
<ul>
<li><strong>MSE (Mean Squared Error):</strong> 1.2037</li>
<li><strong>RMSE (Root Mean Squared Error):</strong> 1.0971</li>
<li><strong>MAE (Mean Absolute Error):</strong> 0.7337</li>
<li><strong>R-squared:</strong> 0.0088 <span class="interpretation">Interpretation: The R-squared value is very close to zero, indicating the model explains very little of the variance in the target variable. It suggests limited explanatory power.</span></li>
<li><strong>MSLE (Mean Squared Logarithmic Error):</strong> 0.0750</li>
</ul>

<p>Interpretation: The SVR model exhibits errors of 1.2037 for MSE, 1.0971 for RMSE, and 0.7337 for MAE. These values indicate a moderate level of error in terms of absolute differences between predicted and actual values. However, the R-squared value being very close to zero suggests the model struggles to capture the underlying relationships in your data, limiting its explanatory power.

<h3>Naive Bayes</h3>
<ul>
<li><strong>MSE (Mean Squared Error):</strong> 6.9556</li>
<li><strong>RMSE (Root Mean Squared Error):</strong> 2.6374</li>
<li><strong>MAE (Mean Absolute Error):</strong> 2.2572</li>
<li><strong>R-squared:</strong> -4.7276 <span class="interpretation">Interpretation: The R-squared value is highly negative, indicating very poor performance, far worse than a simple mean prediction.</span></li>
<li><strong>MSLE (Mean Squared Logarithmic Error):</strong> 0.6381</li>
</ul>

<p>Interpretation: The Naive Bayes model exhibits errors of 6.9556 for MSE, 2.6374 for RMSE, and 2.2572 for MAE. These values indicate relatively high errors in terms of absolute differences between predicted and actual values. The highly negative R-squared value further reinforces that the model performs poorly, failing to capture the underlying relationships in your data and potentially performing worse than a simple average prediction.</p>

<h3>Linear Regression</h3>
<ul>
<li><strong>MSE (Mean Squared Error):</strong> 1.0406</li>
<li><strong>RMSE (Root Mean Squared Error):</strong> 1.0201</li>
<li><strong>MAE (Mean Absolute Error):</strong> 0.8006</li>
<li><strong>R-squared:</strong> 0.1432 <span class="interpretation">Interpretation: The R-squared value is positive and closer to 0.25, indicating a slight improvement in explaining the variance in the target variable compared to a simple mean prediction.</span></li>
<li><strong>MSLE (Mean Squared Logarithmic Error):</strong> 0.0622</li>
</ul>

<p>Interpretation: The Linear Regression model shows improvement in terms of MSE (1.0406), RMSE (1.0201), and MAE (0.8006) compared to the previous results. These values indicate a lower level of error in absolute differences between predicted and actual values. The R-squared value (0.1432) is now slightly positive and closer to 0.25, suggesting a small improvement in capturing some of the variance in the target variable compared to a simple mean prediction.</p>

<h3>Decision Tree</h3>
<ul>
<li><strong>MSE (Mean Squared Error):</strong> 1.7715</li>
<li><strong>RMSE (Root Mean Squared Error):</strong> 1.3310</li>
<li><strong>MAE (Mean Absolute Error):</strong> 0.9056</li>
<li><strong>R-squared:</strong> -0.4588 <span class="interpretation">Interpretation: The R-squared value is highly negative, indicating poor performance in explaining the variance in the target variable. While it's not as negative as Naive Bayes, it suggests the model still struggles to capture the underlying relationships.</span></li>
<li><strong>MSLE (Mean Squared Logarithmic Error):</strong> 0.1009</li>
</ul>

<p>Interpretation: The Decision Tree model exhibits errors of 1.7715 for MSE, 1.3310 for RMSE, and 0.9056 for MAE. These values indicate a moderate level of error in terms of absolute differences between predicted and actual values. However, the highly negative R-squared value (-0.4588) suggests the model performs poorly in explaining the relationships within your data. While it might outperform Naive Bayes in this regard, it still struggles to capture the underlying patterns.</p>

<h3>Summary and Choice of Best Model</h3>

<p>Evaluating the Models: Here's a summary of the performance metrics for each model:</p>

<table>
<tr>
<th>Model</th>
<th>MSE</th>
<th>RMSE</th>
<th>MAE</th>
<th>R-squared</th>
</tr>
<tr>
<td>Linear Regression</td>
<td>1.0406</td>
<td>1.0201</td>
<td>0.8006</td>
<td>0.1432</td>
</tr>
<tr>
<td>SVR</td>
<td>1.2037</td>
<td>1.0971</td>
<td>0.7337</td>
<td>0.0088</td>
</tr>
<tr>
<td>Decision Tree</td>
<td>1.7715</td>
<td>1.3310</td>
<td>0.9056</td>
<td>-0.4588</td>
</tr>
<tr>
<td>Naive Bayes</td>
<td>6.9556</td>
<td>2.6374</td>
<td>2.2572</td>
<td>-4.7276</td>
</tr>
</table>

<p>Choosing the Best Model: Based on the metrics, Linear Regression seems to be the preferred choice at this stage. It exhibits the lowest MSE, RMSE, and MAE, indicating a lower level of error in absolute terms. While its R-squared value remains low, it's positive and shows a slight improvement compared to other models, suggesting some ability to capture the variance in the target variable.</p>