In [21]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re

In [22]:
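# One-time setup: uncomment and run once to download the WordNet data used by
# WordNetLemmatizer. The preprocessing in this notebook also needs the 'stopwords'
# corpus and the tokenizer models behind word_tokenize ('punkt', or 'punkt_tab'
# on recent NLTK releases).
#nltk.download('wordnet')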

In [23]:
# Read the answers dataset from the working directory; other cells in this
# notebook read its 'answer' and 'score' columns.
df = pd.read_csv(filepath_or_buffer='answers.csv')

def preprocess_text(text):
    """Normalise one free-text answer into a list of stemmed tokens.

    Steps, in order: replace every non-word character with a space,
    lowercase, tokenize with NLTK's ``word_tokenize``, drop English
    stopwords, lemmatize with WordNet, then apply the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw answer text. Anything that is not a string (for example the
        NaN that ``pandas.read_csv`` produces for an empty cell) is treated
        as an empty answer instead of raising ``TypeError`` in ``re.sub``.

    Returns
    -------
    list of str
        The processed tokens, in their original order; ``[]`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return []

    # Load the stopword list once per call and keep it in a set: the list
    # lookup inside the comprehension used to re-read it for every token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    cleaned = re.sub(r'\W', ' ', text).lower()
    return [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in word_tokenize(cleaned)
        if word not in stop_words
    ]

# Run every raw answer through preprocess_text and keep the resulting token
# list next to it in a new column.
df['processed_answers'] = df['answer'].map(preprocess_text)

In [24]:
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Dense embeddings: train Word2Vec on the token lists, once per training
# algorithm (sg=0 selects CBOW, sg=1 selects skip-gram).
w2v_model_cbow = Word2Vec(sentences=df['processed_answers'], vector_size=100, window=5, min_count=1, sg=0)
w2v_model_sg = Word2Vec(sentences=df['processed_answers'], vector_size=100, window=5, min_count=1, sg=1)

# The scikit-learn vectorizers take one string per document, so join each
# token list back into a space-separated string. This is done once and shared
# by both vectorizers instead of being recomputed for each of them.
answer_docs = df['processed_answers'].apply(' '.join)

# Sparse bag-of-words (raw term counts).
vectorizer_bow = CountVectorizer()
X_bow = vectorizer_bow.fit_transform(answer_docs)

# Sparse TF-IDF weighted features over the same documents.
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(answer_docs)

In [25]:
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

def _regression_metrics(y_true, y_pred):
    """Return ``(mse, rmse, mae, r2)`` for one set of predictions.

    RMSE is computed as the square root of MSE rather than through
    ``mean_squared_error(..., squared=False)``: that keyword was deprecated
    in scikit-learn 1.4 and removed in 1.6, so the old call fails on current
    releases.
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, mae, r2


y = df['score']  # regression target: the 'score' column

# Hold out 20% of the bag-of-words rows for evaluation; the fixed seed keeps
# the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

svr = SVR()
svr.fit(X_train, y_train)
y_pred_svr = svr.predict(X_test)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Seeded so the fitted tree (and therefore its metrics) is reproducible;
# without random_state, ties between equally good splits are broken randomly.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

mse_svr, rmse_svr, mae_svr, r2_svr = _regression_metrics(y_test, y_pred_svr)
mse_lr, rmse_lr, mae_lr, r2_lr = _regression_metrics(y_test, y_pred_lr)
mse_dt, rmse_dt, mae_dt, r2_dt = _regression_metrics(y_test, y_pred_dt)

# Results summary: one line per model
print(f'SVR MSE: {mse_svr}, RMSE: {rmse_svr}, MAE: {mae_svr:.3f}, R²: {r2_svr:.3f}')
print(f'Linear Regression MSE: {mse_lr}, RMSE: {rmse_lr}, MAE: {mae_lr:.3f}, R²: {r2_lr:.3f}')
print(f'Decision Tree MSE: {mse_dt}, RMSE: {rmse_dt}, MAE: {mae_dt:.3f}, R²: {r2_dt:.3f}')

SVR MSE: 1.0912853208973625, RMSE: 1.044646026602965, MAE: 0.723, R²: 0.147
Linear Regression MSE: 5.37334611648882, RMSE: 2.3180479107405914, MAE: 1.515, R²: -3.201
Decision Tree MSE: 1.4930769999078364, RMSE: 1.221915299809212, MAE: 0.768, R²: -0.167


SVR performs best among the three models on every reported metric (MSE, RMSE, MAE, and R²). It has the lowest errors and explains the largest share of the variance in the test scores, although that share is still small (R² ≈ 0.15).
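Linear regression and decision tree regression perform poorly: their negative R² values mean that, on the test set, they predict worse than a constant baseline that always outputs the mean score. These models might be overfitting or underfitting, or, in the case of linear regression, the relationship between the bag-of-words features and the score might not be linear.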
It's important to further investigate the features, model complexity, and data characteristics to understand why SVR outperforms the other models and how to potentially improve model performance. This could involve feature engineering, hyperparameter tuning, or exploring more advanced regression techniques.