In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from gensim.models import Word2Vec
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from pyemd import emd
from spellchecker import SpellChecker

# Load the dataset
df = pd.read_csv('iot.csv')  # Replace 'your_dataset.csv' with the actual filename

# Initialize spell checker
spell = SpellChecker()

# Calculate feature values for each student answer
def calculate_features(answer, ideal_answer):
    features = []
    
    # Word count feature
    word_count = len(word_tokenize(answer))
    features.append(word_count)

    # Spelling error feature
    spelling_errors = calculate_spelling_errors(answer)
    features.append(spelling_errors)

    # Keywords matching feature
    keywords_matching = calculate_keywords_matching(answer, ideal_answer)
    features.append(keywords_matching)
    
    # Noun count feature
    noun_count = count_pos_tags_function(answer, 'NN')
    features.append(noun_count)

    # Verb count feature
    verb_count = count_pos_tags_function(answer, 'VB')
    features.append(verb_count)

    # Adjective count feature
    adj_count = count_pos_tags_function(answer, 'JJ')
    features.append(adj_count)
    
    # Word movers distance feature
    wmd_score = calculate_wmd_score(answer, ideal_answer, word2vec_model)
    features.append(wmd_score)

    return features

# Calculate spelling errors using SpellChecker
def calculate_spelling_errors(Test_Answer):
    words = word_tokenize(Test_Answer)
    misspelled = spell.unknown(words)
    return len(misspelled)

# Calculate keywords matching
def calculate_keywords_matching(Test_Answer, Standard_Answer):
    keywords = ['keyword1', 'keyword2', 'keyword3']  # Replace with your specific keywords
    answer_words = set(word_tokenize(Test_Answer.lower()))
    ideal_words = set(word_tokenize(Standard_Answer.lower()))
    matching_keywords = answer_words.intersection(keywords)
    return len(matching_keywords) / len(ideal_words)





In [2]:
# Count the number of words with a specific POS tag
def count_pos_tags_function(answer, pos_tag):
    tagged_answer = pos_tag(word_tokenize(answer))
    pos_count = len([word for word, pos in tagged_answer if pos.startswith(pos_tag)])
    return pos_count

In [3]:
# Calculate word movers distance between two sentences
def calculate_wmd(answer, ideal_answer, word2vec_model):
    s1 = word_tokenize(answer)
    s2 = word_tokenize(ideal_answer)
    return word2vec_model.wmdistance(s1, s2)

In [4]:
# Load the word2vec model (pre-trained or train your own)
word2vec_model = Word2Vec.load('word2vec.bin')  # Replace 'your_word2vec_model.bin' with the actual filename

In [5]:
# Create the feature matrix and target variable
X = []
y = []




In [6]:
for i, row in df.iterrows():
    answer = row['Test_Answer']
    ideal_answer = row['Standard_Answer']
    features = calculate_features(answer, ideal_answer)
    X.append(features)
    y.append(row['score'])

TypeError: 'str' object is not callable

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the linear regression model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred = regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.