In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load and preprocess data
df = pd.read_csv('meq_data.csv')
df.fillna("", inplace=True)

# Combine question, optimal answer, and keywords into one column
df['question_keywords'] = df['QuestionText'] + ' ' + df['OptimalAnswer'] + ' ' + df['Keywords']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df[['question_keywords', 'StudentAnswer']], df['Score'], test_size=0.2, random_state=42)

# Transform text data
vectorizer = TfidfVectorizer()
X_train_question_keywords = vectorizer.fit_transform(X_train['question_keywords'])
X_train_student_answer = vectorizer.transform(X_train['StudentAnswer'])
X_test_question_keywords = vectorizer.transform(X_test['question_keywords'])
X_test_student_answer = vectorizer.transform(X_test['StudentAnswer'])

# Combine the transformed features
X_train_combined = hstack([X_train_question_keywords, X_train_student_answer])
X_test_combined = hstack([X_test_question_keywords, X_test_student_answer])

# Train model
model = LinearRegression()
model.fit(X_train_combined, y_train)

# Make predictions and evaluate model
predictions = model.predict(X_test_combined)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')


In [1]:
import pandas as pd

# Create a dummy DataFrame with the structure defined
data = {
    'QuestionID': ['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2', 'Q3', 'Q3', 'Q3', 'Q3'],
    'StudentID': ['S1', 'S2', 'S3', 'S1', 'S2', 'S3', 'S1', 'S2', 'S3', 'S4'],
    'AnswerID': ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10'],
    'QuestionText': [
        'What is diabetes?',
        'What is diabetes?',
        'What is diabetes?',
        'Define hypertension',
        'Define hypertension',
        'Define hypertension',
        'Describe the process of photosynthesis',
        'Describe the process of photosynthesis',
        'Describe the process of photosynthesis',
        'Describe the process of photosynthesis'
    ],
    'OptimalAnswer': [
        'A chronic disease where the body cannot regulate blood sugar levels due to lack of insulin or insulin resistance.',
        'A chronic disease where the body cannot regulate blood sugar levels due to lack of insulin or insulin resistance.',
        'A chronic disease where the body cannot regulate blood sugar levels due to lack of insulin or insulin resistance.',
        'A condition where the blood pressure in the arteries is persistently elevated.',
        'A condition where the blood pressure in the arteries is persistently elevated.',
        'A condition where the blood pressure in the arteries is persistently elevated.',
        'The process by which green plants use sunlight to synthesize nutrients from carbon dioxide and water.',
        'The process by which green plants use sunlight to synthesize nutrients from carbon dioxide and water.',
        'The process by which green plants use sunlight to synthesize nutrients from carbon dioxide and water.',
        'The process by which green plants use sunlight to synthesize nutrients from carbon dioxide and water.'
    ],
    'Keywords': [
        'chronic, disease, insulin, sugar',
        'chronic, disease, insulin, sugar',
        'chronic, disease, insulin, sugar',
        'condition, blood pressure, high',
        'condition, blood pressure, high',
        'condition, blood pressure, high',
        'green, plants, sunlight, synthesize, nutrients, carbon dioxide, water',
        'green, plants, sunlight, synthesize, nutrients, carbon dioxide, water',
        'green, plants, sunlight, synthesize, nutrients, carbon dioxide, water',
        'green, plants, sunlight, synthesize, nutrients, carbon dioxide, water'
    ],
    'StudentAnswer': [
        'Diabetes is a chronic disease affecting blood sugar levels due to problems with insulin.',
        'Diabetes is when your blood sugar levels are too high because of issues with insulin.',
        'Diabetes is a condition where the body cannot manage blood sugar properly.',
        'Hypertension is a condition where the blood pressure is higher than normal.',
        'Hypertension means high blood pressure in the arteries.',
        'Hypertension is when the blood pressure stays high over time.',
        'Photosynthesis is how plants use sunlight to turn CO2 and water into food.',
        'Photosynthesis is the process plants use to make food using sunlight.',
        'Plants make food from sunlight and CO2, and this is called photosynthesis.',
        'Photosynthesis is when plants convert light energy into chemical energy.'
    ],
    'Score': [4, 3, 5, 4, 5, 3, 5, 4, 4, 5],
    'Comments': [
        'Good answer but missed mentioning insulin resistance.',
        'Lacks detail about insulin resistance.',
        'Excellent answer with all key points covered.',
        'Good explanation but could use more detail.',
        'Well explained with all necessary details.',
        'Needs more details about the persistent nature.',
        'Comprehensive answer with all key points.',
        'Good explanation but missing some details.',
        'Well explained but could be more detailed.',
        'Excellent explanation with all key points covered.'
    ]
}

df = pd.DataFrame(data)

# Export the DataFrame to a CSV file
csv_file_path = 'G:/My Drive/M7taso 7k/Project/AI/meq_data.csv'
df.to_csv(csv_file_path, index=False)

csv_file_path


'G:/My Drive/M7taso 7k/Project/AI/meq_data.csv'

In [2]:
import pandas as pd

df = pd.read_csv('./meq_data.csv')


In [3]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df['StudentAnswer'])
X_test = vectorizer.transform(test_df['StudentAnswer'])


In [5]:
y_train = train_df['Score']
y_test = test_df['Score']


In [6]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)


In [7]:
from sklearn.svm import SVR

model = SVR()
model.fit(X_train, y_train)


In [8]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('meq_data.csv')

# Split the dataset into training and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# BERT-specific imports
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
import torch

# Tokenizer and Model Initialization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

# Tokenization and Dataset Preparation
class MEQDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MEQDataset(train_df['StudentAnswer'].tolist(), train_df['Score'].tolist())
test_dataset = MEQDataset(test_df['StudentAnswer'].tolist(), test_df['Score'].tolist())

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Training
trainer.train()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in 'g:\\My Drive\\M7taso 7k\\Project\\AI' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/yahyakassab/huggingface/39dae0e66f324f999ed8351713ad1263

100%|██████████| 3/3 [00:01<00:00,  2.78it/s][1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m -------------------------------------------------------------------------------

{'train_runtime': 4.8899, 'train_samples_per_second': 4.908, 'train_steps_per_second': 0.614, 'train_loss': 1.861700216929118, 'epoch': 3.0}


[1;38;5;39mCOMET INFO:[0m Please wait for metadata to finish uploading (timeout is 3600 seconds)
100%|██████████| 3/3 [00:04<00:00,  1.38s/it]


TrainOutput(global_step=3, training_loss=1.861700216929118, metrics={'train_runtime': 4.8899, 'train_samples_per_second': 4.908, 'train_steps_per_second': 0.614, 'total_flos': 234341699616.0, 'train_loss': 1.861700216929118, 'epoch': 3.0})

In [14]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Function to load the checkpoint
def load_ckp(checkpoint_path, model, optimizer):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, start_epoch, valid_loss_min

# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Load the trained model parameters
model, optimizer, start_epoch, valid_loss_min = load_ckp('path_to_saved_model/best_model.pt', model, optimizer)

# Function to predict the score for a new answer
def predict_score(model, tokenizer, text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    
    # Move inputs to the same device as the model
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    
    # Put the model in evaluation mode
    model.eval()
    
    # Disable gradient calculation
    with torch.no_grad():
        # Forward pass, get logits
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Convert logits to predicted score (assuming classification)
        predicted_score = torch.argmax(logits, dim=1).item()
        
    return predicted_score

# Example new student answer
new_answer = "Your new student answer here."

# Predict the score
predicted_score = predict_score(model, tokenizer, new_answer)
print(predicted_score)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FileNotFoundError: [Errno 2] No such file or directory: 'path_to_saved_model/best_model.pt'

In [10]:
from sklearn.metrics import mean_squared_error, r2_score

y_pred = model.predict(X_test)
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('R^2 Score:', r2_score(y_test, y_pred))


AttributeError: 'BertForSequenceClassification' object has no attribute 'predict'

In [11]:
# Evaluation with the Trainer
eval_results = trainer.evaluate()
print(eval_results)


100%|██████████| 1/1 [00:00<00:00, 1002.70it/s]

{'eval_loss': 2.0698540210723877, 'eval_runtime': 0.1574, 'eval_samples_per_second': 12.706, 'eval_steps_per_second': 6.353, 'epoch': 3.0}





In [12]:
import joblib

# Save Linear Regression/SVM Model
joblib.dump(model, 'meq_grading_model.pkl')

# Save BERT Model
model.save_pretrained('path_to_save_model')
tokenizer.save_pretrained('path_to_save_tokenizer')


('path_to_save_tokenizer\\tokenizer_config.json',
 'path_to_save_tokenizer\\special_tokens_map.json',
 'path_to_save_tokenizer\\vocab.txt',
 'path_to_save_tokenizer\\added_tokens.json')

In [13]:
# Load Linear Regression/SVM Model
loaded_model = joblib.load('meq_grading_model.pkl')
new_answer = "Your new student answer here."
new_answer_transformed = vectorizer.transform([new_answer])
predicted_score = loaded_model.predict(new_answer_transformed)
print(predicted_score)

# Load BERT Model
from transformers import BertTokenizer, BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('path_to_saved_model')
tokenizer = BertTokenizer.from_pretrained('path_to_saved_tokenizer')
inputs = tokenizer(new_answer, return_tensors='pt')
outputs = model(**inputs)
predicted_score = torch.argmax(outputs.logits, dim=1)
print(predicted_score.item())


AttributeError: 'BertForSequenceClassification' object has no attribute 'predict'