In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import pandas as pd
train_df = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/train.csv")
test_df = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/test.csv")
sub = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/sample_submission.csv")


# Split the data into training and validation sets
train_text = train_df['prompt']
train_labels = train_df['answer']
X_train, X_val, y_train, y_val = train_test_split(train_text, train_labels, test_size=0.1, random_state=42)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(lowercase=True, strip_accents='unicode', stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# Train a random forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_vec, y_train)


# Vectorize the test data using the same vectorizer
X_test_vec = vectorizer.transform(test_df['prompt'])

# Make predictions on the test data
predictions = model.predict_proba(X_test_vec)

# Get the three best choices for each prediction
best_choices = []
for prediction in predictions:
    top_choices = prediction.argsort()[-3:][::-1]
    best_choices.append([model.classes_[choice] for choice in top_choices])

# Create a DataFrame with the predictions
predictions_df = pd.DataFrame({'id': test_df['id'], 'prediction': best_choices})

# Format predictions as per submission requirements
predictions_df['prediction'] = predictions_df['prediction'].apply(lambda x: ' '.join(x))

# Save the predictions to a submission file
predictions_df.to_csv('submission.csv', index=False)

# Evaluate the predictions using Mean Average Precision @ 3 (MAP@3)
def calculate_map3(true_labels, predicted_labels):
    map3 = 0
    for true, pred in zip(true_labels, predicted_labels):
        relevant_labels = set(true.split())
        precision = 0
        num_predictions = 0
        for i, label in enumerate(pred.split()):
            if label in relevant_labels:
                precision += 1
                relevant_labels.remove(label)
                num_predictions += 1
            if num_predictions == 3:
                break
        map3 += precision / 3
    map3 /= len(true_labels)
    return map3

# Example usage:
true_labels = ['A B C', 'B', 'C A B']
predicted_labels = ['A B C', 'A', 'B C']
map3_score = calculate_map3(true_labels, predicted_labels)
print("MAP@3 Score:", map3_score)
from sklearn.model_selection import cross_val_score



# Perform cross-validation
scores = cross_val_score(model, X_train_vec, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", scores)
print("Mean Cross-Validation Score:", np.mean(scores))

# Fit the model on the entire training data
model.fit(X_train_vec, y_train)

# Vectorize the test data using the same vectorizer
X_test_vec = vectorizer.transform(test_df['prompt'])

# Make predictions on the test data
predictions = model.predict_proba(X_test_vec)

# Get the three best choices for each prediction
best_choices = []
for prediction in predictions:
    top_choices = prediction.argsort()[-3:][::-1]
    best_choices.append([model.classes_[choice] for choice in top_choices])

# Create a DataFrame with the predictions
predictions_df = pd.DataFrame({'id': test_df['id'], 'prediction': best_choices})

# Format predictions as per submission requirements
predictions_df['prediction'] = predictions_df['prediction'].apply(lambda x: ' '.join(x))

# Save the predictions to a submission file
predictions_df.to_csv('submission.csv', index=False)

# Evaluate the predictions using Mean Average Precision @ 3 (MAP@3)
map3_score = calculate_map3(y_val, predictions_df['prediction'])
print("MAP@3 Score:", map3_score)



MAP@3 Score: 0.5555555555555555
Cross-Validation Scores: [0.25       0.19444444 0.30555556 0.19444444 0.22222222]
Mean Cross-Validation Score: 0.2333333333333333
MAP@3 Score: 0.21666666666666665
