In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

# Function to load datasets
def load_datasets():
    """Read the competition CSVs and the external training CSV from disk.

    The paths are hard-coded Kaggle input locations.

    Returns:
        tuple: ``(submission_df, test_df, train_df, external_train_df)``,
        each a ``pandas.DataFrame`` read with ``pd.read_csv``.
    """
    submission_path = "/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv"
    test_path = "/kaggle/input/llm-detect-ai-generated-text/test_essays.csv"
    train_path = "/kaggle/input/llm-detect-ai-generated-text/train_essays.csv"
    external_path = "/kaggle/input/daigt-proper-train-dataset/train_drcat_04.csv"

    # Files are read in the same order they are returned.
    submission_df = pd.read_csv(submission_path)
    test_df = pd.read_csv(test_path)
    train_df = pd.read_csv(train_path)
    external_train_df = pd.read_csv(external_path, sep=',')

    return submission_df, test_df, train_df, external_train_df

# Function to preprocess data
def preprocess_data(external_train_df, test_df):
    """Normalise the external training frame and build a joint TF-IDF matrix.

    Args:
        external_train_df: DataFrame with at least the columns ``essay_id``,
            ``label``, ``prompt``, ``text``, ``source`` and ``fold``.
        test_df: DataFrame with a ``text`` column.

    Returns:
        tuple: ``(X, cleaned_df)`` where ``X`` is the TF-IDF matrix fitted on
        the training texts followed by the test texts (training rows come
        first, so ``X[:len(cleaned_df)]`` are the training rows), and
        ``cleaned_df`` is the renamed training frame without ``source`` and
        ``fold``.
    """
    column_map = {'essay_id': 'id', 'label': 'generated', 'prompt': 'prompt_id'}
    cleaned_df = external_train_df.rename(columns=column_map)

    # Replace each distinct prompt value with an integer code.
    prompt_codes = pd.factorize(cleaned_df['prompt_id'])[0]
    cleaned_df['prompt_id'] = prompt_codes

    cleaned_df = cleaned_df.drop(columns=["source", "fold"])

    # Train texts first, test texts after: callers split X by row position.
    all_texts = pd.concat([cleaned_df['text'], test_df['text']], axis=0)

    tfidf = TfidfVectorizer(stop_words='english', max_features=50000)
    return tfidf.fit_transform(all_texts), cleaned_df

# Function to train and evaluate a model
def train_and_evaluate(model, X, y, folds=5):
    """Cross-validate ``model`` with stratified K-fold and report mean ROC AUC.

    Args:
        model: Estimator exposing ``fit`` and ``predict_proba``. It is fitted
            in place on every fold, so it is left fitted on the last fold's
            training split.
        X: Feature matrix indexable by integer arrays. It may hold more rows
            than ``y``; only the first ``len(y)`` rows are used.
        y: Label series supporting ``.iloc`` (e.g. a ``pandas.Series``).
        folds: Number of stratified folds.

    Returns:
        float: The mean ROC AUC across folds (also printed).
    """
    # Slice the labelled rows once instead of re-slicing on every fold.
    X_labeled = X[:len(y)]

    cv = StratifiedKFold(n_splits=folds)
    roc_scores = []

    for train_index, val_index in cv.split(X_labeled, y):
        model.fit(X_labeled[train_index], y.iloc[train_index])
        # Column 1 of predict_proba is used as the positive-class score.
        val_scores = model.predict_proba(X_labeled[val_index])[:, 1]
        roc_scores.append(roc_auc_score(y.iloc[val_index], val_scores))

    average_roc_score = sum(roc_scores) / len(roc_scores)
    print(f"Average ROC AUC score for {type(model).__name__}: {average_roc_score}")
    return average_roc_score

# Function to train and predict using an ensemble of models
def train_and_predict_ensemble(models, X_train, y_train, X_test):
    """Fit a soft-voting ensemble and return its train/test probabilities.

    Args:
        models: List of ``(name, estimator)`` pairs passed to
            ``VotingClassifier`` as ``estimators``.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Feature matrix to predict on.

    Returns:
        tuple: ``(preds_train, preds_test)``, column 1 of the ensemble's
        ``predict_proba`` output for the training and test matrices.
    """
    ensemble_clf = VotingClassifier(estimators=models, voting='soft')
    ensemble_clf.fit(X_train, y_train)

    # Score the training set once; running predict_proba over every member
    # of the ensemble a second time only repeats the same work.
    preds_train = ensemble_clf.predict_proba(X_train)[:, 1]
    train_auc = roc_auc_score(y_train, preds_train)

    # NOTE: the label says "Accuracy" but the value is the ROC AUC on the
    # training data. The text is kept as-is so the output stays unchanged.
    print(f'Accuracy: {train_auc}\n')

    preds_test = ensemble_clf.predict_proba(X_test)[:, 1]

    # Same training-set ROC AUC, measured on the data the ensemble was fit on.
    print('ROC AUC train:', train_auc)

    return preds_train, preds_test

# Main block to execute the code
if __name__ == "__main__":
    # Read all input tables.
    custom_submission_df, custom_test_df, custom_train_df, custom_external_train_df = load_datasets()

    # Build the joint TF-IDF matrix (train rows first, then test rows) and
    # the cleaned training frame. Despite its name, custom_y is that whole
    # frame; the labels live in its 'generated' column.
    custom_X, custom_y = preprocess_data(custom_external_train_df, custom_test_df)

    # Candidate classifiers.
    logistic_model = LogisticRegression()
    xgb_model = XGBClassifier()
    gbc_model = GradientBoostingClassifier()
    adbc_model = AdaBoostClassifier()
    bayes_model = MultinomialNB(alpha=0.02)
    sgd_model = SGDClassifier(max_iter=5000, loss="modified_huber", random_state=42)
    rf_model = RandomForestClassifier()

    # Named estimators, in the order they are evaluated and ensembled.
    models = [
        ('lr', logistic_model),
        ('xgb', xgb_model),
        ('gbc', gbc_model),
        ('adbc', adbc_model),
        ('bayes', bayes_model),
        ('sgd', sgd_model),
        ('rf', rf_model),
    ]

    # Cross-validate each candidate on its own.
    labels = custom_y['generated']
    for _, candidate in models:
        train_and_evaluate(candidate, custom_X, labels)

    # Rows before n_train are training essays; the rest are test essays.
    n_train = len(custom_y)
    preds_train, preds_test = train_and_predict_ensemble(
        models, custom_X[:n_train], labels, custom_X[n_train:]
    )

    # Write the test predictions in submission format.
    submission = pd.DataFrame({'id': custom_test_df["id"], 'generated': preds_test})
    submission.to_csv('submission.csv', index=False)




Average ROC AUC score for LogisticRegression: 0.9976010814307481
Average ROC AUC score for XGBClassifier: 0.9983312582364613
Average ROC AUC score for GradientBoostingClassifier: 0.9912243255143443
Average ROC AUC score for AdaBoostClassifier: 0.9867705747013114
Average ROC AUC score for MultinomialNB: 0.9909557638516768
Average ROC AUC score for SGDClassifier: 0.9981264440597704
Average ROC AUC score for RandomForestClassifier: 0.9958137984279647
Accuracy: 0.9998960544367967

ROC AUC train: 0.9998960544367967
