Import necessary libraries

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib

Load the dataset

In [None]:
# Read the dataset from 'text.csv' (path is relative to the current working
# directory). Later cells read its 'text' and 'generated' columns.
data = pd.read_csv('text.csv')

In [None]:
# Progress marker for the data-loading step.
print("loading done")

Split the dataset into features (X) and labels (y)

In [None]:
# Features are the raw text column; labels are the 'generated' column.
X, y = data['text'], data['generated']

Split the data into training and testing sets

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed so the split is
# reproducible. The split is stratified on the labels so that the training
# and test subsets keep the dataset's class proportions: this pipeline
# oversamples the training data afterwards, so the classes are expected to
# be imbalanced, and an unstratified split could leave the test set
# unrepresentative of the real class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Splitting done")

Create a TF-IDF vectorizer

In [None]:
# TF-IDF features: English stop words are removed and the vocabulary is
# capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

Fit the TF-IDF vectorizer on the training data and transform it

In [None]:
# Fit the vectorizer on the training split only and convert the training
# texts to a TF-IDF matrix. The test split is transformed later with this
# same fitted vectorizer, so no test-set vocabulary leaks into training.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

Apply oversampling using SMOTE

In [None]:
# Oversample the vectorized training split with SMOTE (fixed seed for
# reproducibility). Only the training data is resampled; the test split is
# left as-is for evaluation.
oversampler = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_tfidf, y_train)

Train a Logistic Regression model on the resampled data

In [None]:
# Fit a logistic-regression classifier (default hyperparameters) on the
# oversampled TF-IDF training features. fit() returns the fitted estimator,
# so construction and training are chained.
model = LogisticRegression().fit(X_train_resampled, y_train_resampled)

Save the trained model and vectorizer

In [None]:
# Persist both artifacts: the vectorizer must be saved together with the
# model so new text can be transformed the same way at prediction time.
for artifact, filename in (
    (model, 'ai_detection_model.pkl'),
    (tfidf_vectorizer, 'tfidf_vectorizer.pkl'),
):
    joblib.dump(artifact, filename)

Evaluate the model on the test set

In [None]:
# Reuse the vectorizer fitted on the training data (transform only, no
# refit) so the test features share the training vocabulary.
X_test_tfidf = tfidf_vectorizer.transform(X_test)
# Predicted labels for the held-out test texts.
y_pred = model.predict(X_test_tfidf)

Print evaluation metrics

In [None]:
# Report accuracy, the per-class classification report, and the confusion
# matrix for the test-set predictions. Each header and its value are
# printed on separate lines via sep="\n".
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")