In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
df = pd.read_csv("../data/cleaned_plagiarism_data.csv")

X = df['clean_text']
y = df['generated']

In [5]:
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_tfidf = vectorizer.fit_transform(X)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [7]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [8]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9981818181818182
[[275   0]
 [  1 274]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       1.00      1.00      1.00       275

    accuracy                           1.00       550
   macro avg       1.00      1.00      1.00       550
weighted avg       1.00      1.00      1.00       550



In [9]:
import joblib

joblib.dump(model, "../models/plagiarism_model.pkl")
joblib.dump(vectorizer, "../models/tfidf_vectorizer.pkl")

['../models/tfidf_vectorizer.pkl']

In [11]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text_input(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    words = [w for w in words if w not in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    return " ".join(words)

def predict_text(text):
    cleaned = clean_text_input(text)
    text_vec = vectorizer.transform([cleaned])
    pred = model.predict(text_vec)[0]
    return "AI Generated" if pred == 1 else "Human Written"

In [12]:
print(predict_text("This assignment was written carefully by the student."))
print(predict_text("Artificial intelligence systems can autonomously generate essays."))

AI Generated
AI Generated


In [13]:
def predict_with_confidence(text):
    cleaned = clean_text_input(text)
    vec = vectorizer.transform([cleaned])
    prob = model.predict_proba(vec)[0]
    pred = model.predict(vec)[0]
    return {
        "prediction": "AI Generated" if pred == 1 else "Human Written",
        "confidence": round(max(prob) * 100, 2)
    }

predict_with_confidence("This assignment was written carefully by the student.")

{'prediction': 'AI Generated', 'confidence': 77.63}