<a href="https://colab.research.google.com/github/ZackMaster-16/Plagiarism-checker-ai/blob/main/MyML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import csv  # For CSV quoting constants used when parsing the raw TSV data
import zipfile  # For working with zip files

import pandas as pd  # For data manipulation and analysis
from sklearn.model_selection import train_test_split  # To split data into train/test sets
from sklearn.feature_extraction.text import TfidfVectorizer  # To convert text to numerical features
from sklearn.linear_model import LogisticRegression  # Logistic Regression classifier
from sklearn.svm import SVC  # Support Vector Machine classifier
from sklearn.svm import LinearSVC  # Linear SVM that scales to large sample counts
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes classifier
from sklearn.tree import DecisionTreeClassifier  # Decision Tree classifier
from sklearn.ensemble import RandomForestClassifier  # Random Forest classifier
from sklearn.metrics import accuracy_score  # To calculate model accuracy

In [None]:
# Unzip and load the data
# The member is streamed straight out of the archive; nothing is extracted to disk.
zip_path = 'archive.zip'
with zipfile.ZipFile(zip_path, 'r') as z:
    with z.open('train_snli.txt') as f:
        # The file is read as header-less, tab-separated text with three columns.
        # QUOTE_NONE makes the parser treat '"' as an ordinary character: with the
        # default quote handling, a sentence containing an unbalanced double quote
        # can swallow the following lines into a single field and silently
        # corrupt or merge rows.
        # NOTE(review): assumes fields are not CSV-quoted in the file — confirm.
        df = pd.read_csv(
            f,
            sep='\t',
            header=None,
            names=['sentence1', 'sentence2', 'label'],
            quoting=csv.QUOTE_NONE,
        )

In [None]:
# Basic data cleaning
# Discard every row that has a missing value, then normalise both sentence
# columns: lowercase them and remove each character that is neither a word
# character nor whitespace.
df = df.dropna()
for column in ('sentence1', 'sentence2'):
    lowered = df[column].str.lower()
    df[column] = lowered.str.replace(r'[^\w\s]', '', regex=True)
# The text fed to the vectorizer is the two cleaned sentences joined by one space.
df['text'] = df['sentence1'] + ' ' + df['sentence2']

In [None]:
# Split into training and testing sets
# test_size=0.2 reserves a fifth of the rows for evaluation; the fixed
# random_state makes the shuffle repeatable between runs.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], **split_options)


In [None]:
# Vectorize text using TF-IDF
# The vocabulary (capped at 10000 terms) is learned from the training text only;
# the test text is then projected onto that same vocabulary.
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


In [None]:
# Define classifiers
# Candidate models keyed by the display name that ends up in the results table.
# Each value only needs to provide fit() and predict().
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # LinearSVC (liblinear) replaces SVC(kernel='linear') (libsvm): SVC's fit time
    # grows at least quadratically with the number of samples, which makes it
    # impractical on large corpora, while LinearSVC trains a linear SVM that
    # scales to large sample counts. Scores may differ slightly from SVC because
    # the default loss and solver are different.
    # NOTE(review): assumes the training set is large (SNLI-scale) — confirm.
    'SVM (Linear Kernel)': LinearSVC(random_state=42),
    'Multinomial NB': MultinomialNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
}

In [None]:
# Train and evaluate
# Fit every candidate on the TF-IDF training matrix and store its accuracy on
# the TF-IDF test matrix under the model's display name.
results = {}
for name, clf in classifiers.items():
    fitted = clf.fit(X_train_tfidf, y_train)  # fit() hands back the estimator itself
    results[name] = accuracy_score(y_test, fitted.predict(X_test_tfidf))

In [None]:

# Present results
# One row per model (index = model name, single 'Accuracy' column), ordered
# from the highest accuracy to the lowest.
accuracy_table = pd.DataFrame.from_dict(results, orient='index', columns=['Accuracy'])
results_df = accuracy_table.sort_values(by='Accuracy', ascending=False)
print(results_df)

In [None]:
# Print best model
# results_df is sorted by descending accuracy, so its first row is the winner:
# the row label is the model name and its only value is the accuracy.
top_row = results_df.iloc[0]
best_model = top_row.name
best_acc = top_row.iloc[0]
print(f"Best model: {best_model} with accuracy {best_acc:.4f}")