### Importing Modules

In [1]:
import nltk
nltk.download('popular')
import pandas as pd
import string
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |   

### Loading Dataset

In [2]:
data = pd.read_csv("H:\Files\DS & DA PROJECTS\plagarism_dataset.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,Researchers have discovered a new species of b...,Scientists have found a previously unknown but...,1
1,1,The moon orbits the Earth in approximately 27....,Our natural satellite takes around 27.3 days t...,1
2,2,Water is composed of two hydrogen atoms and on...,H2O consists of 2 hydrogen atoms and 1 oxygen ...,1
3,3,The history of Rome dates back to 753 BC.,Rome has a long history that can be traced bac...,1
4,4,Pluto was once considered the ninth planet in ...,"In the past, Pluto was classified as the ninth...",1


In [3]:
data['label'].value_counts()

0    187
1    183
Name: label, dtype: int64

In [4]:
data.shape

(370, 4)

### Cleaning Text

In [5]:
def preprocessing_text(text):
    # removing punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # converting to lowercase
    text = text.lower()
    # removing stopwords
    stop_words = set(stopwords.words('english'))
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text

# preprocessing_text("Hey, Hi; How are you? How have you been?")

In [None]:
data['source_text'] = data['source_text'].apply(preprocessing_text)
data['plagiarized_text'] = data['plagiarized_text'].apply(preprocessing_text)

In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,researchers discovered new species butterfly a...,scientists found previously unknown butterfly ...,1
1,1,moon orbits earth approximately 273 days,natural satellite takes around 273 days comple...,1
2,2,water composed two hydrogen atoms one oxygen atom,h2o consists 2 hydrogen atoms 1 oxygen atom,1
3,3,history rome dates back 753 bc,rome long history traced back 753 bc,1
4,4,pluto considered ninth planet solar system,past pluto classified ninth planet suns planet...,1


### Vectorization

In [12]:
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['source_text'] + " " + data['plagiarized_text'])

In [13]:
y = data['label']

### Train Test Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state = 40)

### Applying LogisticRegression

In [15]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification Report: ", classification_report(y_test, y_pred))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))

Accuracy:  0.8243243243243243
Classification Report:                precision    recall  f1-score   support

           0       0.77      0.92      0.84        37
           1       0.90      0.73      0.81        37

    accuracy                           0.82        74
   macro avg       0.84      0.82      0.82        74
weighted avg       0.84      0.82      0.82        74

Confusion Matrix:  [[34  3]
 [10 27]]


### Applying Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=40)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification Report: ", classification_report(y_test, y_pred))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))

Accuracy:  0.7837837837837838
Classification Report:                precision    recall  f1-score   support

           0       0.70      1.00      0.82        37
           1       1.00      0.57      0.72        37

    accuracy                           0.78        74
   macro avg       0.85      0.78      0.77        74
weighted avg       0.85      0.78      0.77        74

Confusion Matrix:  [[37  0]
 [16 21]]


### Applying Naive Bayes

In [17]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification Report: ", classification_report(y_test, y_pred))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))

Accuracy:  0.8378378378378378
Classification Report:                precision    recall  f1-score   support

           0       0.82      0.86      0.84        37
           1       0.86      0.81      0.83        37

    accuracy                           0.84        74
   macro avg       0.84      0.84      0.84        74
weighted avg       0.84      0.84      0.84        74

Confusion Matrix:  [[32  5]
 [ 7 30]]


### Applying Support Vector Machine

In [18]:
from sklearn.svm import SVC

model = SVC(kernel='linear', random_state=40)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification Report: ", classification_report(y_test, y_pred))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))

Accuracy:  0.8513513513513513
Classification Report:                precision    recall  f1-score   support

           0       0.82      0.89      0.86        37
           1       0.88      0.81      0.85        37

    accuracy                           0.85        74
   macro avg       0.85      0.85      0.85        74
weighted avg       0.85      0.85      0.85        74

Confusion Matrix:  [[33  4]
 [ 7 30]]


### Saving SVM model and Vectorizer

In [20]:
import pickle
pickle.dump(model,open('H:\Files\DS & DA PROJECTS\model.pkl', 'wb'))
pickle.dump(tfidf_vectorizer,open('H:\Files\DS & DA PROJECTS\ztfidf_vectorizer.pkl', 'wb'))

### Loading Model and Vectorizer

In [22]:
import pickle
model = pickle.load(open('H:\Files\DS & DA PROJECTS\model.pkl', 'rb'))
tfidf_vectorizer = pickle.load(open('H:\Files\DS & DA PROJECTS\ztfidf_vectorizer.pkl', 'rb'))

### Detection System

In [23]:
def detect(input_text):
    # vectozied the text
    vectorized_text = tfidf_vectorizer.transform([input_text])
    # prediction by the model
    result = model.predict(vectorized_text)
    return "Plagarism Detected" if result[0] == 1 else "No Plagarism"

In [31]:
# example
input_text = 'The human brain consists of billions of neurons.'
detect(input_text)

'Plagarism Detected'