In [1]:
import nltk

In [2]:
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\overj\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\overj\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\overj\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\overj\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\overj\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]   

True

In [3]:
import pandas as pd
import string
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
data = pd.read_csv("dataset.csv")

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,Researchers have discovered a new species of b...,Scientists have found a previously unknown but...,1
1,1,The moon orbits the Earth in approximately 27....,Our natural satellite takes around 27.3 days t...,1
2,2,Water is composed of two hydrogen atoms and on...,H2O consists of 2 hydrogen atoms and 1 oxygen ...,1
3,3,The history of Rome dates back to 753 BC.,Rome has a long history that can be traced bac...,1
4,4,Pluto was once considered the ninth planet in ...,"In the past, Pluto was classified as the ninth...",1


In [6]:
data['label'].value_counts()

label
0    187
1    183
Name: count, dtype: int64

In [7]:
def preprocess_text(text):
    text = text.translate(str.maketrans("","",string.punctuation))
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    text = " ".join((word for word in text.split() if word not in stop_words))
    return text
    
preprocess_text("This,.,.,.,.,.,.,.,. is ##@$$^* text for DUmmy Text")

'text dummy text'

In [8]:
data['source_text'] = data['source_text'].apply(preprocess_text)
data['plagiarized_text'] = data['plagiarized_text'].apply(preprocess_text)

In [9]:
tfidf_vectorizer = TfidfVectorizer()
x = tfidf_vectorizer.fit_transform(data['source_text']+" "+data["plagiarized_text"])

In [10]:
print(x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2152 stored elements and shape (370, 1112)>
  Coords	Values
  (0, 854)	0.23991410374904304
  (0, 293)	0.23991410374904304
  (0, 672)	0.18646903930972664
  (0, 938)	0.44856484895246024
  (0, 142)	0.4798282074980861
  (0, 53)	0.3483837261559235
  (0, 815)	0.19161700361200087
  (0, 892)	0.23991410374904304
  (0, 414)	0.17419186307796175
  (0, 779)	0.23991410374904304
  (0, 1050)	0.23991410374904304
  (0, 545)	0.23991410374904304
  (1, 647)	0.18617822331065687
  (1, 703)	0.23310447789701255
  (1, 309)	0.15762982348385968
  (1, 66)	0.18617822331065687
  (1, 12)	0.4662089557940251
  (1, 267)	0.43583296389857457
  (1, 661)	0.18617822331065687
  (1, 886)	0.23310447789701255
  (1, 993)	0.21791648194928728
  (1, 72)	0.38390486276730146
  (1, 213)	0.23310447789701255
  (1, 695)	0.1811763767657395
  (1, 702)	0.21791648194928728
  :	:
  (365, 659)	0.4928093977943177
  (365, 526)	0.5184467929434351
  (366, 490)	0.41873273286716306
  (366,

In [11]:
y = data['label']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state = 42)

In [13]:
model = LogisticRegression()
model.fit(x_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [14]:
y_pred = model.predict(x_test)

In [15]:
print("accuracy ", accuracy_score(y_test,y_pred))
print("classification")
print(classification_report(y_test,y_pred))
print("confusion ")
print(confusion_matrix(y_test,y_pred))

accuracy  0.8243243243243243
classification
              precision    recall  f1-score   support

           0       0.79      0.86      0.82        35
           1       0.86      0.79      0.83        39

    accuracy                           0.82        74
   macro avg       0.83      0.83      0.82        74
weighted avg       0.83      0.82      0.82        74

confusion 
[[30  5]
 [ 8 31]]


In [16]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 100, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("accuracy ", accuracy_score(y_test,y_pred))
print("classification")
print(classification_report(y_test,y_pred))
print("confusion ")
print(confusion_matrix(y_test,y_pred))

accuracy  0.7972972972972973
classification
              precision    recall  f1-score   support

           0       0.71      0.97      0.82        35
           1       0.96      0.64      0.77        39

    accuracy                           0.80        74
   macro avg       0.83      0.81      0.79        74
weighted avg       0.84      0.80      0.79        74

confusion 
[[34  1]
 [14 25]]


In [17]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("accuracy ", accuracy_score(y_test,y_pred))
print("classification")
print(classification_report(y_test,y_pred))
print("confusion ")
print(confusion_matrix(y_test,y_pred))

accuracy  0.8648648648648649
classification
              precision    recall  f1-score   support

           0       0.86      0.86      0.86        35
           1       0.87      0.87      0.87        39

    accuracy                           0.86        74
   macro avg       0.86      0.86      0.86        74
weighted avg       0.86      0.86      0.86        74

confusion 
[[30  5]
 [ 5 34]]


In [18]:
from sklearn.svm import SVC
model = SVC(kernel = 'linear', random_state = 42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("accuracy ", accuracy_score(y_test,y_pred))
print("classification")
print(classification_report(y_test,y_pred))
print("confusion ")
print(confusion_matrix(y_test,y_pred))

accuracy  0.8783783783783784
classification
              precision    recall  f1-score   support

           0       0.86      0.89      0.87        35
           1       0.89      0.87      0.88        39

    accuracy                           0.88        74
   macro avg       0.88      0.88      0.88        74
weighted avg       0.88      0.88      0.88        74

confusion 
[[31  4]
 [ 5 34]]


In [19]:
import pickle 
pickle.dump(model,open('model.pkl','wb'))
pickle.dump(tfidf_vectorizer, open('tfidf_vectorizer.pkl','wb'))

In [20]:
model = pickle.load(open('model.pkl','rb'))
tfidf_vectorizer= pickle.load(open('tfidf_vectorizer.pkl','rb'))

In [21]:
def detect(input_text):
    vectorized_text = tfidf_vectorizer.transform([input_text])
    result = model.predict(vectorized_text)
    return "Plagiarism Detected" if result[0]==1 else "No Plagiarism"

In [22]:
input_text  = "Researchers have discoverd a new species of butterfly in Amazon"
detect(input_text)

'Plagiarism Detected'

In [23]:
input_text  = "Playing INSTRUMENT ENHANCES CREAIVITY"
detect(input_text)

'No Plagiarism'

In [24]:
import sklearn
sklearn.__version__

'1.7.1'

In [25]:
!pip install -q sentence-transformers


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
from sentence_transformers import SentenceTransformer, util

In [28]:
bert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
print("Loaded")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loaded


In [29]:
similarities = []
for i in range(len(data)):
    emb1 = bert_model.encode(data['source_text'][i], convert_to_tensor=True)
    emb2 = bert_model.encode(data['plagiarized_text'][i], convert_to_tensor=True)
    sim = util.pytorch_cos_sim(emb1,emb2).item()
    similarities.append(sim)

data['similarity_score'] = similarities

In [30]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label,similarity_score
0,0,researchers discovered new species butterfly a...,scientists found previously unknown butterfly ...,1,0.894752
1,1,moon orbits earth approximately 273 days,natural satellite takes around 273 days comple...,1,0.711025
2,2,water composed two hydrogen atoms one oxygen atom,h2o consists 2 hydrogen atoms 1 oxygen atom,1,0.80622
3,3,history rome dates back 753 bc,rome long history traced back 753 bc,1,0.951375
4,4,pluto considered ninth planet solar system,past pluto classified ninth planet suns planet...,1,0.903891


In [39]:
data['predicted_label'] =(data['similarity_score']>=0.9).astype(int)
print('score computed')
print(data[['source_text','plagiarized_text','similarity_score','predicted_label']].head())

score computed
                                         source_text  \
0  researchers discovered new species butterfly a...   
1           moon orbits earth approximately 273 days   
2  water composed two hydrogen atoms one oxygen atom   
3                     history rome dates back 753 bc   
4         pluto considered ninth planet solar system   

                                    plagiarized_text  similarity_score  \
0  scientists found previously unknown butterfly ...          0.894752   
1  natural satellite takes around 273 days comple...          0.711025   
2        h2o consists 2 hydrogen atoms 1 oxygen atom          0.806220   
3               rome long history traced back 753 bc          0.951375   
4  past pluto classified ninth planet suns planet...          0.903891   

   predicted_label  
0                0  
1                0  
2                0  
3                1  
4                1  


In [40]:
y_true = data['label']
y_pred = data['predicted_label']

print("Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

Accuracy: 0.5459459459459459

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.20      0.31       187
           1       0.52      0.90      0.66       183

    accuracy                           0.55       370
   macro avg       0.60      0.55      0.48       370
weighted avg       0.60      0.55      0.48       370

Confusion Matrix:
 [[ 37 150]
 [ 18 165]]


In [41]:
def detect_bert(text1, text2, threshold= 0.9):
    emb1 = bert_model.encode(text1, convert_to_tensor=True)
    emb2 = bert_model.encode(text2, convert_to_tensor=True)
    sim = util.pytorch_cos_sim(emb1, emb2).item()
    result = 'Plagiarism Detected' if sim>=threshold else "No Plagiarism"
    return result, round(sim*100,2)

In [42]:
text1 = "Researchers have discovered a new species of butterfly in the Amazon rainforest."
text2 = "Scientists have found a previously unknown butterfly species in the Amazon jungle."
result, score = detect_bert(text1, text2)
print(result, "| Similarity:", score, "%")

text1 = "Playing musical instruments enhances creativity."
text2 = "Yoga helps in maintaining physical flexibility."
result, score = detect_bert(text1, text2)
print(result, "| Similarity:", score, "%")

No Plagiarism | Similarity: 89.94 %
No Plagiarism | Similarity: 19.59 %


In [43]:
with open("bert_plagiarism_model.pkl", "wb") as f:
    pickle.dump(bert_model, f)