In [58]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC  # Support Vector Classifier for SVM
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
import spacy
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from pymongo import MongoClient

In [59]:
pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


In [60]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

BERT embeddings and BERT execution

In [61]:
def get_bert_embeddings(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # Use mean pooling to convert the token embeddings into a single sentence embedding
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

In [62]:
data_BERT = pd.read_csv(r'C:\Users\HP\Desktop\Eitacies\Detection of Python Code presence in English sentences\Shuffled Training Data.csv', encoding='ISO-8859-1')

In [63]:
texts = data_BERT['Text'].tolist()  # Assuming 'Text' column contains text data
bert_embeddings = get_bert_embeddings(texts)

In [64]:
# Prepare labels and split data
y = data_BERT['Class'].values
X_train_bert, X_test_bert, y_train_bert, y_test_bert = train_test_split(bert_embeddings, y, test_size=0.2, random_state=42)

Preprocessing

Make text to a lowercase

In [65]:
data_normal = pd.read_csv(r'C:\Users\HP\Desktop\Eitacies\Detection of Python Code presence in English sentences\Shuffled Training Data.csv', encoding='ISO-8859-1')

In [66]:
data_normal['Text'] = data_normal['Text'].str.replace("[^a-zA-Z]", " ",regex=True).str.lower()

Remove stopwords

In [67]:
data_normal['Text'] = data_normal['Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (ENGLISH_STOP_WORDS)]))

In [68]:
data_normal

Unnamed: 0,Text,Class
0,time think heart executes heart beat make race...,1
1,function def calculate area radius return radi...,1
2,book club help choose book book choice books a...,1
3,wrote little script true print love computer c...,1
4,choose movie night film random selection impor...,1
...,...,...
75,automated morning alarm play music using true ...,1
76,energy refreshing s secret,0
77,conversations favorite novel want end,0
78,manage expenses use total expenses sum expense...,1


Stemming

In [69]:
stemmer = PorterStemmer()
data_normal['Text'] = data_normal['Text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

In [70]:
data_normal

Unnamed: 0,Text,Class
0,time think heart execut heart beat make race f...,1
1,function def calcul area radiu return radiu us...,1
2,book club help choos book book choic book avai...,1
3,wrote littl script true print love comput cons...,1
4,choos movi night film random select import ran...,1
...,...,...
75,autom morn alarm play music use true play musi...,1
76,energi refresh s secret,0
77,convers favorit novel want end,0
78,manag expens use total expens sum expens end m...,1


Lemmatization

In [71]:
!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 656.4 kB/s eta 0:00:20
      --------------------------------------- 0.3/12.8 MB 2.5 MB/s eta 0:00:06
     - -------------------------------------- 0.6/12.8 MB 3.6 MB/s eta 0:00:04
     --- ------------------------------------ 1.2/12.8 MB 5.3 MB/s eta 0:00:03
     ----- ---------------------------------- 1.7/12.8 MB 6.2 MB/s eta 0:00:02
     ----- ---------------------------------- 1.8/12.8 MB 5.8 MB/s eta 0:00:02
     ------- -------------------------------- 2.4/12.8 MB 6.7 MB/s eta 0:00:02
     --------- ------------------------------ 2.9/12.8 MB 7.1 MB/s eta 0:00:02
     ---------- ----------------------------- 3

In [72]:
data_normal['Text'] = data_normal['Text'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))

TF-IDF with n-gram

In [73]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X = tfidf_vectorizer.fit_transform(data_normal['Text']).toarray()
y = data_normal['Class'].values

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Naive Bayes

In [75]:
#without BERT

In [76]:
MNB_model = MultinomialNB()

In [77]:
#without BERT

In [78]:
MNB_model.fit(X_train, y_train)

In [81]:
#BERT does not work with Negative values

Grid Search for Single Naive Bayes

In [82]:
alphas = np.logspace(-2, 1, 20)
param_grid = {'alpha': alphas}
grid_search = GridSearchCV(estimator=MNB_model, param_grid=param_grid, n_jobs=-1, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("Best Alpha: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

Best Alpha:  {'alpha': 1.1288378916846884}
Best Score:  0.75


In [83]:
best_mnb = MultinomialNB(alpha=grid_search.best_params_['alpha'])
best_mnb.fit(X_train, y_train)

Evaluation for training data- Single Naive Bayes

In [84]:
y_train_NB_pred = best_mnb.predict(X_train)
accuracy_NB_train = accuracy_score(y_train, y_train_NB_pred)
print(f"Accuracy: {accuracy_NB_train*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_train, y_train_NB_pred))

Accuracy: 100.00%

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        32

    accuracy                           1.00        64
   macro avg       1.00      1.00      1.00        64
weighted avg       1.00      1.00      1.00        64



Evaluation for test set- Single Naive Bayes

In [85]:
y_test_NB_pred = best_mnb.predict(X_test)
accuracy_NB_test = accuracy_score(y_test, y_test_NB_pred)
print(f"Accuracy: {accuracy_NB_test*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_test_NB_pred))

Accuracy: 62.50%

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.78      0.70         9
           1       0.60      0.43      0.50         7

    accuracy                           0.62        16
   macro avg       0.62      0.60      0.60        16
weighted avg       0.62      0.62      0.61        16



Cross Validation for Single Naive Bayes

In [86]:
n_folds = 5

In [87]:
scores_NB = cross_val_score(best_mnb, X, y, cv=n_folds)
print(f"Accuracy scores for each fold: {scores_NB}")
print(f"Mean cross-validation accuracy: {scores_NB.mean():.2f}")
print(f"Standard deviation of cross-validation accuracy: {scores_NB.std():.2f}")

Accuracy scores for each fold: [0.6875 0.6875 0.625  0.625  0.5625]
Mean cross-validation accuracy: 0.64
Standard deviation of cross-validation accuracy: 0.05


Bagging classifier as ensemble model for Naive Bayes

In [88]:
base_model = best_mnb
bagging_clf = BaggingClassifier(estimator=base_model, n_estimators=10, random_state=42)
bagging_clf.fit(X_train, y_train)

Evaluation for Bagging for Naive Bayes- Train

In [89]:
y_train_NB_bagging = bagging_clf.predict(X_train)
accuracy_NB_bagging_train = accuracy_score(y_train, y_train_NB_bagging)
print(f"Accuracy: {accuracy_NB_bagging_train*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_train, y_train_NB_bagging))

Accuracy: 100.00%

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        32

    accuracy                           1.00        64
   macro avg       1.00      1.00      1.00        64
weighted avg       1.00      1.00      1.00        64



Evaluation for Bagging for Naive Bayes- Test

In [90]:
y_test_NB_bagging = bagging_clf.predict(X_test)
accuracy_NB_bagging_test = accuracy_score(y_test, y_test_NB_bagging)
print(f"Accuracy: {accuracy_NB_bagging_test*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_test_NB_bagging))

Accuracy: 75.00%

Classification Report:
               precision    recall  f1-score   support

           0       0.69      1.00      0.82         9
           1       1.00      0.43      0.60         7

    accuracy                           0.75        16
   macro avg       0.85      0.71      0.71        16
weighted avg       0.83      0.75      0.72        16



Logistic Regression

In [91]:
#Without BERT

In [92]:
LR_model = LogisticRegression(random_state=42, max_iter=1000)
LR_model.fit(X_train, y_train)

In [101]:
#With BERT embeddings

In [94]:
LR_model_bert = LogisticRegression(random_state=42, max_iter=1000)
LR_model_bert.fit(X_train_bert, y_train_bert)

In [102]:
#Grid search without BERT embedding

In [96]:
param_grid = {
    'C': np.logspace(-4, 4, 20),
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['saga'],  # 'saga' solver supports all penalty types
    'class_weight': [None, 'balanced'],
    'l1_ratio': np.linspace(0, 1, 10)  # Only used with 'elasticnet' penalty
}
grid_search = GridSearchCV(estimator=LR_model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy', verbose=2)
grid_search.fit(X_train, y_train)
print("Best Parameters: ", grid_search.best_params_)
print("Best Cross-Validation Score: ", grid_search.best_score_)

Fitting 5 folds for each of 1200 candidates, totalling 6000 fits




Best Parameters:  {'C': 0.004832930238571752, 'class_weight': 'balanced', 'l1_ratio': 0.0, 'penalty': 'l2', 'solver': 'saga'}
Best Cross-Validation Score:  0.8102564102564103


In [100]:
#Grid search with BERT-based embeddings

In [99]:
param_grid_bert = {
    'C': np.logspace(-4, 4, 20),
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['saga'],  # 'saga' solver supports all penalty types
    'class_weight': [None, 'balanced'],
    'l1_ratio': np.linspace(0, 1, 10)  # Only used with 'elasticnet' penalty
}
grid_search_bert = GridSearchCV(estimator=LR_model_bert, param_grid=param_grid_bert, cv=5, n_jobs=-1, scoring='accuracy', verbose=2)
grid_search_bert.fit(X_train, y_train)
print("Best Parameters: ", grid_search_bert.best_params_)
print("Best Cross-Validation Score: ", grid_search_bert.best_score_)

Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
Best Parameters:  {'C': 0.004832930238571752, 'class_weight': 'balanced', 'l1_ratio': 0.0, 'penalty': 'l2', 'solver': 'saga'}
Best Cross-Validation Score:  0.8102564102564103




There is no difference between BERT embeddings and non-BERT embeddings

Re-training based on grid search result

In [103]:
LR_model = LogisticRegression(
    C=0.004832930238571752,
    class_weight='balanced',  # Explicitly specifying None for class_weight
    l1_ratio=0.0,
    penalty='l2',
    solver='saga',
    max_iter=1000,  # Ensuring a high number of iterations for convergence
    random_state=42  # For reproducibility
)
LR_model.fit(X_train, y_train)



Evaluation for training data- Logistic Regression

In [104]:
y_train_LR_pred = LR_model.predict(X_train)
accuracy_LR_train = accuracy_score(y_train, y_train_LR_pred)
print(f"Accuracy: {accuracy_LR_train*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_train, y_train_LR_pred))

Accuracy: 100.00%

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        32

    accuracy                           1.00        64
   macro avg       1.00      1.00      1.00        64
weighted avg       1.00      1.00      1.00        64



Evaluation for test set- Logistic Regression

In [105]:
y_test_LR_pred = LR_model.predict(X_test)
accuracy_LR_test = accuracy_score(y_test, y_test_LR_pred)
print(f"Accuracy: {accuracy_LR_test*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_test_LR_pred))

Accuracy: 68.75%

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.78      0.74         9
           1       0.67      0.57      0.62         7

    accuracy                           0.69        16
   macro avg       0.68      0.67      0.68        16
weighted avg       0.69      0.69      0.68        16



Support Vector Machine

In [106]:
#without BERT embeddings

In [107]:
SVM_model = SVC(C=0.6,random_state=42,kernel='rbf')
SVM_model.fit(X_train, y_train)

In [108]:
#with BERT embeddings

In [109]:
SVM_model_BERT = SVC(C=0.6,random_state=42,kernel='rbf')
SVM_model_BERT.fit(X_train_bert, y_train_bert)

Grid search SVM

In [110]:
parameters = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
}

In [115]:
#Grid search for BERT embeddings

In [116]:
grid_search_SVM_BERT = GridSearchCV(SVM_model_BERT, parameters, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

In [118]:
grid_search_SVM_BERT.fit(X_train_bert, y_train_bert)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [119]:
print("Best parameters:", grid_search_SVM_BERT.best_params_)
print("Best score:", grid_search_SVM_BERT.best_score_)

Best parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best score: 0.9846153846153847


In [111]:
#Grid search for non-BERT embeddings

In [120]:
grid_search_SVM = GridSearchCV(SVM_model, parameters, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

In [121]:
grid_search_SVM.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [114]:
print("Best parameters:", grid_search_SVM.best_params_)
print("Best score:", grid_search_SVM.best_score_)

Best parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best score: 0.7794871794871795


Update the model according to Grid Search

In [None]:
SVM_model = SVC(C=1,random_state=42,kernel='linear',gamma='scale')
SVM_model.fit(X_train, y_train)

Evaluation for training data- SVM

In [None]:
y_train_SVM_pred = SVM_model.predict(X_train)
accuracy_SVM_train = accuracy_score(y_train, y_train_SVM_pred)
print(f"Accuracy: {accuracy_SVM_train*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_train, y_train_SVM_pred))

Evaluation for test set- SVM

In [None]:
y_test_SVM_pred = SVM_model.predict(X_test)
accuracy_SVM_test = accuracy_score(y_test, y_test_SVM_pred)
print(f"Accuracy: {accuracy_SVM_test*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_test_SVM_pred))

Cross Validation for SVM

In [None]:
scores_SVM = cross_val_score(SVM_model, X, y, cv=n_folds)
print(f"Accuracy scores for each fold: {scores_SVM}")
print(f"Mean cross-validation accuracy: {scores_SVM.mean():.2f}")
print(f"Standard deviation of cross-validation accuracy: {scores_SVM.std():.2f}")

In [None]:
unseen_data = pd.read_csv(r'C:\Users\HP\Desktop\Eitacies\Detection of Python Code presence in English sentences\Unseen Data.csv', encoding='ISO-8859-1')

In [None]:
unseen_data['Text'] = unseen_data['Text'].str.replace("[^a-zA-Z]", " ",regex=True).str.lower()

In [None]:
unseen_data['Text'] = unseen_data['Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (ENGLISH_STOP_WORDS)]))
stemmer = PorterStemmer()
unseen_data['Text'] = unseen_data['Text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
unseen_data['Text'] = unseen_data['Text'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))

In [None]:
unseen_data_tfidf = tfidf_vectorizer.transform(unseen_data['Text']).toarray()

In [None]:
unseen_data_tfidf

Unseen data prediction with Single NB

In [None]:
new_predictions_NB = best_mnb.predict(unseen_data_tfidf)

In [None]:
accuracy_new_predictions_NB = accuracy_score(unseen_data['Class'], new_predictions_NB)
print(f"Accuracy for Single NB: {accuracy_new_predictions_NB*100:.2f}%")
print("\nClassification Report for Single NB:\n", classification_report(unseen_data['Class'], new_predictions_NB))

Unseen data prediction with Bagging NB

In [None]:
new_predictions_Bagging_NB = bagging_clf.predict(unseen_data_tfidf)
accuracy_new_predictions_Bagging_NB = accuracy_score(unseen_data['Class'], new_predictions_Bagging_NB)
print(f"Accuracy for Bagging NB: {accuracy_new_predictions_Bagging_NB*100:.2f}%")
print("\nClassification Report for Bagging NB:\n", classification_report(unseen_data['Class'], new_predictions_Bagging_NB))

In [None]:
new_predictions_LR = LR_model.predict(unseen_data_tfidf)

In [None]:
accuracy_new_predictions_LR = accuracy_score(unseen_data['Class'], new_predictions_LR)
print(f"Accuracy: {accuracy_new_predictions_LR*100:.2f}%")
print("\nClassification Report:\n", classification_report(unseen_data['Class'], new_predictions_LR))

In [None]:
new_predictions_SVM = SVM_model.predict(unseen_data_tfidf)

In [None]:
accuracy_new_predictions_SVM = accuracy_score(unseen_data['Class'], new_predictions_SVM)
print(f"Accuracy: {accuracy_new_predictions_SVM*100:.2f}%")
print("\nClassification Report:\n", classification_report(unseen_data['Class'], new_predictions_SVM))

In [None]:
new_predictions_SVM

Random Forest classifier

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=1000, random_state=42)

In [None]:
rf_classifier.fit(X_train, y_train)

In [None]:
rf_predictions = rf_classifier.predict(X_test)

In [None]:
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f'Random Forest Accuracy: {rf_accuracy}')

In [None]:
new_predictions_RF = rf_classifier.predict(unseen_data_tfidf)

In [None]:
accuracy_new_predictions_RF = accuracy_score(unseen_data['Class'], new_predictions_RF)
print(f"Accuracy: {accuracy_new_predictions_RF*100:.2f}%")
print("\nClassification Report:\n", classification_report(unseen_data['Class'], new_predictions_RF))