USING THE FEATURES - COMMENTS, FILES_CHANGES, LINES_ADDED, LINES_REMOVED

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load data
data = pd.read_csv('data_csv.csv')  # replace 'your_data.csv' with your actual data path

data


Unnamed: 0,title,comments,files_changed,lines_added,lines_removed,label
0,Update README.md,2,1,1,0,spam
1,Update README.md,2,1,1,1,spam
2,Change the README.md and SECURITY.md,4,2,4,0,spam
3,Readme File Modified,2,1,1,1,spam
4,Update README.md,3,1,2,0,spam
...,...,...,...,...,...,...
173,Improve performance of data exports,8,4,45,12,not_spam
174,Document security best practices,4,1,15,2,not_spam
175,Fix loading spinner alignment,3,1,5,2,not_spam
176,Update CI/CD pipeline with new steps,6,2,20,5,not_spam


In [2]:
# Check for missing values
data.isnull().sum()

# Encode the target variable
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])  # 0 for 'spam', 1 for 'not_spam'

# Select features and target
X = data[['comments', 'files_changed', 'lines_added', 'lines_removed']]
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [3]:
# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [4]:
# Predict on test data
y_pred = model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.8888888888888888
Precision: 0.8
Recall: 0.9230769230769231
F1 Score: 0.8571428571428571
Confusion Matrix:
 [[20  3]
 [ 1 12]]


USING LOGISTIC REGRESSION

In [5]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the Logistic Regression model
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train, y_train)

# Predict on test data
y_pred_logistic = logistic_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_logistic)
precision = precision_score(y_test, y_pred_logistic)
recall = recall_score(y_test, y_pred_logistic)
f1 = f1_score(y_test, y_pred_logistic)
conf_matrix = confusion_matrix(y_test, y_pred_logistic)

# Print results
print("Logistic Regression Results:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)


Logistic Regression Results:
Accuracy: 0.6944444444444444
Precision: 0.6
Recall: 0.46153846153846156
F1 Score: 0.5217391304347826
Confusion Matrix:
 [[19  4]
 [ 7  6]]


USING THE FEATURES - TITLE, COMMENTS, FILES_CHANGES, LINES_ADDED, LINES_REMOVED

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd

# Load data
data = pd.read_csv('data_csv.csv')  # Replace with your actual file path

data

Unnamed: 0,title,comments,files_changed,lines_added,lines_removed,label
0,Update README.md,2,1,1,0,spam
1,Update README.md,2,1,1,1,spam
2,Change the README.md and SECURITY.md,4,2,4,0,spam
3,Readme File Modified,2,1,1,1,spam
4,Update README.md,3,1,2,0,spam
...,...,...,...,...,...,...
173,Improve performance of data exports,8,4,45,12,not_spam
174,Document security best practices,4,1,15,2,not_spam
175,Fix loading spinner alignment,3,1,5,2,not_spam
176,Update CI/CD pipeline with new steps,6,2,20,5,not_spam


In [7]:
# Encode the target variable
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

In [11]:
# TF-IDF encoding for the 'title' feature
tfidf_vectorizer = TfidfVectorizer(max_features=100)  # Adjust max_features based on your data
tfidf_vectorizer
title_tfidf = tfidf_vectorizer.fit_transform(data['title']).toarray()
title_tfidf


array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.62692719, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.31519956],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [12]:
# Combine TF-IDF features with existing numerical features
numerical_features = data[['comments', 'files_changed', 'lines_added', 'lines_removed']]
X = pd.concat([pd.DataFrame(title_tfidf), numerical_features.reset_index(drop=True)], axis=1)
y = data['label']

In [13]:
# Convert all column names to strings
X.columns = X.columns.astype(str)

# Then split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_test


array([[-0.11807718,  1.86966598, -0.16839566, ..., -0.01882034,
        -0.53547627, -1.06688127],
       [-0.11807718, -0.41130228, -0.16839566, ..., -0.01882034,
        -1.10558035, -0.46490588],
       [-0.11807718, -0.41130228, -0.16839566, ..., -0.90964977,
        -0.03663521, -0.66556434],
       ...,
       [-0.11807718, -0.41130228, -0.16839566, ..., -0.90964977,
        -0.89179132, -0.86622281],
       [-0.11807718, -0.41130228, -0.16839566, ..., -0.01882034,
        -0.39295026, -0.86622281],
       [-0.11807718,  2.39555046, -0.16839566, ..., -0.01882034,
        -0.53547627, -1.06688127]])

In [14]:
# Train a Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# trying user input
X_test_custom = pd.DataFrame([[1, 10, 20, 5]])
X_test_custom = pd.concat([pd.DataFrame(tfidf_vectorizer.transform(['Readme update']).toarray()), X_test_custom], axis=1)
X_test_custom = scaler.transform(X_test_custom)
y_pred_custom = model.predict(X_test_custom)

y_pred_custom.item()



0

In [15]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.8888888888888888
Precision: 0.8461538461538461
Recall: 0.8461538461538461
F1 Score: 0.8461538461538461
Confusion Matrix:
 [[21  2]
 [ 2 11]]


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Step 1: Load the data
df = pd.read_csv("data_csv.csv")  # Replace 'data.csv' with your data file

# Separate features and target
X = df.drop("label", axis=1)
y = df["label"]

# Step 2: Handle categorical and text features
# TF-IDF vectorization for 'title' column
tfidf = TfidfVectorizer(max_features=100)  # adjust max_features based on vocabulary size

# Step 3: Column transformer to handle different types of preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("title_tfidf", tfidf, "title"),  # Apply TF-IDF to the 'title' column
        ("num", StandardScaler(), ["comments", "files_changed", "lines_added", "lines_removed"])
    ],
    remainder="drop"  # drop columns that are not explicitly mentioned
)

# Step 4: Create a pipeline with preprocessing and the model
pipeline = make_pipeline(preprocessor, RandomForestClassifier(random_state=42))

# Step 5: Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train the model
pipeline.fit(X_train, y_train)

# Step 7: Make predictions
y_pred = pipeline.predict(X_test)

# Step 8: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label="spam")
recall = recall_score(y_test, y_pred, pos_label="spam")
f1 = f1_score(y_test, y_pred, pos_label="spam")
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.8611111111111112
Precision: 0.7857142857142857
Recall: 0.8461538461538461
F1 Score: 0.8148148148148148
Confusion Matrix:
 [[20  3]
 [ 2 11]]
