In [9]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp37-cp37m-win_amd64.whl (7.1 MB)
     ---------------------------------------- 7.1/7.1 MB 12.0 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=0.11
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
     ---------------------------------------- 302.2/302.2 kB ? eta 0:00:00
Collecting scipy>=1.1.0
  Downloading scipy-1.7.3-cp37-cp37m-win_amd64.whl (34.1 MB)
     --------------------------------------- 34.1/34.1 MB 72.5 MB/s eta 0:00:00
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.0.2 scipy-1.7.3 threadpoolctl-3.1.0
Note: you may need to restart the kernel to use updated packages.


In [14]:
import numpy as np
import pandas as pd
import re
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pickle

In [12]:
# in this file, we will implement a model to distinguish between C code and Plain text.
# we will use the SVC model to classify the code snippets.
# In the training data, we have 2 classes: 0 for Plain text and 1 for C code.
# To train the data we will extract the features from the code snippets.
# The features we will use are:
# 1. Number of lines (will be used to calculate other features)
# 2. Average line length
# 3. Number of semicolons
# 4. Number of special characters: {}, (), [], #, /, \, +, *, %, = (note: '-' is not counted)
# 5. Number of keywords: if, else, for, while, do, break, continue, default, return, int, char, float, void
# 6. Number of comments i.e. lines starting with // or containing /* or */
# 7. Ratio of numeric values to the number of words

import numpy as np
import pandas as pd
import re
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


def extract_features(data: list) -> list:
    """Convert text snippets into numeric feature vectors.

    Every snippet is split on newlines and scanned one line at a time.
    The vector produced for a snippet holds, in this order:

    0. number of lines
    1. average line length in characters
    2. number of semicolons
    3. number of special characters (braces, parentheses, square brackets,
       '#', backslash, '+', '*', '/', '%', '=')
    4. number of whole-word keyword hits (if, else, for, while, do, break,
       continue, default, return, int, char, float, void)
    5. number of lines that start with '//' or contain '/*' or '*/'
    6. numeric tokens divided by word tokens (0 when there are no words)

    Args:
        data: Snippets to process, each one a string.

    Returns:
        One 7-element list per snippet, in input order.
    """
    # Patterns are compiled once up front and reused for every line.
    special_re = re.compile(r'[{}()\[\]#\\+*/%=]')
    keyword_re = re.compile(r'\b(if|else|for|while|do|break|continue|default|return|int|char|float|void)\b')
    number_re = re.compile(r'\b\d+\b')
    word_re = re.compile(r'\b\w+\b')

    vectors = []
    for snippet in data:
        rows = snippet.split('\n')
        line_count = len(rows)  # split() always yields at least one piece

        char_total = 0
        semicolons = 0
        specials = 0
        keywords = 0
        comment_lines = 0
        numeric_tokens = 0
        word_total = 0

        for row in rows:
            char_total += len(row)
            semicolons += row.count(';')
            specials += len(special_re.findall(row))
            keywords += len(keyword_re.findall(row))
            numeric_tokens += len(number_re.findall(row))
            word_total += len(word_re.findall(row))
            # '//' only counts at column 0; block-comment markers count anywhere.
            if row.startswith('//') or '/*' in row or '*/' in row:
                comment_lines += 1

        if word_total > 0:
            numeric_ratio = numeric_tokens / word_total
        else:
            numeric_ratio = 0

        vectors.append([
            line_count,
            char_total / line_count,
            semicolons,
            specials,
            keywords,
            comment_lines,
            numeric_ratio,
        ])
    return vectors

# Function that will read the data from the file and return the data in the form of a list
def read_data(file_path: str, delimiter: str, encoding: str = None) -> list:
    """Read a text file and split its content into entries.

    Args:
        file_path: Path of the file to read.
        delimiter: Marker string that separates consecutive entries.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the platform's default encoding, so existing calls behave
            as before; pass e.g. ``'utf-8'`` for platform-independent results.

    Returns:
        The entries in file order. Entries that are empty or contain only
        whitespace are dropped. Bytes that cannot be decoded are skipped
        (``errors='ignore'``).
    """
    with open(file_path, 'r', encoding=encoding, errors='ignore') as file:
        entries = file.read().split(delimiter)
    # A file that ends with the delimiter (optionally followed by a newline)
    # produces a trailing blank piece; keeping it would add an empty sample
    # to the data set, so blank pieces are filtered out.
    return [entry for entry in entries if entry.strip()]


# Load both corpora; each list element is one snippet
c_data = read_data('combined_code.txt', delimiter='THISISENDOFCODE')
text_data = read_data('aggregated_text.txt', delimiter='THISISENDOFENTRY')

# Class labels: 1 marks C code, 0 marks plain text
c_labels = [1 for _ in c_data]
text_labels = [0 for _ in text_data]

# C snippets come first, plain text second; labels follow the same order
data = [*c_data, *text_data]
labels = [*c_labels, *text_labels]
print(f"Total number of snippets: {len(data)}")
print("generating features...")
features = np.array(extract_features(data))


Total number of snippets: 99186
generating features...


In [13]:
# Dump the samples to features.txt, one per line: the seven feature values
# followed by the label, separated by single spaces
with open('features.txt', 'w') as out_file:
    for index, row in enumerate(features):
        values = " ".join(f"{row[col]}" for col in range(7))
        out_file.write(f"{values} {labels[index]}\n")

In [None]:
# Rebuild features and labels from features.txt. Every line holds the feature
# values followed by the label (the last field), separated by whitespace.
with open('features.txt', 'r') as file:
    features = []
    labels = []
    for line in file:
        fields = line.strip().split()
        features.append(list(map(float, fields[:-1])))
        labels.append(int(fields[-1]))


In [15]:
# Hold out 20% of the samples for evaluation. stratify=labels keeps the class
# ratio the same in both parts; the fixed random_state makes the split, and
# therefore the metrics printed below, reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=42
)

# class_weight='balanced' asks scikit-learn to weight classes inversely to
# their frequency in y_train.
model = SVC(class_weight='balanced', random_state=42)
print("Training the model...")
model.fit(x_train, y_train)

print("Model trained successfully.")

# Save the model; the with-block closes the file even if pickling fails
filename = 'tester_from_extractor.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model back from disk so the evaluation exercises the saved artifact.
# NOTE: pickle.load can execute arbitrary code - only load files you created.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Evaluate the model on the held-out part
y_pred = loaded_model.predict(x_test)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")


Training the model...
Model trained successfully.
Confusion Matrix:
[[19521    26]
 [    2   289]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19547
           1       0.92      0.99      0.95       291

    accuracy                           1.00     19838
   macro avg       0.96      1.00      0.98     19838
weighted avg       1.00      1.00      1.00     19838

Accuracy: 0.9985885673959068
