In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import glob

import sklearn
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score

In [2]:
# Input folders, one per class. Each is passed to load_data(), which reads the
# "*.parquet.gzip" files it contains; rows from the encrypted folder are
# labelled is_encrypted = 1 and rows from the plaintext folder is_encrypted = 0.
ENCRYPTED_DIRECTORY = "data/govdocs_encrypted"
PLAINTEXT_DIRECTORY = "data/govdocs_plaintext"

In [3]:
def load_data(directory):
    """Read every ``*.parquet.gzip`` file directly inside ``directory`` into one DataFrame.

    Parameters
    ----------
    directory : str or path-like
        Folder that holds the parquet files. Only the folder itself is
        searched; sub-folders are not descended into.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, taken in sorted path order.
        Each file keeps its own index, so index labels may repeat.

    Raises
    ------
    ValueError
        If ``directory`` contains no matching file.
    """
    # The pattern used to be written with "**", but without recursive=True
    # glob treats that exactly like "*", so spell out what is really matched.
    # Sorting makes the row order independent of the filesystem listing order.
    paths = sorted(glob.glob(f"{directory}/*.parquet.gzip"))
    if not paths:
        # Same exception type pd.concat raises for an empty list, but with a
        # message that points at the actual cause.
        raise ValueError(f"No '*.parquet.gzip' files found in {directory}")
    return pd.concat([pd.read_parquet(path) for path in paths])

# Load both corpora and attach the class label: 0 = plaintext, 1 = encrypted.
df_plaintext = load_data(PLAINTEXT_DIRECTORY)
df_plaintext["is_encrypted"] = 0
df_encrypted = load_data(ENCRYPTED_DIRECTORY)
df_encrypted["is_encrypted"] = 1

master_df = pd.concat([df_plaintext, df_encrypted])

# Optional: restrict the experiment to a single file type.
#master_df = master_df[master_df['extended.extension'] == '.webp']
#print("only webp: ", master_df.shape)

# Shuffle the rows. The fixed seed keeps the shuffled order the same on every
# run (for the same input rows in the same order), so results are repeatable.
master_df = master_df.sample(frac=1, random_state=42).reset_index(drop=True)

# A missing 'baseline.chisquare_end' is replaced by 0.0; rows that still have
# a NaN in any other column are dropped.
master_df['baseline.chisquare_end'] = master_df['baseline.chisquare_end'].fillna(0.0)
master_df = master_df.dropna()

print(master_df.shape)
print(master_df.columns)


(154849, 19)
Index(['extended.extension', 'baseline.head_shannon_entropy',
       'baseline.tail_shannon_entropy', 'baseline.shannon_entropy',
       'baseline.montecarlo_pi', 'baseline.chisquare_full',
       'baseline.chisquare_begin', 'baseline.chisquare_end',
       'baseline.autocorrelation_full', 'baseline.autocorrelation_begin',
       'baseline.autocorrelation_end', 'baseline.filesize',
       'advanced.kurtosis_full', 'advanced.kurtosis_end',
       'advanced.kurtosis_begin', 'advanced.skew_full', 'advanced.skew_begin',
       'advanced.skew_end', 'is_encrypted'],
      dtype='object')


In [4]:
# Features for this experiment: only the "baseline.*" columns. Change the
# prefix test below to experiment with other feature groups.
feature_columns = [c for c in master_df.columns if c.startswith("baseline")]
columns_to_consider = feature_columns + ['is_encrypted']

interesting_df = master_df[columns_to_consider]

# X is every selected column except the label; y is the 0/1 label itself.
X = interesting_df[feature_columns]
y = interesting_df['is_encrypted']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [5]:
# Standardise the features, then classify with a random forest. Only this
# pipeline is fitted; to try a linear baseline instead, swap the second step
# for ('LogisticRegressor', LogisticRegression()).
# The fixed random_state makes the forest, and therefore the scores printed
# below, repeatable from run to run.
estimators = [('std,', StandardScaler()), ('RFC', RandomForestClassifier(random_state=42))]
pipeline = Pipeline(estimators)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Each of the four results is an array with one entry per class label.
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred)
print(precision, recall, fscore, support)
# Single F1 figure for the positive class (is_encrypted == 1).
print(f"F1 = {f1_score(y_test, y_pred)}")

[0.7280643  0.78222644] [0.78761774 0.72169542] [0.75667105 0.75074278] [24842 26259]
F1 = 0.7507427801766827


In [6]:
# Summary statistics of the training features, one feature per row.
# (To inspect rows with missing values instead, filter on
#  X_train.isna().any(axis=1).)
print(X_train.describe().transpose())

                                   count          mean           std  \
baseline.head_shannon_entropy   103748.0  6.130503e+00  1.433917e+00   
baseline.tail_shannon_entropy   103748.0  6.235085e+00  1.533775e+00   
baseline.shannon_entropy        103748.0  7.345662e+00  1.263632e+00   
baseline.montecarlo_pi          103748.0  3.111838e+00  4.150980e-01   
baseline.chisquare_full         103748.0  1.344052e+07  7.722609e+07   
baseline.chisquare_begin        103748.0  5.878226e+03  3.446648e+03   
baseline.chisquare_end          103748.0  5.333473e+03  3.836814e+03   
baseline.autocorrelation_full   103748.0  9.596022e-02  1.771501e-01   
baseline.autocorrelation_begin  103748.0  2.126721e-01  2.527107e-01   
baseline.autocorrelation_end    103748.0  1.373303e-01  2.387487e-01   
baseline.filesize               103748.0  3.216961e+05  1.677556e+06   

                                      min            25%           50%  \
baseline.head_shannon_entropy    0.116115       5.188624  6.9