In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import glob

import sklearn
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, recall_score, precision_score
from sklearn.metrics import f1_score

In [2]:
ENCRYPTED_DIRECTORY = "data/govdocs_encrypted"
PLAINTEXT_DIRECTORY = "data/govdocs_plaintext"

In [3]:
def load_data(directory):
    dataframes = list()
    for f in glob.glob(f"{directory}/**.csv.gz"):
        df = pd.read_csv(f)
        dataframes.append(df)
    return pd.concat(dataframes)

df_plaintext = load_data(PLAINTEXT_DIRECTORY)
df_plaintext["is_encrypted"] = 0
df_encrypted = load_data(ENCRYPTED_DIRECTORY)
df_encrypted["is_encrypted"] = 1

master_df = pd.concat([df_plaintext, df_encrypted])

# Filter out only webp
# master_df = master_df[master_df['extended.extension'] != '.webp']
print("only webp: ", master_df.shape)

# Filter out files which are larger than 4k
#master_df = master_df[master_df['baseline.filesize'] > 4096]
print("greater than 4096: ", master_df.shape)

# Shuffle
master_df = master_df.sample(frac=1).reset_index(drop=True)

# fill Nan with 0
master_df['baseline.chisquare_end'] = master_df['baseline.chisquare_end'].fillna(0.0)
master_df.dropna(inplace=True)


  df_plaintext["is_encrypted"] = 0
  df_encrypted["is_encrypted"] = 1


only webp:  (160507, 528)
greater than 4096:  (160507, 528)


In [4]:
#columns_to_consider = [c for c in master_df.columns]
#columns_to_consider = [c for c in master_df.columns if c.startswith("baseline")]
#columns_to_consider.append('is_encrypted')
columns_to_consider = [c for c in master_df.columns if c != 'extended.extension']


interesting_df = master_df[columns_to_consider]

X = interesting_df[[c for c in interesting_df.columns if c.startswith('baseline')]]
X = interesting_df[[c for c in interesting_df.columns if c != 'is_encrypted']]
X = X[[c for c in X.columns if c != 'is_encrypted']]

# get rid of tail and head metrics
X = X[[c for c in X.columns if "begin" not in c and "head" not in c]]
X = X[[c for c in X.columns if "tail" not in c and "end" not in c]]

X = X[[c for c in X.columns if "kurtosis" not in c and "skew" not in c]]

y = interesting_df['is_encrypted']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# ---------------------------------------------------------------

rfc = RandomForestClassifier(n_jobs=-1)
estimators = [('std,', MinMaxScaler()), ('LogisticRegressor', LogisticRegression(n_jobs=10, max_iter=1000))]
pipeline = Pipeline(estimators)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Precision : {precision}")
print(f"Recall    : {recall}")
print(f"Accuracy  : {accuracy}")
print(f"F1-score  : {f1}")

df_json = {
    "Extremities_Measured_Separately": [],
    "Algorithm": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1": []
}

df_json["Extremities_Measured_Separately"].append("NO")
df_json["Algorithm"].append("Logistic Regression")
df_json["Accuracy"].append(accuracy)
df_json["Precision"].append(precision)
df_json["Recall"].append(recall)
df_json["F1"].append(f1)

Precision : 0.6894083969465649
Recall    : 0.775006704210244
Accuracy  : 0.7138083143964711
F1-score  : 0.7297058452215629


## Now try the same thing with extremity measurements separately

In [5]:
columns_to_consider = [c for c in master_df.columns if c != 'extended.extension']

interesting_df = master_df[columns_to_consider]

X = interesting_df[[c for c in interesting_df.columns if c.startswith('baseline')]]
X = interesting_df[[c for c in interesting_df.columns if c != 'is_encrypted']]
X = X[[c for c in X.columns if c != 'is_encrypted']]

# get rid of tail and head metrics
# X = X[[c for c in X.columns if "begin" not in c and "head" not in c]]
# X = X[[c for c in X.columns if "tail" not in c and "end" not in c]]

X = X[[c for c in X.columns if "kurtosis" not in c and "skew" not in c]]

y = interesting_df['is_encrypted']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

rfc = RandomForestClassifier(n_jobs=-1)
estimators = [('std,', MinMaxScaler()), ('LogisticRegressor', LogisticRegression(n_jobs=10,  max_iter=1000))]
pipeline = Pipeline(estimators)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Precision : {precision}")
print(f"Recall    : {recall}")
print(f"Accuracy  : {accuracy}")
print(f"F1-score  : {f1}")


df_json["Extremities_Measured_Separately"].append("YES")
df_json["Algorithm"].append("Logistic Regression")
df_json["Accuracy"].append(accuracy)
df_json["Precision"].append(precision)
df_json["Recall"].append(recall)
df_json["F1"].append(f1)

Precision : 0.7941059865204176
Recall    : 0.8057119871279164
Accuracy  : 0.7990241946263869
F1-score  : 0.7998668885191348


In [6]:
#columns_to_consider = [c for c in master_df.columns]
#columns_to_consider = [c for c in master_df.columns if c.startswith("baseline")]
#columns_to_consider.append('is_encrypted')
columns_to_consider = [c for c in master_df.columns if c != 'extended.extension']


interesting_df = master_df[columns_to_consider]

X = interesting_df[[c for c in interesting_df.columns if c.startswith('baseline')]]
X = interesting_df[[c for c in interesting_df.columns if c != 'is_encrypted']]
X = X[[c for c in X.columns if c != 'is_encrypted']]

# get rid of tail and head metrics
X = X[[c for c in X.columns if "begin" not in c and "head" not in c]]
X = X[[c for c in X.columns if "tail" not in c and "end" not in c]]

X = X[[c for c in X.columns if "kurtosis" not in c and "skew" not in c]]

y = interesting_df['is_encrypted']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# -----------------------------------


rfc = RandomForestClassifier(n_jobs=-1)
estimators = [('std,', MinMaxScaler()), ('RFC', rfc)]
pipeline = Pipeline(estimators)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Precision : {precision}")
print(f"Recall    : {recall}")
print(f"Accuracy  : {accuracy}")
print(f"F1-score  : {f1}")

df_json["Extremities_Measured_Separately"].append("NO")
df_json["Algorithm"].append("Random Forest")
df_json["Accuracy"].append(accuracy)
df_json["Precision"].append(precision)
df_json["Recall"].append(recall)
df_json["F1"].append(f1)

Precision : 0.7000973709834469
Recall    : 0.7712523464735854
Accuracy  : 0.7212939446598048
F1-score  : 0.7339543192548168


In [8]:
#columns_to_consider = [c for c in master_df.columns]
#columns_to_consider = [c for c in master_df.columns if c.startswith("baseline")]
#columns_to_consider.append('is_encrypted')
columns_to_consider = [c for c in master_df.columns if c != 'extended.extension']


interesting_df = master_df[columns_to_consider]

X = interesting_df[[c for c in interesting_df.columns if c.startswith('baseline')]]
X = interesting_df[[c for c in interesting_df.columns if c != 'is_encrypted']]
X = X[[c for c in X.columns if c != 'is_encrypted']]

# get rid of tail and head metrics
#X = X[[c for c in X.columns if "begin" not in c and "head" not in c]]
#X = X[[c for c in X.columns if "tail" not in c and "end" not in c]]

X = X[[c for c in X.columns if "kurtosis" not in c and "skew" not in c]]

y = interesting_df['is_encrypted']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# -----------------------------------


rfc = RandomForestClassifier(n_jobs=-1)
estimators = [('std,', MinMaxScaler()), ('RFC', rfc)]
pipeline = Pipeline(estimators)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Precision : {precision}")
print(f"Recall    : {recall}")
print(f"Accuracy  : {accuracy}")
print(f"F1-score  : {f1}")

df_json["Extremities_Measured_Separately"].append("YES")
df_json["Algorithm"].append("Random Forest")
df_json["Accuracy"].append(accuracy)
df_json["Precision"].append(precision)
df_json["Recall"].append(recall)
df_json["F1"].append(f1)

Precision : 0.8209750102417043
Recall    : 0.8061142397425584
Accuracy  : 0.8157331907498997
F1-score  : 0.8134767607063123


In [18]:
df = pd.DataFrame(df_json)


In [20]:
df

Unnamed: 0,Extremities_Measured_Separately,Algorithm,Accuracy,Precision,Recall,F1
0,NO,Logistic Regression,0.713808,0.689408,0.775007,0.729706
1,YES,Logistic Regression,0.799024,0.794106,0.805712,0.799867
2,NO,Random Forest,0.721294,0.700097,0.771252,0.733954
3,YES,Random Forest,0.815733,0.820975,0.806114,0.813477


In [19]:
print(df.to_latex())

\begin{tabular}{lllrrrr}
\toprule
{} & Extremities\_Measured\_Separately &            Algorithm &  Accuracy &  Precision &    Recall &        F1 \\
\midrule
0 &                              NO &  Logistic Regression &  0.713808 &   0.689408 &  0.775007 &  0.729706 \\
1 &                             YES &  Logistic Regression &  0.799024 &   0.794106 &  0.805712 &  0.799867 \\
2 &                              NO &        Random Forest &  0.721294 &   0.700097 &  0.771252 &  0.733954 \\
3 &                             YES &        Random Forest &  0.815733 &   0.820975 &  0.806114 &  0.813477 \\
\bottomrule
\end{tabular}



### Now add kurtosis and skew to the same thing and measure again

In [23]:
#columns_to_consider = [c for c in master_df.columns]
#columns_to_consider = [c for c in master_df.columns if c.startswith("baseline")]
#columns_to_consider.append('is_encrypted')
columns_to_consider = [c for c in master_df.columns if c != 'extended.extension']


interesting_df = master_df[columns_to_consider]

X = interesting_df[[c for c in interesting_df.columns if c.startswith('baseline')]]
X = interesting_df[[c for c in interesting_df.columns if c != 'is_encrypted']]
X = X[[c for c in X.columns if c != 'is_encrypted']]

# get rid of tail and head metrics
X = X[[c for c in X.columns if "begin" not in c and "head" not in c]]
X = X[[c for c in X.columns if "tail" not in c and "end" not in c]]

# X = X[[c for c in X.columns if "kurtosis" not in c and "skew" not in c]]

y = interesting_df['is_encrypted']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# ---------------------------------------------------------------

rfc = RandomForestClassifier(n_jobs=-1)
estimators = [('std,', MinMaxScaler()), ('LogisticRegressor', LogisticRegression(n_jobs=10, max_iter=1000))]
pipeline = Pipeline(estimators)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Precision : {precision}")
print(f"Recall    : {recall}")
print(f"Accuracy  : {accuracy}")
print(f"F1-score  : {f1}")

df_json2 = {
    "Extremities_Measured_Separately": [],
    "Algorithm": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1": []
}

df_json2["Extremities_Measured_Separately"].append("NO")
df_json2["Algorithm"].append("Logistic Regression")
df_json2["Accuracy"].append(accuracy)
df_json2["Precision"].append(precision)
df_json2["Recall"].append(recall)
df_json2["F1"].append(f1)

Precision : 0.6901694106418516
Recall    : 0.7756771252346474
Accuracy  : 0.7146103462103996
F1-score  : 0.7304292929292929


In [24]:
#columns_to_consider = [c for c in master_df.columns]
#columns_to_consider = [c for c in master_df.columns if c.startswith("baseline")]
#columns_to_consider.append('is_encrypted')
columns_to_consider = [c for c in master_df.columns if c != 'extended.extension']


interesting_df = master_df[columns_to_consider]

X = interesting_df[[c for c in interesting_df.columns if c.startswith('baseline')]]
X = interesting_df[[c for c in interesting_df.columns if c != 'is_encrypted']]
X = X[[c for c in X.columns if c != 'is_encrypted']]

# get rid of tail and head metrics
# X = X[[c for c in X.columns if "begin" not in c and "head" not in c]]
# X = X[[c for c in X.columns if "tail" not in c and "end" not in c]]

# X = X[[c for c in X.columns if "kurtosis" not in c and "skew" not in c]]

y = interesting_df['is_encrypted']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# ---------------------------------------------------------------

rfc = RandomForestClassifier(n_jobs=-1)
estimators = [('std,', MinMaxScaler()), ('LogisticRegressor', LogisticRegression(n_jobs=10, max_iter=1000))]
#estimators = [('std,', MinMaxScaler()), ('RFC', rfc)]
pipeline = Pipeline(estimators)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Precision : {precision}")
print(f"Recall    : {recall}")
print(f"Accuracy  : {accuracy}")
print(f"F1-score  : {f1}")


df_json2["Extremities_Measured_Separately"].append("YES")
df_json2["Algorithm"].append("Logistic Regression")
df_json2["Accuracy"].append(accuracy)
df_json2["Precision"].append(precision)
df_json2["Recall"].append(recall)
df_json2["F1"].append(f1)

print([c for c in X.columns])

Precision : 0.801762114537445
Recall    : 0.8053097345132744
Accuracy  : 0.8037027135409704
F1-score  : 0.8035320088300221
['Unnamed: 0', 'baseline.head_shannon_entropy', 'baseline.tail_shannon_entropy', 'baseline.shannon_entropy', 'baseline.montecarlo_pi', 'baseline.chisquare_full', 'baseline.chisquare_begin', 'baseline.chisquare_end', 'baseline.autocorrelation_full', 'baseline.autocorrelation_begin', 'baseline.autocorrelation_end', 'baseline.filesize', 'advanced.kurtosis_full', 'advanced.kurtosis_end', 'advanced.kurtosis_begin', 'advanced.skew_full', 'advanced.skew_begin', 'advanced.skew_end', 'fourier.stat.1byte.autocorr', 'fourier.stat.1byte.mean', 'fourier.stat.1byte.std', 'fourier.value.1byte.0', 'fourier.value.1byte.1', 'fourier.value.1byte.2', 'fourier.value.1byte.3', 'fourier.value.1byte.4', 'fourier.value.1byte.5', 'fourier.value.1byte.6', 'fourier.value.1byte.7', 'fourier.value.1byte.8', 'fourier.value.1byte.9', 'fourier.value.1byte.10', 'fourier.value.1byte.11', 'fourier.va

In [25]:
#columns_to_consider = [c for c in master_df.columns]
#columns_to_consider = [c for c in master_df.columns if c.startswith("baseline")]
#columns_to_consider.append('is_encrypted')
columns_to_consider = [c for c in master_df.columns if c != 'extended.extension']


interesting_df = master_df[columns_to_consider]

X = interesting_df[[c for c in interesting_df.columns if c.startswith('baseline')]]
X = interesting_df[[c for c in interesting_df.columns if c != 'is_encrypted']]
X = X[[c for c in X.columns if c != 'is_encrypted']]

# get rid of tail and head metrics
X = X[[c for c in X.columns if "begin" not in c and "head" not in c]]
X = X[[c for c in X.columns if "tail" not in c and "end" not in c]]

# X = X[[c for c in X.columns if "kurtosis" not in c and "skew" not in c]]

y = interesting_df['is_encrypted']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# ---------------------------------------------------------------

rfc = RandomForestClassifier(n_jobs=-1)
#estimators = [('std,', MinMaxScaler()), ('LogisticRegressor', LogisticRegression(n_jobs=10, max_iter=1000))]
estimators = [('std,', MinMaxScaler()), ('RFC', rfc)]
pipeline = Pipeline(estimators)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Precision : {precision}")
print(f"Recall    : {recall}")
print(f"Accuracy  : {accuracy}")
print(f"F1-score  : {f1}")

df_json2["Extremities_Measured_Separately"].append("NO")
df_json2["Algorithm"].append("Random Forest")
df_json2["Accuracy"].append(accuracy)
df_json2["Precision"].append(precision)
df_json2["Recall"].append(recall)
df_json2["F1"].append(f1)

Precision : 0.7005360623781677
Recall    : 0.7709841780638241
Accuracy  : 0.721561288597781
F1-score  : 0.7340737903740585


In [26]:
#columns_to_consider = [c for c in master_df.columns]
#columns_to_consider = [c for c in master_df.columns if c.startswith("baseline")]
#columns_to_consider.append('is_encrypted')
columns_to_consider = [c for c in master_df.columns if c != 'extended.extension']


interesting_df = master_df[columns_to_consider]

X = interesting_df[[c for c in interesting_df.columns if c.startswith('baseline')]]
X = interesting_df[[c for c in interesting_df.columns if c != 'is_encrypted']]
X = X[[c for c in X.columns if c != 'is_encrypted']]

# get rid of tail and head metrics
# X = X[[c for c in X.columns if "begin" not in c and "head" not in c]]
# X = X[[c for c in X.columns if "tail" not in c and "end" not in c]]

# X = X[[c for c in X.columns if "kurtosis" not in c and "skew" not in c]]

y = interesting_df['is_encrypted']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# ---------------------------------------------------------------

rfc = RandomForestClassifier(n_jobs=-1)
#estimators = [('std,', MinMaxScaler()), ('LogisticRegressor', LogisticRegression(n_jobs=10, max_iter=1000))]
estimators = [('std,', MinMaxScaler()), ('RFC', rfc)]
pipeline = Pipeline(estimators)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Precision : {precision}")
print(f"Recall    : {recall}")
print(f"Accuracy  : {accuracy}")
print(f"F1-score  : {f1}")

df_json2["Extremities_Measured_Separately"].append("YES")
df_json2["Algorithm"].append("Random Forest")
df_json2["Accuracy"].append(accuracy)
df_json2["Precision"].append(precision)
df_json2["Recall"].append(recall)
df_json2["F1"].append(f1)

Precision : 0.8182314410480349
Recall    : 0.8039688924644677
Accuracy  : 0.8132602593236198
F1-score  : 0.8110374678750168


In [27]:
df2 = pd.DataFrame(df_json2)
df2

Unnamed: 0,Extremities_Measured_Separately,Algorithm,Accuracy,Precision,Recall,F1
0,NO,Logistic Regression,0.71461,0.690169,0.775677,0.730429
1,YES,Logistic Regression,0.803703,0.801762,0.80531,0.803532
2,NO,Random Forest,0.721561,0.700536,0.770984,0.734074
3,YES,Random Forest,0.81326,0.818231,0.803969,0.811037


In [28]:
print(df2.to_latex())

\begin{tabular}{lllrrrr}
\toprule
{} & Extremities\_Measured\_Separately &            Algorithm &  Accuracy &  Precision &    Recall &        F1 \\
\midrule
0 &                              NO &  Logistic Regression &  0.714610 &   0.690169 &  0.775677 &  0.730429 \\
1 &                             YES &  Logistic Regression &  0.803703 &   0.801762 &  0.805310 &  0.803532 \\
2 &                              NO &        Random Forest &  0.721561 &   0.700536 &  0.770984 &  0.734074 \\
3 &                             YES &        Random Forest &  0.813260 &   0.818231 &  0.803969 &  0.811037 \\
\bottomrule
\end{tabular}



In [29]:
df

Unnamed: 0,Extremities_Measured_Separately,Algorithm,Accuracy,Precision,Recall,F1
0,NO,Logistic Regression,0.713808,0.689408,0.775007,0.729706
1,YES,Logistic Regression,0.799024,0.794106,0.805712,0.799867
2,NO,Random Forest,0.721294,0.700097,0.771252,0.733954
3,YES,Random Forest,0.815733,0.820975,0.806114,0.813477


In [30]:
print(df.to_latex())

\begin{tabular}{lllrrrr}
\toprule
{} & Extremities\_Measured\_Separately &            Algorithm &  Accuracy &  Precision &    Recall &        F1 \\
\midrule
0 &                              NO &  Logistic Regression &  0.713808 &   0.689408 &  0.775007 &  0.729706 \\
1 &                             YES &  Logistic Regression &  0.799024 &   0.794106 &  0.805712 &  0.799867 \\
2 &                              NO &        Random Forest &  0.721294 &   0.700097 &  0.771252 &  0.733954 \\
3 &                             YES &        Random Forest &  0.815733 &   0.820975 &  0.806114 &  0.813477 \\
\bottomrule
\end{tabular}



In [32]:
df2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Accuracy,4.0,0.763284,0.052412,0.71461,0.719824,0.762632,0.806092,0.81326
Precision,4.0,0.752675,0.066665,0.690169,0.697944,0.751149,0.805879,0.818231
Recall,4.0,0.788985,0.018186,0.770984,0.774504,0.789823,0.804304,0.80531
F1,4.0,0.769768,0.043454,0.730429,0.733163,0.768803,0.805408,0.811037


In [33]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Accuracy,4.0,0.762465,0.052398,0.713808,0.719423,0.760159,0.803201,0.815733
Precision,4.0,0.751147,0.066179,0.689408,0.697425,0.747102,0.800823,0.820975
Recall,4.0,0.789521,0.01899,0.771252,0.774068,0.790359,0.805813,0.806114
F1,4.0,0.769251,0.0436,0.729706,0.732892,0.766911,0.803269,0.813477


In [34]:
df3 = pd.DataFrame({"With Skew and Kurtosis (averages)": df2.mean(), "Without Skew and Kurtosis (averages)": df.mean()}).T

  df3 = pd.DataFrame({"With Skew and Kurtosis (averages)": df2.mean(), "Without Skew and Kurtosis (averages)": df.mean()}).T


In [35]:
df3

Unnamed: 0,Accuracy,Precision,Recall,F1
With Skew and Kurtosis (averages),0.763284,0.752675,0.788985,0.769768
Without Skew and Kurtosis (averages),0.762465,0.751147,0.789521,0.769251


In [36]:
print(df3.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  Accuracy &  Precision &    Recall &        F1 \\
\midrule
With Skew and Kurtosis (averages)    &  0.763284 &   0.752675 &  0.788985 &  0.769768 \\
Without Skew and Kurtosis (averages) &  0.762465 &   0.751147 &  0.789521 &  0.769251 \\
\bottomrule
\end{tabular}



In [38]:
df.to_csv("extremity-compared-without-kurtosis.csv")
df2.to_csv("extremity-compared-with-kurtosis.csv")
df3.to_csv("with-and-without-kurtosis.csv")

In [40]:
df["Kurtosis and Skew Used"] = 0
df2["Kurtosis and Skew Used"] = 1

In [43]:
df_final = pd.concat([df, df2])

In [59]:
df_final["Extremities Measured Separately"] = df_final["Extremities_Measured_Separately"]
df_final = df_final.drop("Extremities_Measured_Separately", axis=1)

KeyError: 'Extremities_Measured_Separately'

In [74]:
df_final = df_final.reset_index()

ValueError: cannot insert level_0, already exists

In [68]:
df_final.to_latex()

'\\begin{tabular}{llrrrrrl}\n\\toprule\n{} &            Algorithm &  Accuracy &  Precision &    Recall &        F1 &  Kurtosis and Skew Used & Extremities Measured Separately \\\\\n\\midrule\n0 &  Logistic Regression &  0.713808 &   0.689408 &  0.775007 &  0.729706 &                       0 &                              NO \\\\\n1 &  Logistic Regression &  0.799024 &   0.794106 &  0.805712 &  0.799867 &                       0 &                             YES \\\\\n2 &        Random Forest &  0.721294 &   0.700097 &  0.771252 &  0.733954 &                       0 &                              NO \\\\\n3 &        Random Forest &  0.815733 &   0.820975 &  0.806114 &  0.813477 &                       0 &                             YES \\\\\n0 &  Logistic Regression &  0.714610 &   0.690169 &  0.775677 &  0.730429 &                       1 &                              NO \\\\\n1 &  Logistic Regression &  0.803703 &   0.801762 &  0.805310 &  0.803532 &                       1 &       

In [76]:
df_final = df_final.drop("index", axis=1)

In [78]:
df_final = df_final.drop("level_0", axis=1)

In [79]:
df_final

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1,Kurtosis and Skew Used,Extremities Measured Separately
0,Logistic Regression,0.713808,0.689408,0.775007,0.729706,0,NO
1,Logistic Regression,0.799024,0.794106,0.805712,0.799867,0,YES
2,Random Forest,0.721294,0.700097,0.771252,0.733954,0,NO
3,Random Forest,0.815733,0.820975,0.806114,0.813477,0,YES
4,Logistic Regression,0.71461,0.690169,0.775677,0.730429,1,NO
5,Logistic Regression,0.803703,0.801762,0.80531,0.803532,1,YES
6,Random Forest,0.721561,0.700536,0.770984,0.734074,1,NO
7,Random Forest,0.81326,0.818231,0.803969,0.811037,1,YES


In [81]:
print(df_final.to_latex())

\begin{tabular}{llrrrrrl}
\toprule
{} &            Algorithm &  Accuracy &  Precision &    Recall &        F1 &  Kurtosis and Skew Used & Extremities Measured Separately \\
\midrule
0 &  Logistic Regression &  0.713808 &   0.689408 &  0.775007 &  0.729706 &                       0 &                              NO \\
1 &  Logistic Regression &  0.799024 &   0.794106 &  0.805712 &  0.799867 &                       0 &                             YES \\
2 &        Random Forest &  0.721294 &   0.700097 &  0.771252 &  0.733954 &                       0 &                              NO \\
3 &        Random Forest &  0.815733 &   0.820975 &  0.806114 &  0.813477 &                       0 &                             YES \\
4 &  Logistic Regression &  0.714610 &   0.690169 &  0.775677 &  0.730429 &                       1 &                              NO \\
5 &  Logistic Regression &  0.803703 &   0.801762 &  0.805310 &  0.803532 &                       1 &                             YES

In [82]:
df_final.to_csv("extremity-kurtosis-skew-measured.csv")