In [35]:
#imports
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import unicodedata
import re
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import env
import acquire, prepare
import requests as req
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

### Acquire 

In [36]:
#acquire github data on repos referencing 470 cyber security repositories
# scrape = acquire.scrape_github_data()

In [37]:
# len(scrape)

In [38]:
# df = pd.DataFrame(scrape)
# df

In [39]:
# data = df.to_csv('repo_readmes.csv')
df = pd.read_csv('repo_readmes.csv', usecols=['repo','language','readme_contents'])

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             470 non-null    object
 1   language         305 non-null    object
 2   readme_contents  395 non-null    object
dtypes: object(3)
memory usage: 11.1+ KB


### Prepare

In [41]:
df = df[df.language.notnull()]
df = df[df.readme_contents.notnull()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 254 entries, 5 to 469
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             254 non-null    object
 1   language         254 non-null    object
 2   readme_contents  254 non-null    object
dtypes: object(3)
memory usage: 7.9+ KB


In [42]:
# Dropped languages with only 1 observation
# Dropped 165 nulls in language
df.value_counts('language')

language
Python              80
Jupyter Notebook    36
HTML                26
Java                19
Shell               17
JavaScript          14
CSS                 11
C                    6
C++                  6
PHP                  5
C#                   5
TeX                  4
PowerShell           3
Dart                 2
R                    2
Ruby                 2
Dockerfile           2
Pug                  2
Batchfile            1
Go                   1
Verilog              1
HCL                  1
Haxe                 1
TypeScript           1
Kotlin               1
Objective-C          1
Ren'Py               1
SCSS                 1
Scala                1
Assembly             1
dtype: int64

In [43]:
#limit dataframe to the top 6 languages
top_6_languages = df.language.value_counts().index[0:6]

In [44]:
df = df[df.language.isin(top_6_languages)]
df

Unnamed: 0,repo,language,readme_contents
5,DerekBabb/CyberSecurity,Java,# Cyber Security\n### A curriculum for a high ...
7,PacktPublishing/Machine-Learning-for-Cybersecu...,Jupyter Notebook,# Machine Learning for Cybersecurity Cookbook ...
8,llSourcell/Build-a-Cybersecurity-Startup,JavaScript,# Overview\n\nThis is the code for [this](http...
15,scusec/Data-Mining-for-Cybersecurity,HTML,# Data-Mining-for-Cybersecurity\n\n本项目主要是课程《Da...
16,guidesmiths/cybersecurity-handbook,JavaScript,# Cybersecurity handbook\n\n![Cover image](pub...
...,...,...,...
461,jonathan6661/P1sty,Python,"# P1sty\n\n<p align=""center"">\n<img src=""https..."
464,Patrowl/PatrowlHears,Python,![](https://github.com/Patrowl/PatrowlDocs/blo...
466,tropicoo/zoneh,Python,# zoneh\nZone-H cybercrime archive monitoring ...
467,diogo-fernan/domfind,Python,# *domfind*\n\n*domfind* is a Python 3.6.x uti...


In [45]:
# # prepare the data by applying the clean function
# df.readme_contents = df.readme_contents.apply(prepare.clean)
# df

In [46]:
def clean(text):
    'A simple function to cleanup text data'
    wnl = nltk.stem.WordNetLemmatizer()
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words]

In [47]:
#clean to hold the normalized and tokenized original with the stopwords removed.
df['clean'] = df['readme_contents'].apply(lambda x: prepare.remove_stopwords(prepare.tokenize(prepare.basic_clean(x))))
#stemmed to hold the stemmed version of the cleaned data.
df['stemmed'] = df['clean'].apply(lambda x: prepare.stem(x))
#lemmatized to hold the lemmatized version of the cleaned data.
df['lemmatized'] = df['clean'].apply(lambda x: prepare.lemmatize(x))
df

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
5,DerekBabb/CyberSecurity,Java,# Cyber Security\n### A curriculum for a high ...,cyber security curriculum high school cyber se...,cyber secur curriculum high school cyber secur...,cyber security curriculum high school cyber se...
7,PacktPublishing/Machine-Learning-for-Cybersecu...,Jupyter Notebook,# Machine Learning for Cybersecurity Cookbook ...,machine learning cybersecurity cookbook hrefht...,machin learn cybersecur cookbook hrefhttpswwwp...,machine learning cybersecurity cookbook hrefht...
8,llSourcell/Build-a-Cybersecurity-Startup,JavaScript,# Overview\n\nThis is the code for [this](http...,overview code thishttpsyoutubebxw8vqxxvqc vide...,overview code thishttpsyoutubebxw8vqxxvqc vide...,overview code thishttpsyoutubebxw8vqxxvqc vide...
15,scusec/Data-Mining-for-Cybersecurity,HTML,# Data-Mining-for-Cybersecurity\n\n本项目主要是课程《Da...,dataminingforcybersecurity data mining cyberse...,dataminingforcybersecur data mine cybersecur 2...,dataminingforcybersecurity data mining cyberse...
16,guidesmiths/cybersecurity-handbook,JavaScript,# Cybersecurity handbook\n\n![Cover image](pub...,cybersecurity handbook cover imagepubliccoverj...,cybersecur handbook cover imagepubliccoverjpg ...,cybersecurity handbook cover imagepubliccoverj...
...,...,...,...,...,...,...
461,jonathan6661/P1sty,Python,"# P1sty\n\n<p align=""center"">\n<img src=""https...",p1sty p aligncenter img srchttpsuserimagesgith...,p1sti p aligncent img srchttpsuserimagesgithub...,p1sty p aligncenter img srchttpsuserimagesgith...
464,Patrowl/PatrowlHears,Python,![](https://github.com/Patrowl/PatrowlDocs/blo...,httpsgithubcompatrowlpatrowldocsblobmasterimag...,httpsgithubcompatrowlpatrowldocsblobmasterimag...,httpsgithubcompatrowlpatrowldocsblobmasterimag...
466,tropicoo/zoneh,Python,# zoneh\nZone-H cybercrime archive monitoring ...,zoneh zoneh cybercrime archive monitoring tele...,zoneh zoneh cybercrim archiv monitor telegram ...,zoneh zoneh cybercrime archive monitoring tele...
467,diogo-fernan/domfind,Python,# *domfind*\n\n*domfind* is a Python 3.6.x uti...,domfind domfind python 36x utility tests exist...,domfind domfind python 36x util test exist dom...,domfind domfind python 36x utility test existe...


In [48]:
#use clean function to create six sets of data: Python, Jupyter Notebook, HTML, Java, Shell, JavaScript and CSS and rejoin lemmatized words to one string
python_words = clean(' '.join(df.lemmatized[df.language == 'Python']))
jupyter_words = clean(' '.join(df.lemmatized[df.language == 'Jupyter Notebook']))
html_words = clean(' '.join(df.lemmatized[df.language == 'HTML']))
java_words = clean(' '.join(df.lemmatized[df.language == 'Java']))
shell_words = clean(' '.join(df.lemmatized[df.language == 'Shell']))
jscript_words = clean(' '.join(df.lemmatized[df.language == 'JavaScript']))


### Explore

In [49]:
#finding most common bigrams in python words
top_10_python_bigrams = (pd.Series(nltk.ngrams(python_words, 2))
                      .value_counts()
                      .head(10))

top_10_python_bigrams.head()

(pip, install)       20
(cyber, security)    13
(domain, name)       12
(git, clone)         12
(aptget, install)    11
dtype: int64

In [50]:
#finding most common bigrams in jupyter words
top_10_jupyter_bigrams = (pd.Series(nltk.ngrams(jupyter_words, 2))
                      .value_counts()
                      .head(10))

top_10_jupyter_bigrams.head()

(machine, learning)    37
(window, mac)          16
(x, linux)             16
(jupyter, notebook)    16
(mac, o)               16
dtype: int64

In [51]:
#finding most common bigrams in html words
top_10_html_bigrams = (pd.Series(nltk.ngrams(html_words, 2))
                      .value_counts()
                      .head(10))

top_10_html_bigrams.head()

(9, 9)                      9
(social, engineering)       8
(capture, flag)             7
(assignment, submission)    5
(attack, social)            5
dtype: int64

In [52]:
#finding most common bigrams in java words
top_10_java_bigrams = (pd.Series(nltk.ngrams(java_words, 2))
                      .value_counts()
                      .head(10))

top_10_java_bigrams.head()

(cyber, security)       28
(step, reproduce)       21
(reproduce, 1)          16
(username, password)    12
(1, open)               11
dtype: int64

In [53]:
#finding most common bigrams in shell words
top_10_shell_bigrams = (pd.Series(nltk.ngrams(shell_words, 2))
                      .value_counts()
                      .head(10))

top_10_shell_bigrams.head()

(elk, server)           8
(container, running)    8
(running, dvwa)         7
(host, container)       6
(br, download)          6
dtype: int64

In [54]:
#finding most common bigrams in jscript words
top_10_jscript_bigrams = (pd.Series(nltk.ngrams(jscript_words, 2))
                      .value_counts()
                      .head(10))

top_10_jscript_bigrams.head()

(sulla, blockchain)    17
(che, ci)              16
(styleborder, 0h3a)    15
(td, styleborder)      15
(npm, install)         11
dtype: int64

### Split Data

In [55]:
# We'll use this split function later to create in-sample and out-of-sample datasets for modeling
def split(df, stratify_by=None):
    """
    3 way split for train, validate, and test datasets
    To stratify, send in a column name
    """
    train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[stratify_by])
    
    train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train[stratify_by])
    
    return train, validate, test


train, validate, test = split(df, 'language')


In [56]:
train

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
384,chygozprivaterepo/CybersecurityAE1,Java,# CybersecurityAE1\n,cybersecurityae1,cybersecurityae1,cybersecurityae1
405,YriyAntonov228/CyberSecurity,Java,# CyberSecurity - это электронная проходная\n#...,cybersecurity java 18 oracle jdk open jdk http...,cybersecur java 18 oracl jdk open jdk httpsdis...,cybersecurity java 18 oracle jdk open jdk http...
286,levitannin/AI-in-Cybersecurity,Jupyter Notebook,# AI-in-Cybersecurity\nPrograms worked on in u...,aiincybersecurity programs worked university c...,aiincybersecur program work univers cours arti...,aiincybersecurity program worked university co...
147,GambuzX/Cybersecurity_Practice,Python,# CTF_Practice,ctfpractice,ctfpractic,ctfpractice
249,Philser/cybersecurity,Python,# cybersecurity\nRepository for everything cyb...,cybersecurity repository everything cybersecurity,cybersecur repositori everyth cybersecur,cybersecurity repository everything cybersecurity
...,...,...,...,...,...,...
456,marianomatelo/MLCybersecurity,Python,# MLCybersecurity\nMachine Learning Cybersecur...,mlcybersecurity machine learning cybersecurity...,mlcybersecur machin learn cybersecur project f...,mlcybersecurity machine learning cybersecurity...
392,nikomn/cybersecuritybase2021-project1,Python,# cybersecuritybase2021-project1\n\nCourse pro...,cybersecuritybase2021project1 course project c...,cybersecuritybase2021project1 cours project cy...,cybersecuritybase2021project1 course project c...
211,KausikN/CyberSecurity,Python,# CyberSecurity\n Cyber Security Codes for Cip...,cybersecurity cyber security codes ciphers enc...,cybersecur cyber secur code cipher encrypt,cybersecurity cyber security code cipher encry...
425,LucienCastle/CyberSecurity,Jupyter Notebook,# CyberSecurity\nData encryption algortihms an...,cybersecurity data encryption algortihms ciphers,cybersecur data encrypt algortihm cipher,cybersecurity data encryption algortihms cipher


In [57]:
# Setup our X variables
X_train = train.lemmatized
X_validate = validate.lemmatized
X_test = test.lemmatized



In [24]:
# Setup our y variables
y_train = train.language
y_validate = validate.language
y_test = test.language

In [25]:
#Create dataframes
X_train = pd.DataFrame(X_train)
X_validate = pd.DataFrame(X_validate)
X_test = pd.DataFrame(X_test)


In [26]:
baseline = pd.DataFrame(y_train)
baseline['baseline'] = 'Python'

In [27]:
baseline

Unnamed: 0,language,baseline
384,Java,Python
405,Java,Python
286,Jupyter Notebook,Python
147,Python,Python
249,Python,Python
...,...,...
456,Python,Python
392,Python,Python
211,Python,Python
425,Jupyter Notebook,Python


In [28]:
# Baselinebaseline accuracy
baseline_accuracy = (baseline.language == baseline.baseline).mean()
baseline_accuracy

0.4205607476635514

In [30]:
# Create the tfidf vectorizer object
# Step 1, this creates a tf-idf values for each word, for each document
# Step 2, encodes these values so that we can use models that only work on numbers, like classifications model
tfidf = TfidfVectorizer()

# Fit on the training data
tfidf.fit(X_train)

# Use the object
X_train_vectorized = tfidf.transform(X_train)
X_validate_vectorized = tfidf.transform(X_validate)
X_test_vectorized = tfidf.transform(X_test)

In [33]:
# Sparse vectors/matrices have tons of zeros
X_train_vectorized.todense()

matrix([[1.]])

In [34]:
# Now that we have a vectorized dataset, we can use our classification tools!
lm = LogisticRegression()

# Fit the classification model on our vectorized train data
lm.fit(X_train_vectorized, y_train)



ValueError: Found input variables with inconsistent numbers of samples: [1, 107]

In [None]:
#use the model
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

In [None]:
# Use the trained model to predict y given those vectorized inputs of X
train['predicted'] = lm.predict(X_train_vectorized)
validate["predicted"] = lm.predict(X_validate_vectorized)
test['predicted'] = lm.predict(X_test_vectorized)

In [None]:
# Train Accuracy
train_accuracy = (train.actual == train.predicted).mean()
train_accuracy

In [None]:
# Out of sample accuracy
validate_accuracy = (validate.actual == validate.predicted).mean()
validate_accuracy

In [None]:
#begin building a dataframe to record accuracy
metric_df = pd.DataFrame(data=[{
    'model': 'logistic regression', 
    'baseline_accuracy': round(baseline_accuracy,2),
    'train_accuracy': round(train_accuracy, 2),
    'validate_accuracy': round(validate_accuracy, 2)}])
metric_df

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix


#Create the model
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)

In [None]:
#Fit the model
rf.fit(X_train_vectorized, y_train)

In [None]:
#Check feature importance
print(rf.feature_importances_)

In [None]:
#Make predictions
y_train_pred = rf.predict(X_train_vectorized)
y_validate_pred = rf.predict(X_validate_vectorized)

In [None]:
#Estimate the probability
y_train_pred_proba = rf.predict_proba(X_train_vectorized)
y_validate_pred_proba = rf.predict_proba(X_validate_vectorized)

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train_vectorized, y_train)))

In [None]:
rf_train_accuracy = round(rf.score(X_train_vectorized, y_train),2)

In [None]:
print(confusion_matrix(y_train, y_train_pred))

In [None]:
print(classification_report(y_train, y_train_pred))

In [None]:
### Less precision than Logistic Regression -- will not run on test!
#Check accuracy on validate
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate_vectorized, y_validate)))

In [None]:
rf_validate_accuracy = round(rf.score(X_validate_vectorized, y_validate),2)

In [None]:
#append dataframe to compare accuracy
metric_df = metric_df.append({
    'model': 'random_forest', 
    'train_accuracy': rf_train_accuracy,
    'validate_accuracy': rf_validate_accuracy}, ignore_index=True)
metric_df


### K-Nearest Neighbor

In [None]:
#imports 
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Create the object
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [None]:
#Fit the model
knn.fit(X_train_vectorized, y_train)

In [None]:
#Make predictions
y_train_pred = knn.predict(X_train_vectorized)

In [None]:
#Estimate probability
y_train_pred_proba = knn.predict_proba(X_train_vectorized)

In [None]:
#Evaluate on accuracy
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train_vectorized, y_train)))

In [None]:
knn_train_accuracy = knn.score(X_train_vectorized, y_train)
knn_train_accuracy

In [None]:
#Make predictions
y_validate_pred = knn.predict(X_validate_vectorized)

In [None]:
#Estimate probability
y_validate_pred_proba = knn.predict_proba(X_validate_vectorized)

In [None]:
#Evaluate on accuracy
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate_vectorized, y_validate)))

In [None]:
knn_validate_accuracy = round(knn.score(X_validate_vectorized, y_validate),2)
knn_validate_accuracy

In [None]:
#append dataframe to compare accuracy
metric_df = metric_df.append({
    'model': 'K-Nearest Neighbor', 
    'train_accuracy': round(knn_train_accuracy,2),
    'validate_accuracy': knn_validate_accuracy}, ignore_index=True)
metric_df


In [None]:
df.readme_contents