In [1]:
#imports
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import unicodedata
import re
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import env
import acquire, prepare
import requests as req
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

### Acquire 

In [2]:
#acquire github data on repos referencing 470 cyber security repositories
# scrape = acquire.scrape_github_data()

In [3]:
# len(scrape)

In [4]:
# df = pd.DataFrame(scrape)
# df

In [5]:
# data = df.to_csv('repo_readmes.csv')
df = pd.read_csv('repo_readmes.csv', usecols=['repo','language','readme_contents'])

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             470 non-null    object
 1   language         305 non-null    object
 2   readme_contents  395 non-null    object
dtypes: object(3)
memory usage: 11.1+ KB


### Prepare

In [7]:
df = df[df.language.notnull()]
df = df[df.readme_contents.notnull()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 254 entries, 5 to 469
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             254 non-null    object
 1   language         254 non-null    object
 2   readme_contents  254 non-null    object
dtypes: object(3)
memory usage: 7.9+ KB


In [8]:
# Dropped languages with only 1 observation
# Dropped 165 nulls in language
df.value_counts('language')

language
Python              80
Jupyter Notebook    36
HTML                26
Java                19
Shell               17
JavaScript          14
CSS                 11
C                    6
C++                  6
PHP                  5
C#                   5
TeX                  4
PowerShell           3
Dart                 2
R                    2
Ruby                 2
Dockerfile           2
Pug                  2
Batchfile            1
Go                   1
Verilog              1
HCL                  1
Haxe                 1
TypeScript           1
Kotlin               1
Objective-C          1
Ren'Py               1
SCSS                 1
Scala                1
Assembly             1
dtype: int64

In [9]:
#limit dataframe to the top 6 languages
top_6_languages = df.language.value_counts().index[0:6]

In [10]:
df = df[df.language.isin(top_6_languages)]
df

Unnamed: 0,repo,language,readme_contents
5,DerekBabb/CyberSecurity,Java,# Cyber Security\n### A curriculum for a high ...
7,PacktPublishing/Machine-Learning-for-Cybersecu...,Jupyter Notebook,# Machine Learning for Cybersecurity Cookbook ...
8,llSourcell/Build-a-Cybersecurity-Startup,JavaScript,# Overview\n\nThis is the code for [this](http...
15,scusec/Data-Mining-for-Cybersecurity,HTML,# Data-Mining-for-Cybersecurity\n\n本项目主要是课程《Da...
16,guidesmiths/cybersecurity-handbook,JavaScript,# Cybersecurity handbook\n\n![Cover image](pub...
...,...,...,...
461,jonathan6661/P1sty,Python,"# P1sty\n\n<p align=""center"">\n<img src=""https..."
464,Patrowl/PatrowlHears,Python,![](https://github.com/Patrowl/PatrowlDocs/blo...
466,tropicoo/zoneh,Python,# zoneh\nZone-H cybercrime archive monitoring ...
467,diogo-fernan/domfind,Python,# *domfind*\n\n*domfind* is a Python 3.6.x uti...


In [11]:
# # prepare the data by applying the clean function
# df.readme_contents = df.readme_contents.apply(prepare.clean)
# df

In [12]:
def clean(text):
    'A simple function to cleanup text data'
    wnl = nltk.stem.WordNetLemmatizer()
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words]

In [13]:
#clean to hold the normalized and tokenized original with the stopwords removed.
df['clean'] = df['readme_contents'].apply(lambda x: prepare.remove_stopwords(prepare.tokenize(prepare.basic_clean(x))))
#stemmed to hold the stemmed version of the cleaned data.
df['stemmed'] = df['clean'].apply(lambda x: prepare.stem(x))
#lemmatized to hold the lemmatized version of the cleaned data.
df['lemmatized'] = df['clean'].apply(lambda x: prepare.lemmatize(x))
df

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
5,DerekBabb/CyberSecurity,Java,# Cyber Security\n### A curriculum for a high ...,cyber security curriculum high school cyber se...,cyber secur curriculum high school cyber secur...,cyber security curriculum high school cyber se...
7,PacktPublishing/Machine-Learning-for-Cybersecu...,Jupyter Notebook,# Machine Learning for Cybersecurity Cookbook ...,machine learning cybersecurity cookbook hrefht...,machin learn cybersecur cookbook hrefhttpswwwp...,machine learning cybersecurity cookbook hrefht...
8,llSourcell/Build-a-Cybersecurity-Startup,JavaScript,# Overview\n\nThis is the code for [this](http...,overview code thishttpsyoutubebxw8vqxxvqc vide...,overview code thishttpsyoutubebxw8vqxxvqc vide...,overview code thishttpsyoutubebxw8vqxxvqc vide...
15,scusec/Data-Mining-for-Cybersecurity,HTML,# Data-Mining-for-Cybersecurity\n\n本项目主要是课程《Da...,dataminingforcybersecurity data mining cyberse...,dataminingforcybersecur data mine cybersecur 2...,dataminingforcybersecurity data mining cyberse...
16,guidesmiths/cybersecurity-handbook,JavaScript,# Cybersecurity handbook\n\n![Cover image](pub...,cybersecurity handbook cover imagepubliccoverj...,cybersecur handbook cover imagepubliccoverjpg ...,cybersecurity handbook cover imagepubliccoverj...
...,...,...,...,...,...,...
461,jonathan6661/P1sty,Python,"# P1sty\n\n<p align=""center"">\n<img src=""https...",p1sty p aligncenter img srchttpsuserimagesgith...,p1sti p aligncent img srchttpsuserimagesgithub...,p1sty p aligncenter img srchttpsuserimagesgith...
464,Patrowl/PatrowlHears,Python,![](https://github.com/Patrowl/PatrowlDocs/blo...,httpsgithubcompatrowlpatrowldocsblobmasterimag...,httpsgithubcompatrowlpatrowldocsblobmasterimag...,httpsgithubcompatrowlpatrowldocsblobmasterimag...
466,tropicoo/zoneh,Python,# zoneh\nZone-H cybercrime archive monitoring ...,zoneh zoneh cybercrime archive monitoring tele...,zoneh zoneh cybercrim archiv monitor telegram ...,zoneh zoneh cybercrime archive monitoring tele...
467,diogo-fernan/domfind,Python,# *domfind*\n\n*domfind* is a Python 3.6.x uti...,domfind domfind python 36x utility tests exist...,domfind domfind python 36x util test exist dom...,domfind domfind python 36x utility test existe...


In [14]:
#use clean function to create six sets of data: Python, Jupyter Notebook, HTML, Java, Shell, JavaScript and CSS and rejoin lemmatized words to one string
python_words = clean(' '.join(df.lemmatized[df.language == 'Python']))
jupyter_words = clean(' '.join(df.lemmatized[df.language == 'Jupyter Notebook']))
html_words = clean(' '.join(df.lemmatized[df.language == 'HTML']))
java_words = clean(' '.join(df.lemmatized[df.language == 'Java']))
shell_words = clean(' '.join(df.lemmatized[df.language == 'Shell']))
jscript_words = clean(' '.join(df.lemmatized[df.language == 'JavaScript']))


### Explore

In [15]:
#finding most common bigrams in python words
top_10_python_bigrams = (pd.Series(nltk.ngrams(python_words, 2))
                      .value_counts()
                      .head(10))

top_10_python_bigrams.head()

(pip, install)       20
(cyber, security)    13
(domain, name)       12
(git, clone)         12
(aptget, install)    11
dtype: int64

In [16]:
#finding most common bigrams in jupyter words
top_10_jupyter_bigrams = (pd.Series(nltk.ngrams(jupyter_words, 2))
                      .value_counts()
                      .head(10))

top_10_jupyter_bigrams.head()

(machine, learning)    37
(mac, o)               16
(window, mac)          16
(x, linux)             16
(o, x)                 16
dtype: int64

In [17]:
#finding most common bigrams in html words
top_10_html_bigrams = (pd.Series(nltk.ngrams(html_words, 2))
                      .value_counts()
                      .head(10))

top_10_html_bigrams.head()

(9, 9)                   9
(social, engineering)    8
(capture, flag)          7
(attack, social)         5
(ip, address)            5
dtype: int64

In [18]:
#finding most common bigrams in java words
top_10_java_bigrams = (pd.Series(nltk.ngrams(java_words, 2))
                      .value_counts()
                      .head(10))

top_10_java_bigrams.head()

(cyber, security)       28
(step, reproduce)       21
(reproduce, 1)          16
(username, password)    12
(1, open)               11
dtype: int64

In [19]:
#finding most common bigrams in shell words
top_10_shell_bigrams = (pd.Series(nltk.ngrams(shell_words, 2))
                      .value_counts()
                      .head(10))

top_10_shell_bigrams.head()

(elk, server)           8
(container, running)    8
(running, dvwa)         7
(br, download)          6
(host, container)       6
dtype: int64

In [20]:
#finding most common bigrams in jscript words
top_10_jscript_bigrams = (pd.Series(nltk.ngrams(jscript_words, 2))
                      .value_counts()
                      .head(10))

top_10_jscript_bigrams.head()

(sulla, blockchain)    17
(che, ci)              16
(td, styleborder)      15
(styleborder, 0h3a)    15
(npm, install)         11
dtype: int64

### Establish baseline for modeling

In [21]:
baseline = pd.DataFrame(df.language)
baseline['baseline'] = 'Python'

In [22]:
baseline

Unnamed: 0,language,baseline
5,Java,Python
7,Jupyter Notebook,Python
8,JavaScript,Python
15,HTML,Python
16,JavaScript,Python
...,...,...
461,Python,Python
464,Python,Python
466,Python,Python
467,Python,Python


### Baseline accuracy is 42%

In [23]:
# Baselinebaseline accuracy
baseline_accuracy = round((baseline.language == baseline.baseline).mean(),2)
baseline_accuracy

0.42

### Split Data

In [24]:
# Split the data for modeling
train, validate, test = prepare.split(df, 'language')
train.shape, validate.shape, test.shape

((107, 6), (46, 6), (39, 6))

In [25]:
# setup X-train, validate and test variables
X_train = train.lemmatized
X_validate = validate.lemmatized
X_test = test.lemmatized

# setup y-train, validate and test variables
y_train = train.language
y_validate = validate.language
y_test = test.language

In [26]:
X_train.shape, X_validate.shape, X_test.shape

((107,), (46,), (39,))

In [27]:
y_train.shape, y_validate.shape, y_test.shape

((107,), (46,), (39,))

### Vectorizing data before classification modeling
- converting text to numerical representations

In [28]:
# Create the tfidf vectorizer object
# Step 1: create tf-idf value for each word, for each lemmatized readme
# Step 2: encode these values for use on models that only work on numbers, like classifications model
tfidf = TfidfVectorizer()

# Fit on the training data
tfidf.fit(X_train)

# Use the object
X_train_vectorized = tfidf.transform(X_train)
X_validate_vectorized = tfidf.transform(X_validate)
X_test_vectorized = tfidf.transform(X_test)

### Linear Regression

In [29]:
# Create logistic regression object -- to use on vectorized data
lm = LogisticRegression()

# Fit the classification model on our vectorized train data
lm.fit(X_train_vectorized, y_train)



LogisticRegression()

In [30]:
#create dataframes of actual values
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

In [31]:
# Use the trained model to predict y given those vectorized inputs of X
train['predicted'] = lm.predict(X_train_vectorized)
validate["predicted"] = lm.predict(X_validate_vectorized)
test['predicted'] = lm.predict(X_test_vectorized)

In [32]:
# Train Accuracy
train_accuracy = round((train.actual == train.predicted).mean(),2)
train_accuracy

0.68

In [33]:
# Validate Accuracy
validate_accuracy = round((validate.actual == validate.predicted).mean(),2)
validate_accuracy

0.52

In [34]:
#begin building a dataframe to record accuracy
metric_df = pd.DataFrame(data=[{
    'model': 'logistic regression', 
    'baseline_accuracy': round(baseline_accuracy,2),
    'train_accuracy': round(train_accuracy, 2),
    'validate_accuracy': round(validate_accuracy, 2)}])
metric_df

Unnamed: 0,model,baseline_accuracy,train_accuracy,validate_accuracy
0,logistic regression,0.42,0.68,0.52


### Random Forest Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix


#Create the model
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)

In [36]:
#Fit the model
rf.fit(X_train_vectorized, y_train)

RandomForestClassifier(max_depth=3, min_samples_leaf=3, random_state=123)

In [37]:
#Check feature importance
print(rf.feature_importances_)

[0. 0. 0. ... 0. 0. 0.]


In [38]:
#Make predictions
y_train_pred = rf.predict(X_train_vectorized)
y_validate_pred = rf.predict(X_validate_vectorized)

In [39]:
#Estimate the probability
y_train_pred_proba = rf.predict_proba(X_train_vectorized)
y_validate_pred_proba = rf.predict_proba(X_validate_vectorized)

In [40]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train_vectorized, y_train)))

Accuracy of random forest classifier on training set: 0.42


In [41]:
rf_train_accuracy = round(rf.score(X_train_vectorized, y_train),2)

In [42]:
print(confusion_matrix(y_train, y_train_pred))

[[ 0  0  0  0 15  0]
 [ 0  0  0  0 10  0]
 [ 0  0  0  0  8  0]
 [ 0  0  0  0 20  0]
 [ 0  0  0  0 45  0]
 [ 0  0  0  0  9  0]]


In [43]:
print(classification_report(y_train, y_train_pred))

                  precision    recall  f1-score   support

            HTML       0.00      0.00      0.00        15
            Java       0.00      0.00      0.00        10
      JavaScript       0.00      0.00      0.00         8
Jupyter Notebook       0.00      0.00      0.00        20
          Python       0.42      1.00      0.59        45
           Shell       0.00      0.00      0.00         9

        accuracy                           0.42       107
       macro avg       0.07      0.17      0.10       107
    weighted avg       0.18      0.42      0.25       107



  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
### Less precision than Logistic Regression -- will not run on test!
#Check accuracy on validate
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate_vectorized, y_validate)))

Accuracy of random forest classifier on validate set: 0.41


In [45]:
rf_validate_accuracy = round(rf.score(X_validate_vectorized, y_validate),2)

In [69]:
#append dataframe to compare accuracy
metric_df = metric_df.append({
    'model': 'random_forest', 
    'baseline_accuracy': baseline_accuracy,
    'train_accuracy': rf_train_accuracy,
    'validate_accuracy': rf_validate_accuracy}, ignore_index=True)
metric_df


Unnamed: 0,model,baseline_accuracy,train_accuracy,validate_accuracy
0,logistic regression,0.42,0.68,0.52
1,random_forest,0.42,0.42,0.41
2,K-Nearest Neighbor,0.42,0.63,0.43
3,random_forest,0.42,0.42,0.41


### K-Nearest Neighbor

In [47]:
#imports 
from sklearn.neighbors import KNeighborsClassifier

In [48]:
# Create the object
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [49]:
#Fit the model
knn.fit(X_train_vectorized, y_train)

KNeighborsClassifier()

In [50]:
#Make predictions
y_train_pred = knn.predict(X_train_vectorized)

In [51]:
#Estimate probability
y_train_pred_proba = knn.predict_proba(X_train_vectorized)

In [52]:
#Evaluate on accuracy
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train_vectorized, y_train)))

Accuracy of KNN classifier on training set: 0.63


In [53]:
knn_train_accuracy = knn.score(X_train_vectorized, y_train)
knn_train_accuracy

0.6261682242990654

In [54]:
#Make predictions
y_validate_pred = knn.predict(X_validate_vectorized)

In [55]:
#Estimate probability
y_validate_pred_proba = knn.predict_proba(X_validate_vectorized)

In [56]:
#Evaluate on accuracy
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate_vectorized, y_validate)))

Accuracy of KNN classifier on validate set: 0.43


In [57]:
knn_validate_accuracy = round(knn.score(X_validate_vectorized, y_validate),2)
knn_validate_accuracy

0.43

In [58]:
#append dataframe to compare accuracy
metric_df = metric_df.append({
    'model': 'K-Nearest Neighbor', 
    'baseline_accuracy': baseline_accuracy,
    'train_accuracy': round(knn_train_accuracy,2),
    'validate_accuracy': knn_validate_accuracy}, ignore_index=True)
metric_df


Unnamed: 0,model,baseline_accuracy,train_accuracy,validate_accuracy
0,logistic regression,0.42,0.68,0.52
1,random_forest,0.42,0.42,0.41
2,K-Nearest Neighbor,0.42,0.63,0.43


### Model on test data

In [65]:
#Make predictions
y_test_pred = knn.predict(X_test_vectorized)

In [66]:
#Estimate probability
y_test_pred_proba = knn.predict_proba(X_test_vectorized)

In [67]:
#Evaluate on accuracy
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_test_vectorized, y_test)))

Accuracy of KNN classifier on test set: 0.49


In [68]:
knn_test_accuracy = knn.score(X_test_vectorized, y_test)
knn_test_accuracy

0.48717948717948717