In [80]:
from ucimlrepo import fetch_ucirepo


In [81]:
# fetch dataset
phiusiil_phishing_url_website = fetch_ucirepo(id=967)

In [82]:
# data (as pandas dataframes)
X = phiusiil_phishing_url_website.data.features
y = phiusiil_phishing_url_website.data.targets

X.dtypes
# Drop Non Numeric
X.drop(['URL', 'TLD', 'Domain', 'Title'], inplace=True, axis=1)
X.to_csv("phishing_url.csv", index=False, header=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(['URL', 'TLD', 'Domain', 'Title'], inplace=True, axis=1)


In [83]:
X.dtypes

URLLength                       int64
DomainLength                    int64
IsDomainIP                      int64
URLSimilarityIndex            float64
CharContinuationRate          float64
TLDLegitimateProb             float64
URLCharProb                   float64
TLDLength                       int64
NoOfSubDomain                   int64
HasObfuscation                  int64
NoOfObfuscatedChar              int64
ObfuscationRatio              float64
NoOfLettersInURL                int64
LetterRatioInURL              float64
NoOfDegitsInURL                 int64
DegitRatioInURL               float64
NoOfEqualsInURL                 int64
NoOfQMarkInURL                  int64
NoOfAmpersandInURL              int64
NoOfOtherSpecialCharsInURL      int64
SpacialCharRatioInURL         float64
IsHTTPS                         int64
LineOfCode                      int64
LargestLineLength               int64
HasTitle                        int64
DomainTitleMatchScore         float64
URLTitleMatc

# Decision Tree Classifier:  Un-Scaled

In [84]:
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Create the decision tree classifier instance
model_not_scaled = tree.DecisionTreeClassifier()

# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Fit the model - without scaling
model_not_scaled = model_not_scaled.fit(X_train, y_train)

print(f"Training Data Score: {model_not_scaled.score(X_train, y_train)}")
print(f"Testing Data Score: {model_not_scaled.score(X_test, y_test)}")

# Making predictions using the testing data - without scaling
predictions = model_not_scaled.predict(X_test)

# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)

print(f"Accuracy Score : {acc_score}")

feature_importances = model_not_scaled.tree_.compute_feature_importances(normalize=False)
#print("feat importance: model_not_scaled = " + str(feature_importances))
importances_sorted = sorted(zip(feature_importances, X.columns), reverse=True)
importances_sorted[:10]

print(classification_report(y_test, predictions))

Training Data Score: 1.0
Testing Data Score: 1.0
Accuracy Score : 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25321
           1       1.00      1.00      1.00     33628

    accuracy                           1.00     58949
   macro avg       1.00      1.00      1.00     58949
weighted avg       1.00      1.00      1.00     58949



# Decision Tree Classifier: Scaled

In [85]:
from sklearn.preprocessing import StandardScaler

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create the decision tree classifier instance
model_scaled = tree.DecisionTreeClassifier()

# Fit the model - With Scaling
model_scaled = model_scaled.fit(X_train_scaled, y_train)

print(f"Training Data Score: {model_not_scaled.score(X_train, y_train)}")
print(f"Testing Data Score: {model_not_scaled.score(X_test, y_test)}")

# Making predictions using the testing data - without scaling
predictions = model_scaled.predict(X_test_scaled)

# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)

print(f"Accuracy Score : {acc_score}")

feat_importance = model_scaled.tree_.compute_feature_importances(normalize=False)
#print("feat importance: model_scaled: = " + str(feat_importance))
importances_sorted = sorted(zip(feature_importances, X.columns), reverse=True)
importances_sorted[:10]



Training Data Score: 1.0
Testing Data Score: 1.0
Accuracy Score : 1.0


[(0.48310272154564027, 'URLSimilarityIndex'),
 (0.006217841735266813, 'LineOfCode'),
 (0.00019222158898808743, 'IsHTTPS'),
 (1.1309163009894951e-05, 'NoOfSubDomain'),
 (0.0, 'URLTitleMatchScore'),
 (0.0, 'URLLength'),
 (0.0, 'URLCharProb'),
 (0.0, 'TLDLength'),
 (0.0, 'TLDLegitimateProb'),
 (0.0, 'SpacialCharRatioInURL')]