In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn import svm, linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# read and peak data
df = pd.read_csv('./data.csv')
df[:5]
# excluded the last 6 parameters
X = df.iloc[:, 1:43]
y = df.iloc[:, 49]
n_rows = len(X)
print(X)

      NumDots  SubdomainLevel  PathLevel  UrlLength  NumDash  \
0           3               1          5         72        0   
1           3               1          3        144        0   
2           3               1          2         58        0   
3           3               1          6         79        1   
4           3               0          4         46        0   
...       ...             ...        ...        ...      ...   
9995        3               1          1         50        0   
9996        2               1          4         59        1   
9997        2               1          4         57        0   
9998        3               1          1         49        0   
9999        3               1          2         52        3   

      NumDashInHostname  AtSymbol  TildeSymbol  NumUnderscore  NumPercent  \
0                     0         0            0              0           0   
1                     0         0            0              2           0   


In [3]:
# split data and store indices
X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, test_size=.2, random_state=0, stratify=y)
X_valid, X_test, y_valid, y_test = train_test_split(X_tmp, y_tmp, test_size=.5, random_state=0, stratify=y_tmp)
indices_train, indices_tmp = train_test_split(np.arange(n_rows), test_size=.2, random_state=0, stratify=y)
indices_valid, indices_test = train_test_split(indices_tmp, test_size=.5, random_state=0, stratify=y[indices_tmp])

In [4]:
# start_1 = time.time()
# clf = svm.SVC(kernel="linear", C=2000)
# clf.fit(X_train,y_train)
# end_1 = time.time()

# print(f'SVM Training accuracy: {clf.score(X_valid, y_valid):.3}')



In [5]:
# print(f'SVM(linear, C=2000) performance: {(end_1-start_1):.3}')

In [6]:
# Logistic regression
clf = linear_model.LogisticRegression(max_iter=100000, C=3000)
start_1 = time.time()
clf.fit(X_train, y_train)
end_1 = time.time()
print(f'Logistic regression Training accuracy: {clf.score(X_valid, y_valid):.3}')
print(f'Logistic regression performance: {(end_1-start_1):.3}')





Logistic regression Training accuracy: 0.941
Logistic regression performance: 1.75


In [7]:
clf = linear_model.LinearRegression()
start_1 = time.time()
clf.fit(X_train, y_train)
end_1 = time.time()
print(f'Linear regression Training accuracy: {clf.score(X_valid, y_valid):.3}')
print(f'Linear regression performance: {(end_1-start_1):.3}')



Linear regression Training accuracy: 0.688
Linear regression performance: 0.00748


In [8]:
# ID3 Decision Tree
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3)
clf.fit(X_train, y_train)
validation_score = clf.score(X_valid, y_valid)
print(f'DecisionTree validation_score={validation_score:.3}')

DecisionTree validation_score=0.918


In [9]:
# ID3 Decision Tree grid search
parameters = {'criterion': ['entropy'], 'max_depth': [1, 3, 5, 7, 9]}
id3 = DecisionTreeClassifier()
clf = GridSearchCV(id3, parameters)
clf.fit(X_train, y_train)
print(f'clf.best_score_={clf.best_score_:.3}, ' +
      f'clf.best_params_={clf.best_params_}' + str(clf.best_estimator_))
start_1 = time.time()
clf.best_estimator_.fit(X_train, y_train)
end_1 = time.time()
print(f'ID3 performance: {(end_1-start_1):.3}')



clf.best_score_=0.966, clf.best_params_={'criterion': 'entropy', 'max_depth': 7}DecisionTreeClassifier(criterion='entropy', max_depth=7)
ID3 performance: 0.0218


In [10]:
# bagging
clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10, random_state=0)
clf.fit(X_train, y_train)
validation_score = clf.score(X_valid, y_valid)
print(f'Bagging validation_score={validation_score:.3}')


# bagging
clf = BaggingClassifier(base_estimator=RandomForestClassifier(), n_estimators=10, random_state=0)
clf.fit(X_train, y_train)
validation_score = clf.score(X_valid, y_valid)
print(f'Bagging validation_score={validation_score:.3}')


Bagging validation_score=0.977
Bagging validation_score=0.984


In [11]:
# bagging grid search (max: 0.964)
parameters = {'n_estimators': [1, 5, 10, 15]}
bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(), random_state=0)
clf = GridSearchCV(bagging, parameters)
clf.fit(X_train, y_train)
print(f'clf.best_score_={clf.best_score_:.3}, ' +
      f'clf.best_params_={clf.best_params_}')

clf.best_score_=0.973, clf.best_params_={'n_estimators': 15}


In [12]:
# random forest
clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(X_train, y_train)
validation_score = clf.score(X_valid, y_valid)
print(f'RandomForest validation_score={validation_score:.3}')

RandomForest validation_score=0.971


In [13]:
# random forest grid search (max: 0.974)
parameters = {'max_depth': [1, 5, 10, 15, 20, 30, 50, 100, 150]}
rf = RandomForestClassifier(random_state=0)
clf = GridSearchCV(rf, parameters)
clf.fit(X_train, y_train)
print(f'clf.best_score_={clf.best_score_:.3}, ' +
      f'clf.best_params_={clf.best_params_}')

start_1 = time.time()
clf.best_estimator_.fit(X_train, y_train)
end_1 = time.time()
print(f'Random forest tree performance: {(end_1-start_1):.3}')





clf.best_score_=0.981, clf.best_params_={'max_depth': 30}
Random forest tree performance: 0.383


In [14]:
# gradient boosting
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
clf.fit(X_train, y_train)
cvalidation_score = clf.score(X_valid, y_valid)
print(f'GradientBoosting validation_score={validation_score:.3}')

GradientBoosting validation_score=0.971


In [15]:
# gradient boosting grid search
parameters = {'n_estimators': [1, 50, 100], 'learning_rate': [0.5, 1.0], 'max_depth': [1, 3, 5, 10, 15]}
gb = GradientBoostingClassifier(random_state=0)
clf = GridSearchCV(gb, parameters)
clf.fit(X_train, y_train)
print(f'clf.best_score_={clf.best_score_:.3}, ' +
      f'clf.best_params_={clf.best_params_}')

start_1 = time.time()
best = clf.best_estimator_
best.fit(X_train, y_train)
print(best.score(X_valid, y_valid))
end_1 = time.time()
print(f'GradientBoosting tree performance: {(end_1-start_1):.3}')


clf.best_score_=0.984, clf.best_params_={'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 100}
0.986
GradientBoosting tree performance: 0.855


### More classifiers
Maybe stacking the top 3.

### Application
Drawback: Data may be difficult to collect.
Solution: Use a few predictors and test the result.