In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.kernel_approximation import Nystroem
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
import os
data_path = ['..', '..', 'data']
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [2]:
# Load the dataset from a CSV located in the notebook's working directory.
# Later cells read the label from the 'Result' column and treat every column
# except the last one as a feature (presumably 'Result' is that last column).
data = pd.read_csv('phishing.csv')


In [4]:
# Binary target: 1 where 'Result' equals -1, 0 otherwise.
y = data['Result'].eq(-1).astype(int)
# Every column except the last one is treated as a feature.
fields = data.columns[:-1].tolist()
# Correlation of each feature with the target, sorted ascending.
correlations = data[fields].corrwith(y).sort_values()
correlations


SSLfinal_State                -0.714741
URL_of_Anchor                 -0.692935
Prefix_Suffix                 -0.348606
web_traffic                   -0.346103
having_Sub_Domain             -0.298323
Request_URL                   -0.253372
Links_in_tags                 -0.248229
SFH                           -0.221419
Google_Index                  -0.128950
age_of_domain                 -0.121496
Page_Rank                     -0.104645
having_IP_Address             -0.094160
Statistical_report            -0.079857
DNSRecord                     -0.075718
URL_Length                    -0.057430
having_At_Symbol              -0.052948
on_mouseover                  -0.041838
port                          -0.036419
Links_pointing_to_page        -0.032574
Submitting_to_email           -0.018249
RightClick                    -0.012653
popUpWidnow                   -0.000086
Favicon                        0.000280
Iframe                         0.003394
Redirect                       0.020113


In [None]:
# Global seaborn appearance settings; they affect the plots drawn in later cells.
sns.set_context('talk')   # scaling preset for plot elements
sns.set_palette('dark')   # default colour cycle
sns.set_style('white')    # axes style preset


In [None]:
# Pairwise plot grid over the columns of `data`, coloured by the 'Result' label.
# NOTE(review): the correlation listing above shows 25 feature columns, so this
# grid would contain hundreds of panels and is likely very slow to render —
# consider restricting it with `vars=[...]` to a handful of columns.
sns.pairplot(data, hue='Result')


In [None]:
# Bar chart of the per-feature correlations on a fixed [-1, 1] axis.
ax = correlations.plot.bar()
ax.set_ylim(-1, 1)
ax.set_ylabel('pearson correlation');


In [None]:
# Keep the two features whose correlation with the target is largest in
# absolute value (they end up ordered from weaker to stronger).
fields = correlations.abs().sort_values().tail(2).index
print(fields)

# Rescale those two columns with MinMaxScaler and wrap the resulting array
# back into a DataFrame whose column names carry a '_scaled' suffix.
scaler = MinMaxScaler()
X = pd.DataFrame(
    scaler.fit_transform(data[fields]),
    columns=['%s_scaled' % name for name in fields])
print(X.columns)


In [None]:
# Fit a linear SVM on the two scaled features selected above.
LSVC = LinearSVC()
LSVC.fit(X, y)

# Scatter a fixed (seeded) 300-row sample so the figure stays readable and
# reproducible; class 1 is drawn in red, class 0 in yellow.
X_color = X.sample(300, random_state=45)
y_color = y.loc[X_color.index]
y_color = y_color.map(lambda r: 'red' if r == 1 else 'yellow')
ax = plt.axes()
ax.scatter(X_color.iloc[:, 0], X_color.iloc[:, 1], color=y_color, alpha=1)
# -----------
# Dense grid over the unit square; the model is evaluated at every grid point
# and the predictions are drawn as filled contours (the decision regions).
x_axis, y_axis = np.arange(0, 1.005, .005), np.arange(0, 1.005, .005)
xx, yy = np.meshgrid(x_axis, y_axis)
xx_ravel = xx.ravel()
yy_ravel = yy.ravel()
X_grid = pd.DataFrame([xx_ravel, yy_ravel]).T
# Give the grid the same column names the model was fitted with; otherwise
# scikit-learn warns that X lacks the feature names seen during fit.
X_grid.columns = X.columns
y_grid_predictions = LSVC.predict(X_grid)
y_grid_predictions = y_grid_predictions.reshape(xx.shape)
ax.contourf(xx, yy, y_grid_predictions, cmap=plt.cm.autumn_r, alpha=.3)
# -----------
ax.set(
    xlabel=fields[0],
    ylabel=fields[1],
    xlim=[0, 1],
    ylim=[0, 1],
    title='decision boundary for LinearSVC');


In [None]:
def plot_decision_boundary(estimator, X, y, sample_size=300, random_state=None):
    """Fit ``estimator`` on two features and plot its decision regions.

    The estimator is fitted on all of ``X``/``y``, evaluated on a dense grid
    covering the unit square, and the predicted regions are drawn as filled
    contours with a random subset of the data points scattered on top
    (label 1 in red, anything else in yellow).

    Parameters
    ----------
    estimator : object with ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this function.
    X : pandas.DataFrame
        Exactly two feature columns; the grid spans [0, 1] on both axes, so
        the features are expected to be scaled to that range.
    y : pandas.Series
        Labels sharing ``X``'s index.
    sample_size : int, default 300
        Maximum number of points to scatter; clamped to ``len(X)`` so small
        inputs do not raise.
    random_state : int or None, default None
        Seed forwarded to ``X.sample``; pass an integer for a reproducible
        scatter.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    estimator.fit(X, y)

    # Subset of points to draw, coloured by label.
    n_shown = min(sample_size, len(X))
    X_color = X.sample(n_shown, random_state=random_state)
    y_color = y.loc[X_color.index]
    y_color = y_color.map(lambda r: 'red' if r == 1 else 'yellow')

    # linspace includes both end points, so the shaded regions reach 1.0
    # (a half-open arange(0, 1, .005) would stop at 0.995).
    x_axis = np.linspace(0, 1, 201)
    y_axis = np.linspace(0, 1, 201)
    xx, yy = np.meshgrid(x_axis, y_axis)
    X_grid = pd.DataFrame([xx.ravel(), yy.ravel()]).T
    # Reuse the training column names so scikit-learn does not warn about
    # missing feature names at predict time.
    X_grid.columns = X.columns
    y_grid_predictions = estimator.predict(X_grid).reshape(xx.shape)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.contourf(xx, yy, y_grid_predictions, cmap=plt.cm.autumn_r, alpha=.3)
    ax.scatter(X_color.iloc[:, 0], X_color.iloc[:, 1], color=y_color, alpha=1)
    # Label the axes from the data actually passed in rather than from a
    # notebook-level variable, so the function does not rely on hidden state.
    ax.set(
        xlabel=X.columns[0],
        ylabel=X.columns[1],
        title=str(estimator))


In [None]:
# One decision-boundary figure per RBF kernel coefficient.
gammas = [.5, 1, 2, 10]
for gamma in gammas:
    SVC_Gaussian = SVC(gamma=gamma, kernel='rbf')
    plot_decision_boundary(estimator=SVC_Gaussian, X=X, y=y)


In [None]:
# One decision-boundary figure per regularisation strength, gamma held at 2.
Cs = [.1, 1, 10]
for C in Cs:
    SVC_Gaussian = SVC(C=C, gamma=2, kernel='rbf')
    plot_decision_boundary(estimator=SVC_Gaussian, X=X, y=y)


In [None]:
# Rebuild the inputs for the timing comparison: boolean target and every
# column except the last as features (this replaces the two-column X above).
y = data['Result'].eq(-1)
X = data.loc[:, data.columns[:-1]]

# The same kernel setting is shared by the exact SVC and the Nystroem
# approximation; the SGD classifier is trained on the Nystroem features.
kwargs = dict(kernel='rbf')
svc = SVC(**kwargs)
nystroem = Nystroem(**kwargs)
sgd = SGDClassifier()


In [None]:
%%timeit
# Time a full kernel-SVC fit on the original-size data; the cell body is
# re-executed for every timing repetition.
svc.fit(X, y)


In [None]:
%%timeit
# Time the approximate alternative: Nystroem feature map followed by an SGD
# classifier. The transform sits inside the timed body, so its cost is part
# of the measurement.
X_transformed = nystroem.fit_transform(X)
sgd.fit(X_transformed, y)


In [None]:
# Build a larger dataset by stacking copies of X and y, to see how the two
# training approaches scale with the number of rows.
# A single factor drives both frames so they always have the same length;
# using different factors for X and y would make them inconsistent.
n_copies = 5  # increase (e.g. 7 or 9) for larger timing runs
X2 = pd.concat([X] * n_copies)
y2 = pd.concat([y] * n_copies)

print(X2.shape)
print(y2.shape)


In [None]:
%%timeit 
# Time the kernel-SVC fit again, this time on the replicated (larger) data.
svc.fit(X2, y2)


In [None]:
%%timeit
# Time the Nystroem + SGD pipeline on the replicated (larger) data; the
# feature transform is included in the timed body.
X2_transformed = nystroem.fit_transform(X2)
sgd.fit(X2_transformed, y2)
