In [121]:
import numpy as np 
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot  as plt
from sklearn.model_selection import GridSearchCV
import joblib


In [3]:
df= pd.read_csv(r"C:\Users\akash\Downloads\Phishing_Legitimate.csv")

## Data exploration:

In [4]:
df

Unnamed: 0,id,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,...,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
0,1,3,1,5,72,0,0,0,0,0,...,0,0,1,1,0,1,1,-1,1,1
1,2,3,1,3,144,0,0,0,0,2,...,0,0,0,1,-1,1,1,1,1,1
2,3,3,1,2,58,0,0,0,0,0,...,0,0,0,1,0,-1,1,-1,0,1
3,4,3,1,6,79,1,0,0,0,0,...,0,0,0,1,-1,1,1,1,-1,1
4,5,3,0,4,46,0,0,0,0,0,...,1,0,0,1,1,-1,0,-1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,3,1,1,50,0,0,0,0,0,...,0,0,0,1,1,-1,1,0,1,0
9996,9997,2,1,4,59,1,0,0,0,0,...,1,0,0,1,0,0,1,0,1,0
9997,9998,2,1,4,57,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0
9998,9999,3,1,1,49,0,0,0,0,0,...,1,0,0,1,1,0,1,0,1,0


## Data Exploration:

In [5]:
df['NumDots'].describe()

count    10000.000000
mean         2.445100
std          1.346836
min          1.000000
25%          2.000000
50%          2.000000
75%          3.000000
max         21.000000
Name: NumDots, dtype: float64

In [6]:
x,y= df.drop('CLASS_LABEL',axis=1), df['CLASS_LABEL']


In [9]:
x

Unnamed: 0,id,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,...,SubmitInfoToEmail,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT
0,1,3,1,5,72,0,0,0,0,0,...,0,0,0,1,1,0,1,1,-1,1
1,2,3,1,3,144,0,0,0,0,2,...,0,0,0,0,1,-1,1,1,1,1
2,3,3,1,2,58,0,0,0,0,0,...,0,0,0,0,1,0,-1,1,-1,0
3,4,3,1,6,79,1,0,0,0,0,...,1,0,0,0,1,-1,1,1,1,-1
4,5,3,0,4,46,0,0,0,0,0,...,0,1,0,0,1,1,-1,0,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,3,1,1,50,0,0,0,0,0,...,0,0,0,0,1,1,-1,1,0,1
9996,9997,2,1,4,59,1,0,0,0,0,...,0,1,0,0,1,0,0,1,0,1
9997,9998,2,1,4,57,0,0,0,0,0,...,0,0,0,0,1,0,1,1,0,1
9998,9999,3,1,1,49,0,0,0,0,0,...,0,1,0,0,1,1,0,1,0,1


In [7]:
y

0       1
1       1
2       1
3       1
4       1
       ..
9995    0
9996    0
9997    0
9998    0
9999    0
Name: CLASS_LABEL, Length: 10000, dtype: int64

In [8]:
x_train,x_test, y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)


In [15]:
#Testing function:
def model_predict_test(model,x_test,y_test):
    y_pred=model.predict(x_test)
    print("Model predicted sucessfully!")
    print('Accuracy Score:',accuracy_score(y_test,y_pred))
    print('precision Score:',precision_score(y_test,y_pred))
    print('Recall Score:',recall_score(y_test,y_pred))
    print('F1 Score:',f1_score(y_test,y_pred))
    cm=confusion_matrix(y_test,y_pred)
    print("Confusion Matrix: \n")    
    print(cm)
    
    
    
    

## Model Training:

In [17]:
baseline_model= RandomForestClassifier(n_jobs=-1, n_estimators =4)
baseline_model.fit(x_train,y_train)

In [18]:
model_predict_test(baseline_model,x_test,y_test)

Model predicted sucessfully!
Accuracy Score: 0.9995
precision Score: 1.0
Recall Score: 0.9990118577075099
F1 Score: 0.9995056846267919
Confusion Matrix: 

[[ 988    0]
 [   1 1011]]


## Hyperparameter Tuning

In [22]:
## Grid search:
model=RandomForestClassifier(n_jobs=-1)

param_grid={
    'n_estimators':[2,3],
    'max_depth': [15,20,35],
    'min_samples_split': [5,10,15],
    'min_samples_leaf': [15,25,30],
    'max_features' :['sqrt',10,20,30],
    'criterion': ['gini','entropy']
}
grid_search= GridSearchCV(model,param_grid,cv=3)
grid_search.fit(x_train,y_train)
best_estimator=grid_search.best_estimator_



In [23]:
model_predict_test(best_estimator,x_test,y_test)

Model predicted sucessfully!
Accuracy Score: 1.0
precision Score: 1.0
Recall Score: 1.0
F1 Score: 1.0
Confusion Matrix: 

[[ 988    0]
 [   0 1012]]


In [24]:
print(grid_search.best_params_)

{'criterion': 'gini', 'max_depth': 15, 'max_features': 20, 'min_samples_leaf': 25, 'min_samples_split': 5, 'n_estimators': 3}


In [26]:
results = grid_search.cv_results_
results_df = pd.DataFrame(results)
for i in range(len(results_df)):
    print(f"Parameters: {results_df.iloc[i]['params']}")
    print(f"Mean Test Score: {results_df.iloc[i]['mean_test_score']:.4f}")
    print(f"Std Test Score: {results_df.iloc[i]['std_test_score']:.4f}")
    print(f"Mean Fit Time: {results_df.iloc[i]['mean_fit_time']:.4f} seconds")
    print(f"Mean Score Time: {results_df.iloc[i]['mean_score_time']:.4f} seconds")
    print("-" * 60)

Parameters: {'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 15, 'min_samples_split': 5, 'n_estimators': 2}
Mean Test Score: 0.9881
Std Test Score: 0.0078
Mean Fit Time: 0.0218 seconds
Mean Score Time: 0.0088 seconds
------------------------------------------------------------
Parameters: {'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 15, 'min_samples_split': 5, 'n_estimators': 3}
Mean Test Score: 0.9944
Std Test Score: 0.0048
Mean Fit Time: 0.0292 seconds
Mean Score Time: 0.0135 seconds
------------------------------------------------------------
Parameters: {'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 15, 'min_samples_split': 10, 'n_estimators': 2}
Mean Test Score: 0.9963
Std Test Score: 0.0043
Mean Fit Time: 0.0289 seconds
Mean Score Time: 0.0069 seconds
------------------------------------------------------------
Parameters: {'criterion': 'gini', 'max_depth': 15, 'max_feature

In [27]:
#2:
model2=RandomForestClassifier(n_jobs=-1,criterion= 'gini',max_depth= 15, max_features = 'sqrt',  min_samples_leaf= 15,  min_samples_split= 5, n_estimators= 2)
model2.fit(x_train,y_train)



In [28]:
model_predict_test(model2,x_test,y_test)

Model predicted sucessfully!
Accuracy Score: 0.999
precision Score: 1.0
Recall Score: 0.9980237154150198
F1 Score: 0.9990108803165183
Confusion Matrix: 

[[ 988    0]
 [   2 1010]]


## Final Model


In [114]:
model3=RandomForestClassifier(n_jobs=-1,criterion= 'gini',max_depth= 12, max_features = 'sqrt',  min_samples_leaf= 14,  min_samples_split= 5, n_estimators= 2)
model3.fit(x_train,y_train)


In [117]:
model_predict_test(model3,x_test,y_test)

Model predicted sucessfully!
Accuracy Score: 0.9865
precision Score: 0.9871414441147379
Recall Score: 0.9861660079051383
F1 Score: 0.986653484923381
Confusion Matrix: 

[[975  13]
 [ 14 998]]


In [122]:
joblib.dump(model3,'phising_detection_system.joblib')
#the model will be attached with submission

['phising_detection_system.joblib']