In [1]:
import pandas as pd
import warnings
warnings.simplefilter('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,f1_score,precision_score,accuracy_score
from sklearn.metrics import roc_curve,auc
import matplotlib.pyplot as plt

# --- Load and clean the raw training data ---------------------------------
# Columns with more than this many missing values are discarded outright.
MAX_NULLS_PER_COLUMN=30000

micro=pd.read_csv('micro_train.csv')

# Drop the two columns the original analysis excluded up front.
micro=micro.drop(columns=['MachineIdentifier',"OsBuildLab"])

# Drop every column whose null count exceeds the threshold, in one step
# instead of mutating the frame column by column.
null_counts=micro.isnull().sum()
micro=micro.drop(columns=null_counts[null_counts > MAX_NULLS_PER_COLUMN].index)

# Separate the target from the features.
y=micro['HasDetections']
micro=micro.drop(columns=['HasDetections'])

# Partition the remaining features by dtype: int64/float64 columns are
# treated as continuous, everything else as categorical.
cat_list=[]
con_list=[]

for col in micro.columns:
    if micro[col].dtype in ('int64','float64'):
        con_list.append(col)
    else:
        cat_list.append(col)

# Impute missing values: most frequent value for categorical columns, median
# for continuous ones.
# The result is assigned back to the frame. The previous
# `micro[col].fillna(..., inplace=True)` form is chained assignment through an
# inplace method, which pandas' Copy-on-Write mode silently turns into a no-op.
# NOTE(review): these statistics are computed on the full data set, before the
# train/test split further down, so test rows influence the imputed values.
for col in cat_list:
    micro[col]=micro[col].fillna(micro[col].mode()[0])

for col in con_list:
    micro[col]=micro[col].fillna(micro[col].median())


# Disabled experiment: the bare string literal below is never executed. It
# holds code that would compute a variance inflation factor for each column in
# `con_list` and drop those scoring above 20.
"""from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = [[con_list[i],variance_inflation_factor(micro[con_list].values, i)] for i in range(micro[con_list].shape[1])]

drop_col=[]
for i in vif:
    if i[1] > 20:
        drop_col.append(i[0])
        
print(drop_col)

micro.drop(drop_col,axis=1,inplace=True)"""
    
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train,X_test,y_train,y_test=train_test_split(micro,y,test_size=0.25,random_state=30)

# Take explicit copies so the column assignments below operate on frames that
# are unambiguously independent of `micro` (this notebook silences the
# SettingWithCopyWarning that would otherwise flag them).
X_train=X_train.copy()
X_test=X_test.copy()

# Integer-encode each categorical column. The encoder is fitted on the
# training rows only; test categories that never occur in training are
# replaced by the sentinel '<unknown>', which is appended as the last class.
le=LabelEncoder()   # keeps `le` defined even when cat_list is empty
label_encoders={}   # column name -> fitted encoder, kept for later reuse
for col in cat_list:
    le=LabelEncoder()
    le.fit(X_train[col])
    # Vectorised membership test: replaces a per-row Python lambda that did a
    # linear scan of `le.classes_` for every test value.
    X_test[col]=X_test[col].where(X_test[col].isin(le.classes_),'<unknown>')
    le.classes_=np.append(le.classes_,'<unknown>')
    X_train[col]=le.transform(X_train[col])
    X_test[col]=le.transform(X_test[col])
    label_encoders[col]=le
    

# Min-max scale every feature, one column at a time. The scaler is re-fitted
# on each training column and the same fit is then applied to the matching
# test column.
MScaling=MinMaxScaler()
for col in micro.columns:
    train_values=X_train[col].to_numpy().reshape(-1,1)
    test_values=X_test[col].to_numpy().reshape(-1,1)
    X_train[col]=MScaling.fit_transform(train_values)
    X_test[col]=MScaling.transform(test_values)
# Disabled alternative: the bare string literal below is never executed. It
# holds the same per-column loop written with StandardScaler.
"""
SScaling=StandardScaler()
for col in micro.columns:
    X_train[col]=SScaling.fit_transform(np.array(X_train[col]).reshape(-1,1))
    X_test[col]=SScaling.transform(np.array(X_test[col]).reshape(-1,1))
"""

# Baseline model: a random forest with default hyper-parameters.
# NOTE: the variable keeps its historical name `log_reg`, but it holds a
# RandomForestClassifier, not a logistic regression.
# The seed makes the reported metrics reproducible between runs.
log_reg=RandomForestClassifier(random_state=30)
log_reg.fit(X_train,y_train)

# Predict once and reuse the result for every metric below.
y_pred_baseline=log_reg.predict(X_test)

print(f'confussion matrix =\n {confusion_matrix(y_test,y_pred_baseline)}')

print(f'f1_score = {f1_score(y_test,y_pred_baseline)}')

# Column 1 of predict_proba is the probability of the second entry in
# log_reg.classes_ (presumably HasDetections == 1 - confirm against the data).
y_pre_pro=log_reg.predict_proba(X_test)[:,1]

# Area under the ROC curve on the held-out rows; shown as the cell output.
fpr,tpr,threshold=roc_curve(y_test,y_pre_pro)
auc_score=auc(fpr,tpr)
auc_score

confussion matrix =
 [[4255 1886]
 [3222 3137]]
f1_score = 0.5512212264979792


0.6360045918862388

In [2]:
#decision model

from sklearn.tree import DecisionTreeClassifier

# Shallow, heavily regularised tree. The seed makes the fit reproducible.
Decision_class=DecisionTreeClassifier(max_depth=8,min_samples_split= 8,min_samples_leaf = 8,max_leaf_nodes= 5,criterion='gini',random_state=30)
Decision_class.fit(X_train,y_train)

# Predict once and reuse the result for every test-set metric below.
y_pred_decision=Decision_class.predict(X_test)

print(f'confussion matrix =\n {confusion_matrix(y_test,y_pred_decision)}')

# This score is mean accuracy on the TRAINING rows, not on the held-out rows.
print(f'Decision_class.score= {Decision_class.score(X_train,y_train)}')

print(f'f1_score = {f1_score(y_test,y_pred_decision)}')

# Class probabilities as a DataFrame, one column per class. The former
# `[::1]` slice was a no-op and has been removed.
y_decesion_pro=pd.DataFrame(Decision_class.predict_proba(X_test))

# Column 1 holds the probability of the second class in
# Decision_class.classes_ and is used as the ROC score.
fpr,tpr,threshold=roc_curve(y_test,y_decesion_pro[1])
auc_score_decesion=auc(fpr,tpr)
auc_score_decesion

confussion matrix =
 [[3171 2970]
 [1934 4425]]
Decision_class.score= 0.6107896210565615
f1_score = 0.643449178420823


0.6399308932849439

In [3]:
from sklearn.model_selection import RandomizedSearchCV

#parameters tuning for DecisionTreeClassifier

# Candidate values sampled by the randomized search.
Param_dict_RSC = {
'max_depth':[4,5,6,7,8,9,10],
'min_samples_split':[2,3,4,5,6,7,8,9,10],
'min_samples_leaf':[2,3,4,5,6,7,8,9,10],
'max_leaf_nodes':[4,5,6,7,8,9,10],
"criterion": ["gini", "entropy"]}

# 15 random candidates, each scored with 5-fold cross-validation on the
# training rows. `random_state` fixes which candidates are drawn, so
# `best_params_` is reproducible between runs.
rcv_DecisionTree = RandomizedSearchCV(Decision_class,Param_dict_RSC,cv=5,n_iter=15,random_state=30)
rcv_DecisionTree.fit(X_train,y_train)
rcv_DecisionTree.best_params_

{'min_samples_split': 9,
 'min_samples_leaf': 6,
 'max_leaf_nodes': 10,
 'max_depth': 9,
 'criterion': 'gini'}

In [4]:
# Random forest with hand-picked hyper-parameters. RandomForestClassifier is
# already imported in the notebook's first cell, so it is not re-imported
# here. The seed makes the reported metrics reproducible between runs.
RandomForest=RandomForestClassifier(n_estimators=70,min_samples_split=8,min_samples_leaf=5,max_leaf_nodes=8,max_features=3,max_depth=5,random_state=30)

RandomForest.fit(X_train,y_train)

# Predict once and reuse the result for every test-set metric below.
y_pred_forest=RandomForest.predict(X_test)

print(f'confussion matrix =\n {confusion_matrix(y_test,y_pred_forest)}')
# This score is mean accuracy on the TRAINING rows, not on the held-out rows.
print(f'RandomForest.score= {RandomForest.score(X_train,y_train)}')
print(f'f1_score = {f1_score(y_test,y_pred_forest)}')
print(f'precision_score = {precision_score(y_test,y_pred_forest)}')


confussion matrix =
 [[3583 2558]
 [2279 4080]]
RandomForest.score= 0.6206832182191525
f1_score = 0.6278371931984303
precision_score = 0.6146429647484182


In [5]:
#parameters tuning for RandomForestClassifier

# Candidate values sampled by the randomized search.
# 'sqrt' replaces the former 'auto': for RandomForestClassifier the two meant
# the same thing, but 'auto' was deprecated and later removed from
# scikit-learn, so candidates using it fail to fit on current releases.
parameters_Random={'n_estimators':[70,80,100],
                   'max_depth':[5,6,7,8], 
                   "min_samples_split":[2,3,4,5,6,7,8,9,10], 
                   'min_samples_leaf':[2,3,4,5,6,7,8,9,10], 
                   'max_features':['sqrt','log2',3,4,5], 
                   'max_leaf_nodes':[4,5,6,7,8,9,10]}

# 15 random candidates, each scored with 5-fold cross-validation on the
# training rows. `random_state` fixes which candidates are drawn, so
# `best_params_` is reproducible between runs.
rcv_RandomForestClassifier=RandomizedSearchCV(RandomForest,parameters_Random,cv=5,n_iter=15,random_state=30)
rcv_RandomForestClassifier.fit(X_train,y_train)
rcv_RandomForestClassifier.best_params_

{'n_estimators': 80,
 'min_samples_split': 3,
 'min_samples_leaf': 5,
 'max_leaf_nodes': 10,
 'max_features': 3,
 'max_depth': 8}

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with hand-picked hyper-parameters. The seed makes the
# reported metrics reproducible between runs.
GBC=GradientBoostingClassifier(learning_rate=0.075,min_samples_split=4,
                                max_depth=5,min_samples_leaf=8,max_features=5,max_leaf_nodes=9,
                                random_state=30)

GBC.fit(X_train,y_train)

# Predict once and reuse the result for every test-set metric below.
y_pred_gbc=GBC.predict(X_test)

print(f'confussion matrix =\n {confusion_matrix(y_test,y_pred_gbc)}')
# Mean accuracy on the TRAINING rows. The label used to read
# 'RandomForest.score=' (copied from the random-forest cell) although the
# value comes from GBC.
print(f'GBC.score= {GBC.score(X_train,y_train)}')
# Accuracy on the held-out rows.
print(accuracy_score(y_test,y_pred_gbc))
print(f'f1_score = {f1_score(y_test,y_pred_gbc)}')
print(f'precision_score = {precision_score(y_test,y_pred_gbc)}')



confussion matrix =
 [[3859 2282]
 [2440 3919]]
RandomForest.score= 0.6408170884556922
0.62224
f1_score = 0.6240445859872611
precision_score = 0.6319948395420093


In [7]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search for the gradient-boosting
# model.
# 'sqrt' replaces the former 'auto': for GradientBoostingClassifier the two
# meant the same thing, but 'auto' was deprecated and later removed from
# scikit-learn, so candidates using it fail to fit on current releases.
parameters_Random_gra={'learning_rate':[0.05,0.075,0.1,0.125],
                   'max_depth':[5,6,7,8], 
                   "min_samples_split":[2,3,4,5,6,7,8,9,10], 
                   'min_samples_leaf':[2,3,4,5,6,7,8,9,10], 
                   'max_features':['sqrt','log2',3,4,5], 
                   'max_leaf_nodes':[4,5,6,7,8,9,10]}

# 15 random candidates, each scored with 5-fold cross-validation on the
# training rows. `random_state` fixes which candidates are drawn, so
# `best_params_` is reproducible between runs.
rcv_GradientBoostingClassifier=RandomizedSearchCV(GBC,parameters_Random_gra,cv=5,n_iter=15,random_state=30)
rcv_GradientBoostingClassifier.fit(X_train,y_train)
rcv_GradientBoostingClassifier.best_params_

{'min_samples_split': 3,
 'min_samples_leaf': 7,
 'max_leaf_nodes': 8,
 'max_features': 'log2',
 'max_depth': 6,
 'learning_rate': 0.1}