In [64]:
import numpy as n
import pandas as p
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.model_selection import GridSearchCV

In [65]:
dataset=p.read_csv('Dataset.txt',delimiter='\t')
testdataset=p.read_csv('Dataset_test.txt',delimiter='\t')



In [66]:
for coll in ['F15','F16']:
    dataset[coll]=p.to_datetime(dataset[coll],errors='coerce')
    testdataset[coll]=p.to_datetime(testdataset[coll],errors='coerce')
    #diff in days
    dataset[f'{coll}_diff']=(p.Timestamp.now()-dataset[coll]).dt.days
    testdataset[f'{coll}_diff']=(p.Timestamp.now()-testdataset[coll]).dt.days


In [67]:
dataset.drop(columns=['F15','F16'],inplace=True)
testdataset.drop(columns=['F15','F16'],inplace=True)

In [68]:
label={}
for coll in dataset.select_dtypes(include=['object']).columns:
    if coll!='Index':
        l=LabelEncoder()
        dataset[coll]=l.fit_transform(dataset[coll])
        testdataset[coll]=l.transform(testdataset[coll])
        label[coll]=l

In [69]:
x=dataset.drop(columns=['Index','C'])
y=dataset['C']

In [70]:
s=StandardScaler()
x=s.fit_transform(x)
testdataset_scaled=s.transform(testdataset.drop(columns=['Index']))


In [71]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)


In [72]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [73]:
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='roc_auc', n_jobs=-1)


In [74]:
grid_search.fit(xtrain, ytrain)



KeyboardInterrupt: 

In [51]:
best=grid_search.best_estimator_

In [53]:
ytestval=best.predict(xtest)
ytest_prob=best.predict_proba(xtest)[:,1]

In [54]:
print('accuracy:',accuracy_score(ytest,ytestval))

accuracy 0.7546451867958095


In [55]:
print('classification_report:',classification_report(ytest,ytestval))

classification_report:               precision    recall  f1-score   support

           0       0.75      1.00      0.86     15271
           1       0.00      0.00      0.00      4965

    accuracy                           0.75     20236
   macro avg       0.38      0.50      0.43     20236
weighted avg       0.57      0.75      0.65     20236



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [56]:
gb=GradientBoostingClassifier(n_estimators=200,learning_rate=0.1,max_depth=5,random_state=42)
gb.fit(xtrain,ytrain)

In [58]:
ytestval_gb=gb.predict(xtest)
ytestval_prob_gb=gb.predict_proba(xtest)[:,1]


In [59]:
print("gradient boost accuracy:",accuracy_score(ytest,ytestval_gb))


gradient boost accuracy: 0.7607234631350069


In [60]:
print("gradient boost classification report:",classification_report(ytest,ytestval_gb))

gradient boost classification report:               precision    recall  f1-score   support

           0       0.76      0.99      0.86     15271
           1       0.63      0.06      0.11      4965

    accuracy                           0.76     20236
   macro avg       0.70      0.52      0.48     20236
weighted avg       0.73      0.76      0.68     20236



In [62]:
ytest_train=best.predict(x)
pred_train=p.DataFrame({'Index':dataset['Index'], "Class": ytest_train})
pred_train.to_csv('predictions_train.txt',sep='\t',index=False)


In [63]:
y_pred_test=best.predict(testdataset_scaled)
pred_test=p.DataFrame({'Index': testdataset['Index'],'Class':y_pred_test})
pred_test.to_csv('predictions_test.txt',sep='\t',index=False)

In [None]:
with open('training_script.py','w') as f:
    f.write(open())