In [2]:
#imports
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint

In [3]:
#load the training data (x_train, y_train) and the test data (x_test, y_test)
# Every file is read the same way: the first CSV column is used as the index.
x_train, y_train, x_test, y_test = (
    pd.read_csv(csv_path, index_col=[0])
    for csv_path in (
        'train_data/tfidf_train_train.csv',
        'train_data/y_train_train.csv',
        'train_data/tfidf_train_test.csv',
        'train_data/y_train_test.csv',
    )
)

In [4]:
# solving problems with index after loading data:
# drop the index read from the CSV files and replace it with a fresh default one
x_train, x_test, y_train, y_test = (
    frame.reset_index(drop=True)
    for frame in (x_train, x_test, y_train, y_test)
)

In [5]:
def show_results(model, x, y):
    """Print classification metrics of a fitted model on the data (x, y).

    Parameters
    ----------
    model : fitted estimator (or fitted search object) exposing ``predict``
        and either ``predict_proba`` or ``decision_function``.
    x : features passed to the model.
    y : true labels for the rows of ``x``.

    Prints F1, accuracy, recall, precision and ROC AUC; returns nothing.
    The metrics are called with their defaults and the ROC AUC score is taken
    from column 1 of ``predict_proba``, so a binary target is assumed.
    """
    predict = model.predict(x)
    print('F1 score:',f1_score(y, predict))
    print('Accuracy:',accuracy_score(y, predict))
    print('Recall:',recall_score(y, predict))
    print('Precision:',precision_score(y, predict))
    # ROC AUC needs a continuous score rather than hard labels. Prefer the
    # positive-class probability; fall back to decision_function so that
    # estimators without predict_proba (e.g. SVC with probability=False)
    # can be reported with the same helper instead of raising AttributeError.
    if hasattr(model, 'predict_proba'):
        scores = model.predict_proba(x)[:, 1]
    else:
        scores = model.decision_function(x)
    print('ROC AUC:', roc_auc_score(y, scores))

## Regresja logistyczna

In [6]:
# Baseline: logistic regression with default settings (no parameters passed)
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train.values.ravel())

# Report the same metrics on the training and the test split
print('Logistic Regression:', '', 'Train results:', sep='\n')
show_results(log_reg, x_train, y_train)
print('', 'Test results:', sep='\n')
show_results(log_reg, x_test, y_test)

Logistic Regression:

Train results:
F1 score: 0.9434208506064793
Accuracy: 0.9553170850006063
Recall: 0.929154695297142
Precision: 0.9581319195384376
ROC AUC: 0.9905163082141264

Test results:
F1 score: 0.9350767275038592
Accuracy: 0.9494234986206409
Recall: 0.9179889463362453
Precision: 0.9528127313101407
ROC AUC: 0.9867215589592845


Regresja logistyczna radzi sobie całkiem dobrze, dlatego też nie będziemy dobierać do niej hiperparametrów.

## Decision Tree

In [7]:
#Decision Tree
# Fixed random_state: scikit-learn trees permute features randomly when
# searching for splits, so an unseeded tree can give different metrics on
# every run of this cell.
tree=DecisionTreeClassifier(random_state=42)
tree.fit(x_train,y_train.values.ravel())
print('Decision Tree:')
print()
print('Train results:')
show_results(tree, x_train, y_train)
print()
print('Test results:')
show_results(tree, x_test, y_test)

Decision Tree:

Train results:
F1 score: 0.999886574161594
Accuracy: 0.9999090578392142
Recall: 0.9997731740511114
Precision: 1.0
ROC AUC: 0.9999999827831736

Test results:
F1 score: 0.9118121527155404
Accuracy: 0.9303954162835113
Recall: 0.9069352825815653
Precision: 0.91674175527122
ROC AUC: 0.9267012828150417


Porównując wyniki zbioru treningowego i testowego, widzimy, że drzewo się przeucza. Spróbujmy dobrać hiperparametry.

In [12]:
# Search space for the parameters that limit tree growth.
# scipy's randint excludes `high`, so e.g. max_depth is drawn from 1..19.
param_distributions = {
    'max_depth': randint(low=1, high=20),
    'min_samples_leaf': randint(low=1, high=10),
    'min_samples_split': randint(low=2, high=10)
}

# Seed both the estimator and the search so that the sampled candidates,
# the CV scores and the reported best_params_ are reproducible.
tree2 = DecisionTreeClassifier(random_state=42)
search = RandomizedSearchCV(tree2, param_distributions, n_iter=100, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
# Pass the labels as a 1-D array, consistently with the other fit calls.
search.fit(x_train, y_train.values.ravel())
print(search.best_params_)

{'max_depth': 12, 'min_samples_leaf': 1, 'min_samples_split': 2}


Spróbujmy teraz użyć tych parametrów do nowego modelu.

In [16]:
# Metrics of the tuned decision tree; predictions go through the fitted search object
print('Decision Tree with parameters:', '', 'Train results:', sep='\n')
show_results(search, x_train, y_train)
print('', 'Test results:', sep='\n')
show_results(search, x_test, y_test)

Decision Tree with parameters:

Train results:
F1 score: 0.9539291157047399
Accuracy: 0.9642597308112041
Recall: 0.922879177377892
Precision: 0.9871411241407198
ROC AUC: 0.9778156505022808

Test results:
F1 score: 0.9198741672834937
Accuracy: 0.9387423074202448
Recall: 0.8862542342663576
Precision: 0.9561454125793422
ROC AUC: 0.937443575170834


Dzięki dobranym hiperparametrom udało nam się ograniczyć przeuczenie modelu (różnica między wynikami na zbiorze treningowym i testowym wyraźnie zmalała) i poprawić większość metryk na zbiorze testowym.

## Random Forest

In [15]:
# Random forest with default hyperparameters.
# Fixed random_state: bootstrap sampling and feature selection are random,
# so an unseeded forest gives different metrics on every run of this cell.
rand_forest=RandomForestClassifier(random_state=42)
rand_forest.fit(x_train,y_train.values.ravel())
print('Random Forest:')
print()
print('Train results:')
show_results(rand_forest, x_train, y_train)
print()
print('Test results:')
show_results(rand_forest, x_test, y_test)

Random Forest:

Train results:
F1 score: 0.9998865827378927
Accuracy: 0.9999090578392142
Recall: 0.999848782700741
Precision: 0.9999243856332704
ROC AUC: 0.9999999751312508

Test results:
F1 score: 0.9332960164194422
Accuracy: 0.9494234986206409
Recall: 0.8917810661436977
Precision: 0.9788649706457926
ROC AUC: 0.9885079619426965


Patrząc na wyniki, ponownie możemy podejrzewać, że model się przeuczył. Spróbujmy dobrać hiperparametry, aby poprawić jakość modelu. Zastosujemy do tego RandomizedSearchCV i walidację krzyżową.

In [6]:
# Candidate depth limits for the forest.
param_dist = {
    'max_depth': [10, 20, 30, 40, 50]
}
# Seed the forest itself: the search's random_state only controls which
# candidates are sampled, not the randomness inside each fitted forest.
rf = RandomForestClassifier(random_state=42)
# n_iter equals the number of listed values, so every candidate is evaluated.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    random_state=42
)
random_search.fit(x_train, y_train.values.ravel())
print(random_search.best_params_)



{'max_depth': 50}


In [7]:
# Metrics of the tuned random forest; predictions go through the fitted search object
print('Random Forest with parameters:', '', 'Train results:', sep='\n')
show_results(random_search, x_train, y_train)
print('', 'Test results:', sep='\n')
show_results(random_search, x_test, y_test)

Random Forest with parameters:

Train results:
F1 score: 0.9966240564427417
Accuracy: 0.9973020492300231
Recall: 0.9932708301829729
Precision: 1.0
ROC AUC: 0.999985168660546

Test results:
F1 score: 0.9349706949483673
Accuracy: 0.9505552804696895
Recall: 0.8958816188268853
Precision: 0.9776264591439688
ROC AUC: 0.9876625825320269


Jak widać, nie zaszła większa poprawa w porównaniu do Random Forest bez doboru hiperparametrów.