### Import Libraries afddhda

In [77]:
import pandas as pd
import numpy as np
import re

In [3]:
dataset = pd.read_csv('finished_without_irregular.csv')

In [4]:
train_dataset =dataset[dataset['Date'] < '2015-01-02']
test_dataset = dataset[dataset['Date'] >= '2015-01-02']

### Vectorization

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english',
                             min_df=0.02, 
                             max_df=0.70, 
                             max_features=20000, 
                             ngram_range=(1,3))


In [7]:
X_train_vec = count_vectorizer.fit_transform(X_train_dataset)
X_test_vec = count_vectorizer.transform(X_test_dataset)


### Feature Selection

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel

dt = DecisionTreeClassifier()
dt_train = dt.fit(X_train_vec,y_train_dataset)

optimal = SelectFromModel(dt_train, prefit=True)
X_reduced_train_vec=optimal.transform(X_train_vec)
X_reduced_test_vec=optimal.transform(X_test_vec)

### Naive Bayes

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

model = MultinomialNB()
mnb_param = {
  'alpha': np.linspace(1, 5, 1),
  'fit_prior': [True, False],  
}



best_mnb = GridSearchCV(estimator = model, 
                    cv = 10,
                    scoring='roc_auc',
                    param_grid = mnb_param)


# train the model using all training set
best_mnb.fit(X_reduced_train_vec, y_train_dataset)
print("best score for mnb with training data is "+str(best_mnb.best_score_))

best score for mnb with training data is 0.56678321283865


### Decision Tree

In [45]:
tree = DecisionTreeClassifier(random_state = 2)

# list(range(15, 40, 2)
tree_param = {'max_leaf_nodes': [15, 20, 25, 30, 35, 40],
             'max_depth': [5, 10, 15, 20, 25, 30],
             'criterion' : ['gini', 'entropy']}


best_tree = GridSearchCV(estimator = tree, 
                    cv = 10,
                    scoring='roc_auc',
                    param_grid = tree_param)


# train the model using all training set
best_tree.fit(X_reduced_train_vec, y_train_dataset)
print("best score for tree with training data is "+str(best_tree.best_score_))

best score for tree with training data is 0.5687505655522713


### Logistic Regression

In [71]:
from sklearn.linear_model import LogisticRegression


lr_param = [
  {'penalty': ['l1'], 'C': np.logspace(0, 4, 10), 'solver': ['liblinear', 'saga']},
  {'penalty': ['l2'], 'C': np.logspace(0, 4, 10), 'solver': ['newton-cg', 'lbfgs']},
 ]

lr_model = LogisticRegression(random_state=0, max_iter=1000)

best_lr = GridSearchCV(estimator = lr_model, 
                    cv = 10,
                    scoring='roc_auc',
                    param_grid = lr_param)


# train the model using all training set
best_lr.fit(X_reduced_train_vec, y_train_dataset)
print("best score for lr with training data is "+str(best_lr.best_score_))

best score for lr with training data is 0.5769079297863945


### Random Forest

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

#create a new random forest classifier
rf = RandomForestClassifier(bootstrap=True, class_weight="balanced_subsample")

#create a dictionary of all values we want to test for n_estimators, number of trees
params_rf = {'n_estimators': [100, 200, 300, 400, 500]}

#use gridsearch to test all values for n_estimators
best_rf = GridSearchCV(rf, 
                     params_rf, 
                     cv=10, 
                     scoring='roc_auc')

#fit model to training data
best_rf.fit(X_reduced_train_vec, y_train_dataset)

### AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

abc = AdaBoostClassifier()

# Ada Boost Classifier
params_abc = {'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
            'n_estimators': [50, 100, 200, 300, 400, 500, 600]}

#use gridsearch to test all values for n_estimators
best_abc = GridSearchCV(abc,
                     params_abc, 
                     cv=10, 
                     scoring='roc_auc')

best_abc.fit(X_reduced_train_vec, y_train_dataset)

### Voting Classifier

In [74]:
from sklearn.ensemble import VotingClassifier
estimators=[('mnb', best_mnb), ('lr', best_lr), ('dt', best_tree)]
#create our voting classifier, inputting our models
ensemble = VotingClassifier(estimators, voting='hard')
ensemble.fit(X_reduced_train_vec, y_train_dataset)

VotingClassifier(estimators=[('mnb', GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': array([1.]), 'fit_prior': [True, False]},
       pre_dispatch='2*n_jobs',...e_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

### Model Evaluation

In [12]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score

def roc(model):
    y_predict_dataset = model.predict(X_reduced_test_vec)
    fpr, tpr, thresholds = roc_curve(y_test_dataset, y_predict_dataset, pos_label=1)
    return auc(fpr, tpr)
def accuracy(model):
    y_predict_dataset = model.predict(X_reduced_test_vec)
    accuracy = accuracy_score(y_test_dataset, y_predict_dataset, normalize=True, sample_weight=None)
    return accuracy


In [75]:
print("MNB auc: ", round(roc(best_mnb), 5))
print("Decision Tree auc: ", round(roc(best_tree), 5))
print("Logistic Regression auc", round(roc(best_lr), 5))
print("Random Forest auc: ", round(roc(best_rf), 5))
print("Ada Boost Classifier auc: ", round(roc(best_abc), 5))
print("Ensemble auc: ", round(roc(ensemble), 5))

MNB auc:  0.5068
Decision Tree auc:  0.57342
Logistic Regression auc 0.51739
Random Forest auc:  0.51714
Ada Boost Classifier auc:  0.50378
Ensemble auc:  0.52503


In [76]:
print("MNB accuracy: ", round(accuracy(best_mnb), 5))
print("Decision Tree accuracy: ", round(accuracy(best_tree), 5))
print("Logistic Regression accuracy", round(accuracy(best_lr), 5))
print("Random Forest accuracy", round(accuracy(best_rf), 5))
print("Ada Boost Classifier accuracy", round(accuracy(best_abc), 5))
print("Ensemble accuracy", round(accuracy(ensemble), 5))

MNB accuracy:  0.50794
Decision Tree accuracy:  0.57407
Logistic Regression accuracy 0.51852
Random Forest accuracy 0.52116
Ada Boost Classifier accuracy 0.50529
Ensemble accuracy 0.52646
