<a href="https://colab.research.google.com/github/annaantt/techlabs/blob/master/Model_Building.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import relevant libraries

In [1]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Train test split
from sklearn.model_selection import train_test_split

# Pipeline, grid, CV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

# Preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report

In [2]:
# Load the dataset
df = pd.read_csv('df_Final 3.csv', sep=';')

In [3]:
df.head()

Unnamed: 0,Date,Compound_Score,Total Volume of Tweets,Count_Negatives,Count_Positives,Count_Neutrals,Sent_Negatives,Sent_Positives,Count_News,Count_Bots,Open,High,Low,Close,Volume (BTC),Volume (Currency)
0,2017-08-01 00:00:00,0.082893,1027.0,139.0,347.0,397.0,-0.51662,0.417882,695.0,144.0,2855.81,2863.06,2823.0,2825.92,184.02,52295100.0
1,2017-08-01 01:00:00,0.05316,778.0,111.0,193.0,320.0,-0.423402,0.415385,493.0,154.0,2823.01,2860.02,2821.01,2853.38,77.3,219605.16
2,2017-08-01 02:00:00,0.124251,836.0,89.0,273.0,264.0,-0.440938,0.428661,510.0,210.0,2846.27,2858.04,2837.31,2841.6,135.83,386739.15
3,2017-08-01 03:00:00,-0.021037,984.0,250.0,236.0,314.0,-0.441173,0.396034,683.0,184.0,2841.84,2863.88,2837.73,2862.93,143.2,408360.03
4,2017-08-01 04:00:00,0.055437,751.0,114.0,195.0,305.0,-0.414978,0.417159,517.0,137.0,2862.92,2876.0,2848.11,2874.99,222.53,637045.88


In [4]:
def newTrend(row):
  trend = (row["Close"] - row["Open"])
  if trend >0:
    return 1
  else:
    return 0

df['trend'] = df.apply(lambda row: newTrend(row), axis=1)
df.head()

Unnamed: 0,Date,Compound_Score,Total Volume of Tweets,Count_Negatives,Count_Positives,Count_Neutrals,Sent_Negatives,Sent_Positives,Count_News,Count_Bots,Open,High,Low,Close,Volume (BTC),Volume (Currency),trend
0,2017-08-01 00:00:00,0.082893,1027.0,139.0,347.0,397.0,-0.51662,0.417882,695.0,144.0,2855.81,2863.06,2823.0,2825.92,184.02,52295100.0,0
1,2017-08-01 01:00:00,0.05316,778.0,111.0,193.0,320.0,-0.423402,0.415385,493.0,154.0,2823.01,2860.02,2821.01,2853.38,77.3,219605.16,1
2,2017-08-01 02:00:00,0.124251,836.0,89.0,273.0,264.0,-0.440938,0.428661,510.0,210.0,2846.27,2858.04,2837.31,2841.6,135.83,386739.15,0
3,2017-08-01 03:00:00,-0.021037,984.0,250.0,236.0,314.0,-0.441173,0.396034,683.0,184.0,2841.84,2863.88,2837.73,2862.93,143.2,408360.03,1
4,2017-08-01 04:00:00,0.055437,751.0,114.0,195.0,305.0,-0.414978,0.417159,517.0,137.0,2862.92,2876.0,2848.11,2874.99,222.53,637045.88,1


In [5]:
dataTypeSeries = df.dtypes
print(dataTypeSeries)

Date                       object
Compound_Score            float64
Total Volume of Tweets    float64
Count_Negatives           float64
Count_Positives           float64
Count_Neutrals            float64
Sent_Negatives            float64
Sent_Positives            float64
Count_News                float64
Count_Bots                float64
Open                      float64
High                      float64
Low                       float64
Close                     float64
Volume (BTC)               object
Volume (Currency)          object
trend                       int64
dtype: object


In [28]:
ds = df.isin([np.inf, -np.inf])
print(ds)

c = np.isinf(df['Count_Negatives']).values.sum()
print("It contains " + str(c) + " infinite values")

        Date  Compound_Score  ...  Volume (Currency)  trend
0      False           False  ...              False  False
1      False           False  ...              False  False
2      False           False  ...              False  False
3      False           False  ...              False  False
4      False           False  ...              False  False
...      ...             ...  ...                ...    ...
12931  False           False  ...              False  False
12932  False           False  ...              False  False
12933  False           False  ...              False  False
12934  False           False  ...              False  False
12935  False           False  ...              False  False

[12936 rows x 17 columns]
It contains 0 infinite values


In [29]:

#df.dropna()

df['Count_Negatives'].isnull().sum()

df['trend'].isnull().sum()



0

In [30]:
df['Count_Negatives'].max()

3085.0

## Dataset

## Variables (Baseline Model)

In [24]:
# Independent variables/features

X = np.array(df["trend"])

# Dependent/target variable
y = np.array(df['Count_Negatives'])

## Train test split (Baseline Model)

In [25]:
# Leave out the last 20% of the time-series data for test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0)

## Dummy classifier (Baseline Model)

In [26]:
dummy_clf = DummyClassifier(strategy="uniform")
dummy_clf.fit(X_train, y_train)

print("Training set score: {:.3f}".format(dummy_clf.score(X_train, y_train)))
print("Test set score: {:.3f}".format(dummy_clf.score(X_test, y_test)))

ValueError: ignored

In [None]:
y_pred = dummy_clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.54      0.56       141
           1       0.50      0.55      0.52       119

    accuracy                           0.54       260
   macro avg       0.54      0.54      0.54       260
weighted avg       0.55      0.54      0.54       260



## Classification models

- Logistic Regression
- Support Vector Machines (SVM)
- MLP
- Decision Tree (tree-based)
- Random Forest (tree-based)
- Gradient Boosting (tree-based)
- KNN

## Variables

In [None]:
# Independent variables/features

X = np.array(df[["score_p", "score_c", "negative_c", "negative_p", "positive_c", "positive_p",
                "count_c", "count_p"]])

# Dependent/target variable
y = np.array(df['trend'])

## Train test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, shuffle=True)

#### Logistic Regression

In [None]:
pipeLogR = Pipeline([("scaler", MinMaxScaler()), ("logreg", LogisticRegression(max_iter=10000, random_state=0))])

param_grid = {"logreg__C": [0.0001, 0.001, 0.01, 0.5, 1.0, 10.0, 100, 1000],
             "scaler": [MinMaxScaler(), StandardScaler(), None]}

grid = GridSearchCV(pipeLogR, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print("Training accuracy: {:.3f}".format(grid.score(X_train, y_train)))
print("Best cross-validation accuracy: {:.3f}".format(grid.best_score_))
print("Test set score: {:.3f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))

Training accuracy: 0.564
Best cross-validation accuracy: 0.536
Test set score: 0.529
Best parameters: {'logreg__C': 0.01, 'scaler': StandardScaler()}


In [None]:
y_pred = grid.predict(X_test)
print(y_pred[:250])
print(classification_report(y_test, y_pred))

[0 0 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 0 0 0
 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1
 0 0 0 1 1 1 0 1 1 0 1 1 1 0 0 0 1 0 0 1 0 1 1 1 0 0 0 1 0 1 1 1 1 0 0 0 0
 1 0 1 1 0 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1
 1 1 0 0 0 1 0 0 1 1 0 1 1 0 1 0 0 0 1 0 1 0 1 0 1 0 0 0 1 1 0 0 1 0 0 0 0
 0 0 0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 1 0 1 0 0 0 0 1 0 1
 1 1 1 1 1 0 1 1 1 1 0 1 0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 0]
              precision    recall  f1-score   support

           0       0.59      0.57      0.58       184
           1       0.46      0.48      0.47       141

    accuracy                           0.53       325
   macro avg       0.52      0.52      0.52       325
weighted avg       0.53      0.53      0.53       325



#### MLP

In [None]:
pipeMLP = Pipeline([("scaler", MinMaxScaler()), ("MLP", MLPClassifier(max_iter=10000, random_state=0))])

param_grid = {"MLP__alpha": [0.001, 0.01, 0.1, 1, 10, 100, 1000], 
              "MLP__hidden_layer_sizes": [(50, 50), (50, 100), (100, 50), (100, 100)],
             "scaler": [MinMaxScaler(), StandardScaler(), None]}

grid = GridSearchCV(pipeMLP, param_grid=param_grid, cv=5)

grid.fit(X_train, y_train)

print("Best cross-validation accuracy: {:.3f}".format(grid.best_score_))
print("Test set score: {:.3f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))

Best cross-validation accuracy: 0.548
Test set score: 0.563
Best parameters: {'MLP__alpha': 0.1, 'MLP__hidden_layer_sizes': (50, 100), 'scaler': None}


In [None]:
y_pred = grid.predict(X_test)
print(y_pred[:250])
print(classification_report(y_test, y_pred))

[0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 1 1 1 1 1 1 1 0 1 1 0 1 0 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 0 1 0 1 1 0 1 1 0 1 1 0 1
 1 0 1 0 0 0 0 1 0 0 0 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 1 1 0 1
 1 0 0 1 0 1 0 0 1 1 0 1 1 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 0
 0 1 0 0 0 0 1 1 0 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 1 1 0 0 0 1 1 0 1 0 1 1 1
 0 0 0 1 1 0 0 0 0 1 0 1 1 0 0 0 1 1 1 1 1 0 1 1 0 0 0 1]
              precision    recall  f1-score   support

           0       0.62      0.58      0.60       184
           1       0.50      0.55      0.52       141

    accuracy                           0.56       325
   macro avg       0.56      0.56      0.56       325
weighted avg       0.57      0.56      0.56       325



#### Support Vector Machines

In [None]:
pipeSVC = Pipeline([("scaler", MinMaxScaler()), ("SVC", SVC(random_state=0))])

param_grid = {"SVC__gamma": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
             "SVC__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000], 
              "scaler": [MinMaxScaler(), StandardScaler(), None]}

grid_model = GridSearchCV(pipeSVC, param_grid=param_grid, cv=5)

grid_model.fit(X_train, y_train)

print("Best cross-validation accuracy: {:.3f}".format(grid_model.best_score_))
print("Test set score: {:.3f}".format(grid_model.score(X_test, y_test)))
print("Best parameters: {}".format(grid_model.best_params_))

Best cross-validation accuracy: 0.548
Test set score: 0.502
Best parameters: {'SVC__C': 100, 'SVC__gamma': 0.001, 'scaler': StandardScaler()}


In [None]:
y_pred = grid.predict(X_test)
print(y_pred[:250])
print(classification_report(y_test, y_pred))

[0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 1 1 1 1 1 1 1 0 1 1 0 1 0 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 0 1 0 1 1 0 1 1 0 1 1 0 1
 1 0 1 0 0 0 0 1 0 0 0 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 1 1 0 1
 1 0 0 1 0 1 0 0 1 1 0 1 1 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 0
 0 1 0 0 0 0 1 1 0 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 1 1 0 0 0 1 1 0 1 0 1 1 1
 0 0 0 1 1 0 0 0 0 1 0 1 1 0 0 0 1 1 1 1 1 0 1 1 0 0 0 1]
              precision    recall  f1-score   support

           0       0.62      0.58      0.60       184
           1       0.50      0.55      0.52       141

    accuracy                           0.56       325
   macro avg       0.56      0.56      0.56       325
weighted avg       0.57      0.56      0.56       325



###### Decision Tree

In [None]:
pipeDT = Pipeline([("scaler", None), ("DT", DecisionTreeClassifier(random_state=0))])

param_grid = {"DT__max_depth": [2, 3, 4, 5, 6, 7, 8],
             "scaler": [None]}

grid = GridSearchCV(pipeDT, param_grid=param_grid, cv=5)

grid.fit(X_train, y_train)

print("Best cross-validation accuracy: {:.3f}".format(grid.best_score_))
print("Test set score: {:.3f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))

Best cross-validation accuracy: 0.550
Test set score: 0.557
Best parameters: {'DT__max_depth': 6, 'scaler': None}


In [None]:
y_pred = grid.predict(X_test)
print(y_pred[:250])
print(classification_report(y_test, y_pred))

[0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 1 1 0 1 0 0 0 1 0 1 1 1 0 0
 0 1 1 0 0 0 0 0 0 0 1 0 1 1 1 1 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1
 0 0 0 1 1 0 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 0 0 0 1 0 0 1 0 0 1 1 1 0 1 0 1
 1 0 1 1 0 0 0 1 0 0 0 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 0 0 0 1 0 0 0 0 1 0 0
 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 1 1 1 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 1
 1 1 1 0 0 1 0 1 0 0 0 1 1 0 0 1 0 1 1 0 1 0 1 0 1 0 0 1 0 1 0 0 1 1 0 1 1
 1 0 1 1 1 1 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1]
              precision    recall  f1-score   support

           0       0.61      0.61      0.61       184
           1       0.49      0.48      0.49       141

    accuracy                           0.56       325
   macro avg       0.55      0.55      0.55       325
weighted avg       0.56      0.56      0.56       325



#### Random Forest

In [None]:
pipeRF = Pipeline([("scaler", None), ("RF", RandomForestClassifier(random_state=0))])

param_grid = {"RF__max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
             "RF__n_estimators": [25, 50, 75, 100],
             "scaler": [None]}

grid = GridSearchCV(pipeRF, param_grid=param_grid, cv=5)

grid.fit(X_train, y_train)

print("Best cross-validation accuracy: {:.3f}".format(grid.best_score_))
print("Test set score: {:.3f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))

Best cross-validation accuracy: 0.552
Test set score: 0.551
Best parameters: {'RF__max_depth': 10, 'RF__n_estimators': 75, 'scaler': None}


In [None]:
y_pred = grid.predict(X_test)
print(y_pred[:250])
print(classification_report(y_test, y_pred))

[0 0 1 1 0 1 1 0 0 0 1 0 0 1 0 0 0 1 1 0 0 1 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0
 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0
 0 0 0 1 1 1 1 1 1 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0
 0 1 0 0 0 0 0 0 1 1 0 1 1 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0
 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0 1 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 1
 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0 1]
              precision    recall  f1-score   support

           0       0.60      0.64      0.62       184
           1       0.48      0.44      0.46       141

    accuracy                           0.55       325
   macro avg       0.54      0.54      0.54       325
weighted avg       0.55      0.55      0.55       325



##### Gradient Boosting

In [None]:
pipeGB = Pipeline([("scaler", None), ("GB", GradientBoostingClassifier(random_state=0))])

param_grid = {"GB__n_estimators": [25, 50, 75, 100], "GB__learning_rate": [0.0001, 0.001, 0.01, 0.1],
             "scaler": [None]}

grid = GridSearchCV(pipeGB, param_grid=param_grid, cv=5)

grid.fit(X_train, y_train)

print("Best cross-validation accuracy: {:.3f}".format(grid.best_score_))
print("Test set score: {:.3f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))

Best cross-validation accuracy: 0.550
Test set score: 0.572
Best parameters: {'GB__learning_rate': 0.01, 'GB__n_estimators': 50, 'scaler': None}


In [None]:
y_pred = grid.predict(X_test)
print(y_pred[:250])
print(classification_report(y_test, y_pred))

[0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 1 1 1 1 0 0 0 1 0 1 1 0 0 0
 0 0 1 0 0 0 0 0 0 0 1 0 1 1 1 1 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0
 0 0 0 1 1 0 0 1 1 1 1 0 0 0 0 0 1 0 1 1 0 0 0 1 1 0 0 1 0 0 1 1 0 0 1 0 0
 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 1 1 1 1 0 1 1 0 0 0 0 1 1 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 1 1 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 1
 1 1 1 0 0 1 0 1 0 0 0 1 1 0 0 1 0 1 1 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 1
 0 0 1 0 1 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1]
              precision    recall  f1-score   support

           0       0.61      0.69      0.65       184
           1       0.51      0.42      0.46       141

    accuracy                           0.57       325
   macro avg       0.56      0.55      0.55       325
weighted avg       0.56      0.57      0.57       325



###### KNN

In [None]:
pipeKNN = Pipeline([("scaler", MinMaxScaler()), ("KNN", KNeighborsClassifier())])

param_grid = {"KNN__n_neighbors": [3, 5, 7, 9, 11, 13, 15],
             "scaler": [MinMaxScaler(), StandardScaler(), None]}

grid = GridSearchCV(pipeKNN, param_grid=param_grid, cv=5)

grid.fit(X_train, y_train)

print("Best cross-validation accuracy: {:.3f}".format(grid.best_score_))
print("Test set score: {:.3f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))

Best cross-validation accuracy: 0.546
Test set score: 0.477
Best parameters: {'KNN__n_neighbors': 13, 'scaler': None}


In [None]:
y_pred = grid.predict(X_test)
print(y_pred[:250])
print(classification_report(y_test, y_pred))

[1 1 0 1 0 1 1 0 0 0 0 0 1 0 1 1 1 0 0 1 1 0 0 1 0 1 1 1 1 1 1 0 0 1 1 1 0
 0 0 1 0 0 1 1 1 0 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 0 0 1 1 1 1 0 1
 0 1 0 1 1 0 0 0 1 1 1 0 1 1 0 1 0 1 0 0 1 1 0 0 1 1 1 1 1 0 0 1 1 1 1 0 1
 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 1 1 1 1 0 0 1 1 0 0 1 1 1
 1 1 1 1 0 1 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 1 0 1 1 0 0 1 1 1 1 1 0 0 0
 0 1 1 1 0 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 0 0 1 0 1 1 1 0 0 1
 0 1 1 1 0 0 1 0 1 0 0 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1]
              precision    recall  f1-score   support

           0       0.56      0.36      0.44       184
           1       0.43      0.62      0.51       141

    accuracy                           0.48       325
   macro avg       0.49      0.49      0.47       325
weighted avg       0.50      0.48      0.47       325



## Automatic feature selection

In [None]:
from sklearn.feature_selection import RFE

select = RFE(RandomForestClassifier(n_estimators=50, max_depth=5, random_state=0), n_features_to_select=3)

select.fit(X_train, y_train)

RFE(estimator=RandomForestClassifier(max_depth=5, n_estimators=50,
                                     random_state=0),
    n_features_to_select=3)

In [None]:
select.score(X_test, y_test)

0.5784615384615385

In [None]:
for i in range(X.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, select.support_[i], select.ranking_[i]))

Column: 0, Selected False, Rank: 5.000
Column: 1, Selected True, Rank: 1.000
Column: 2, Selected True, Rank: 1.000
Column: 3, Selected False, Rank: 3.000
Column: 4, Selected False, Rank: 2.000
Column: 5, Selected True, Rank: 1.000
Column: 6, Selected False, Rank: 4.000
Column: 7, Selected False, Rank: 6.000


In [None]:
from sklearn.feature_selection import RFE

select = RFE(GradientBoostingClassifier(n_estimators=50, learning_rate=0.01, random_state=0), n_features_to_select=3)

select.fit(X_train, y_train)

RFE(estimator=GradientBoostingClassifier(learning_rate=0.01, n_estimators=50,
                                         random_state=0),
    n_features_to_select=3)

In [None]:
select.score(X_test, y_test)

0.5753846153846154

In [None]:
for i in range(X.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, select.support_[i], select.ranking_[i]))

Column: 0, Selected False, Rank: 4.000
Column: 1, Selected True, Rank: 1.000
Column: 2, Selected True, Rank: 1.000
Column: 3, Selected True, Rank: 1.000
Column: 4, Selected False, Rank: 6.000
Column: 5, Selected False, Rank: 2.000
Column: 6, Selected False, Rank: 5.000
Column: 7, Selected False, Rank: 3.000
