#### **Libraries**

In [8]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from termcolor import colored, cprint

#### **Datasets and Analytics**

In [10]:
# Load the raw dataset and report its (rows, columns) shape.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook only runs where this exact file location exists.
dataset_path = r'C:\Users\AleynaCihangir\Desktop\MarketPlace PowerBrands Projects\00_Literature\dataset_1.csv'
data1 = pd.read_csv(dataset_path)
print(colored('Datasets Shape: ','blue'), data1.shape)

[34mDatasets Shape: [0m (50000, 301)


In [11]:
# Split into train/test: every column except 'target' is a feature, and
# 'target' is the label. 30% of the rows are held out for testing, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data1.drop(columns=['target']),
    data1['target'],
    test_size=0.3,
    random_state=0,
)
print(colored('Shape for X train and X test:', 'blue'), X_train.shape, X_test.shape)

[34mShape for X train and X test:[0m (35000, 300) (15000, 300)


In [12]:
# Snapshot the unfiltered splits so a baseline model can be trained on them later.
X_train_original, X_test_original = X_train.copy(), X_test.copy()

# A feature is treated as constant when its standard deviation on the
# training split is exactly zero. The decision is made on train only and the
# same columns are then removed from both splits.
constant_features = [
    col for col in X_train.columns
    if X_train[col].std() == 0
]
X_train = X_train.drop(columns=constant_features)
X_test = X_test.drop(columns=constant_features)
print(colored('Shape for X train and X test:', 'blue'), X_train.shape, X_test.shape)

[34mShape for X train and X test:[0m (35000, 266) (15000, 266)


In [13]:
# Remove low-variance (quasi-constant) features. The selector is fitted on the
# training split only and then applied to both splits.
sel = VarianceThreshold(threshold=0.01)
sel.fit(X_train)
support_mask = sel.get_support()  # boolean mask over X_train.columns: True = keep
print(colored('Sum of Support:', 'blue'), support_mask.sum())
features_to_keep = X_train.columns[support_mask]

# transform() hands back bare arrays, so rebuild DataFrames with the surviving
# column names AND the original row index. Without `index=` the rows would be
# relabelled 0..n-1 and would no longer be label-aligned with y_train / y_test.
X_train = pd.DataFrame(sel.transform(X_train), columns=features_to_keep, index=X_train.index)
X_test = pd.DataFrame(sel.transform(X_test), columns=features_to_keep, index=X_test.index)
print(colored('Shape for X train and X test:', 'blue'), X_train.shape, X_test.shape)

[34mSum of Support:[0m 215
[34mShape for X train and X test:[0m (35000, 215) (15000, 215)


In [14]:
# Find columns that are exact copies of an earlier column in X_train.
# `duplicated_feat` lists each redundant column exactly once, in discovery order.
duplicated_feat = []
already_flagged = set()  # fast membership test mirroring duplicated_feat
all_columns = list(X_train.columns)
for i, col_1 in enumerate(all_columns):
    if i % 10 == 0:
        print(i)  # coarse progress indicator for the pairwise scan
    # A column already known to duplicate an earlier one cannot reveal new
    # duplicates (its twins were flagged together with it), so skip it. This
    # also prevents the same name from being appended more than once when
    # three or more columns are identical.
    if col_1 in already_flagged:
        continue
    for col_2 in all_columns[i + 1:]:
        if col_2 in already_flagged:
            continue
        if X_train[col_1].equals(X_train[col_2]):
            duplicated_feat.append(col_2)
            already_flagged.add(col_2)
len(duplicated_feat)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210


10

In [15]:
# Remove the duplicated columns from both splits, then snapshot the result:
# this is the feature set after the basic filters (constant, low-variance,
# duplicated).
X_train = X_train.drop(columns=duplicated_feat)
X_test = X_test.drop(columns=duplicated_feat)
print(colored('Shape for X train and X test:', 'blue'), X_train.shape, X_test.shape)
X_train_basic_filter, X_test_basic_filter = X_train.copy(), X_test.copy()
def correlation(dataset, threshold):
    """Return the names of columns that correlate strongly with an earlier column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise column correlations (``dataset.corr()``) are inspected.
    threshold : float
        A column is flagged when the absolute correlation with at least one
        column positioned before it is strictly greater than this value.

    Returns
    -------
    set
        Names of the flagged columns. The first column is never flagged because
        nothing precedes it.
    """
    corr_matrix = dataset.corr()
    strength = np.abs(corr_matrix.to_numpy())
    # Strictly-lower-triangle mask: row i is compared only with columns j < i,
    # so each pair is examined once and the later column of the pair is flagged.
    earlier_only = np.tril(np.ones(strength.shape, dtype=bool), k=-1)
    flagged_rows = ((strength > threshold) & earlier_only).any(axis=1)
    return set(corr_matrix.columns[flagged_rows])
# Flag features whose absolute correlation with an earlier feature exceeds 0.8
# (measured on the training split only), then drop them from both splits.
corr_features = correlation(X_train, 0.8)
print(colored('correlated features: ', 'blue'), len(corr_features))
X_train = X_train.drop(columns=corr_features)
X_test = X_test.drop(columns=corr_features)
print(colored('Shape for X train and X test:', 'blue'), X_train.shape, X_test.shape)

[34mShape for X train and X test:[0m (35000, 205) (15000, 205)
[34mcorrelated features: [0m 93
[34mShape for X train and X test:[0m (35000, 112) (15000, 112)


In [16]:
# Snapshot the splits as they stand after the correlation filter.
X_train_corr = X_train.copy()
X_test_corr = X_test.copy()

# Embedded selection: fit a random forest on the training split and let
# SelectFromModel keep the features it selects (default threshold).
sel_ = SelectFromModel(RandomForestClassifier(n_estimators=50, random_state=10))
sel_.fit(X_train, y_train)
rf_selected_features = X_train.columns[sel_.get_support()]

# transform() returns bare arrays; rebuild DataFrames with the selected column
# names and the original row index so the rows stay label-aligned with
# y_train / y_test instead of being relabelled 0..n-1.
X_train_rf = pd.DataFrame(sel_.transform(X_train), columns=rf_selected_features, index=X_train.index)
X_test_rf = pd.DataFrame(sel_.transform(X_test), columns=rf_selected_features, index=X_test.index)
print(colored('Shape for X train_rf and X test_rf:', 'blue'), X_train_rf.shape, X_test_rf.shape)

[34mShape for X train_rf and X test_rf:[0m (35000, 16) (15000, 16)


In [17]:
def run_randomForests(X_train, X_test, y_train, y_test):
    """Fit a shallow random forest and print its ROC-AUC on both splits.

    A ``RandomForestClassifier`` (200 trees, ``max_depth=4``, fixed seed) is
    fitted on the training data. For the train split and then the test split,
    a red header is printed followed by the ROC-AUC computed from column 1 of
    ``predict_proba``.

    Parameters
    ----------
    X_train, X_test : feature matrices for the train and test splits.
    y_train, y_test : matching target vectors.
        NOTE(review): using ``pred[:, 1]`` presumes a two-class target - confirm.

    Returns
    -------
    None
        Results are only printed.
    """
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)

    evaluations = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for split_name, features, labels in evaluations:
        print(colored(split_name, 'red'))
        second_class_scores = rf.predict_proba(features)[:, 1]
        print('Random Forests roc-auc: {}'.format(roc_auc_score(labels, second_class_scores)))

In [18]:
# Baseline: evaluate on the unfiltered feature set.
run_randomForests(
    X_train=X_train_original,
    X_test=X_test_original,
    y_train=y_train,
    y_test=y_test,
)

[31mTrain set[0m
Random Forests roc-auc: 0.807612232524249
[31mTest set[0m
Random Forests roc-auc: 0.7868832427636059


In [19]:
# Evaluate on the features left after the basic filters.
run_randomForests(
    X_train=X_train_basic_filter,
    X_test=X_test_basic_filter,
    y_train=y_train,
    y_test=y_test,
)

[31mTrain set[0m
Random Forests roc-auc: 0.810290026780428
[31mTest set[0m
Random Forests roc-auc: 0.7914020645941601


In [20]:
# Evaluate on the features left after the correlation filter.
run_randomForests(
    X_train=X_train_corr,
    X_test=X_test_corr,
    y_train=y_train,
    y_test=y_test,
)

[31mTrain set[0m
Random Forests roc-auc: 0.8066004772684517
[31mTest set[0m
Random Forests roc-auc: 0.7859521124929707


In [21]:
# NOTE(review): this call repeats the preceding evaluation with identical
# arguments (same correlation-filtered feature set) - likely redundant.
run_randomForests(
    X_train=X_train_corr,
    X_test=X_test_corr,
    y_train=y_train,
    y_test=y_test,
)

[31mTrain set[0m
Random Forests roc-auc: 0.8066004772684517
[31mTest set[0m
Random Forests roc-auc: 0.7859521124929707


In [22]:
# Evaluate on the features kept by the random-forest based selector.
run_randomForests(
    X_train=X_train_rf,
    X_test=X_test_rf,
    y_train=y_train,
    y_test=y_test,
)

[31mTrain set[0m
Random Forests roc-auc: 0.825594244784318
[31mTest set[0m
Random Forests roc-auc: 0.8037861254524954
