# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency


import random


from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif

# Create y-values

In [2]:
n = 100000
A = [int(x/(0.5*n)) for x in range (n)] # Balanced Data Set
D = {'y':A}
data = pd.DataFrame(D).astype(bool)
data

Unnamed: 0,y
0,False
1,False
2,False
3,False
4,False
...,...
99995,True
99996,True
99997,True
99998,True


# Create x-features with Decreasing Levels of Accuracy, Keeping Recall and Precision Balanced

In [3]:
for x in range (100, 45, -5):
    s = 'Acc_' + str(x) + '_Pre_' + str(x) + '_Rec_' + str(x)
    noise = np.random.random(n)
    data[s] = np.where ( noise > x/100, np.logical_not(data['y']), data['y'] )
data

Unnamed: 0,y,Acc_100_Pre_100_Rec_100,Acc_95_Pre_95_Rec_95,Acc_90_Pre_90_Rec_90,Acc_85_Pre_85_Rec_85,Acc_80_Pre_80_Rec_80,Acc_75_Pre_75_Rec_75,Acc_70_Pre_70_Rec_70,Acc_65_Pre_65_Rec_65,Acc_60_Pre_60_Rec_60,Acc_55_Pre_55_Rec_55,Acc_50_Pre_50_Rec_50
0,False,False,False,True,False,False,True,False,False,False,True,False
1,False,False,False,False,True,True,False,False,True,True,True,True
2,False,False,True,False,False,False,False,False,True,True,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,True,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,True,True,True,False,True,True,False,False,True,False,True,False
99996,True,True,True,True,False,True,True,False,True,True,True,False
99997,True,True,True,True,True,False,True,True,False,False,False,False
99998,True,True,True,True,True,True,True,True,True,True,True,True


In [4]:
for x in range (100, 45, -5):
    s = 'Acc_' + str(x) + '_Pre_' + str(x) + '_Rec_' + str(x)
    C = confusion_matrix(data['y'],data[s])
    acc = round(accuracy_score(data['y'], data[s])*100,2)
    rec = round(recall_score(data['y'], data[s])*100,2)
    pre = round(precision_score(data['y'], data[s])*100,2)

    print (s)
    print (C/n*100)
    print(acc, rec, pre)
    print ()

Acc_100_Pre_100_Rec_100
[[50.  0.]
 [ 0. 50.]]
100.0 100.0 100.0

Acc_95_Pre_95_Rec_95
[[47.497  2.503]
 [ 2.555 47.445]]
94.94 94.89 94.99

Acc_90_Pre_90_Rec_90
[[45.013  4.987]
 [ 5.154 44.846]]
89.86 89.69 89.99

Acc_85_Pre_85_Rec_85
[[42.454  7.546]
 [ 7.49  42.51 ]]
84.96 85.02 84.92

Acc_80_Pre_80_Rec_80
[[39.948 10.052]
 [ 9.99  40.01 ]]
79.96 80.02 79.92

Acc_75_Pre_75_Rec_75
[[37.574 12.426]
 [12.481 37.519]]
75.09 75.04 75.12

Acc_70_Pre_70_Rec_70
[[35.007 14.993]
 [14.889 35.111]]
70.12 70.22 70.08

Acc_65_Pre_65_Rec_65
[[32.524 17.476]
 [17.625 32.375]]
64.9 64.75 64.94

Acc_60_Pre_60_Rec_60
[[29.834 20.166]
 [20.111 29.889]]
59.72 59.78 59.71

Acc_55_Pre_55_Rec_55
[[27.583 22.417]
 [22.51  27.49 ]]
55.07 54.98 55.08

Acc_50_Pre_50_Rec_50
[[25.031 24.969]
 [24.915 25.085]]
50.12 50.17 50.12



# Create x-Features with Perfect Recall but Decreasing Precision by Steps

- We start with a feature $y$ with $n$ rows, half True and half False.
- We have a list, $noise$, of random numbers in $[0,1)$.  
- We create a new feature for each value of $x \in \{100,90,\dots, 50\}$ that is $y$, except negating the values if $noise>\frac{2x-100}{x}$ and $y$ is False.
- The confusion matrix (as percentages) for $y$ on itself is:
$$
\left[
\begin{matrix}
    50 & 0 \cr
    0 & 50 \cr
\end{matrix}
\right]
$$
- The confusion martrix for the new feature is:
$$
\left[
\begin{matrix}
    \displaystyle 50 \times \frac{2x-100}{x} & \displaystyle 50\left(1 - \frac{2x-100}{x}\right) \cr
    0 & 50 \cr
\end{matrix}
\right]
$$
- Accuracy is now:
$$\frac{3x-100}{2x}$$
- Precision is now:
$$\frac{x}{100}$$
- And Recall is still 100\%.


In [5]:
for x in range (100, 45, -5):
    a = int((300*x-10000)/(2*x))
    p = int(x)
    r = 100
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    noise = np.random.random(n)
    data[s] = np.where ( (noise > (2*x-100)/x ) & (data['y']==False), np.logical_not(data['y']), data['y'] )
data

100 100 100 100
95 97 95 100
90 94 90 100
85 91 85 100
80 87 80 100
75 83 75 100
70 78 70 100
65 73 65 100
60 66 60 100
55 59 55 100
50 50 50 100


Unnamed: 0,y,Acc_100_Pre_100_Rec_100,Acc_95_Pre_95_Rec_95,Acc_90_Pre_90_Rec_90,Acc_85_Pre_85_Rec_85,Acc_80_Pre_80_Rec_80,Acc_75_Pre_75_Rec_75,Acc_70_Pre_70_Rec_70,Acc_65_Pre_65_Rec_65,Acc_60_Pre_60_Rec_60,...,Acc_97_Pre_95_Rec_100,Acc_94_Pre_90_Rec_100,Acc_91_Pre_85_Rec_100,Acc_87_Pre_80_Rec_100,Acc_83_Pre_75_Rec_100,Acc_78_Pre_70_Rec_100,Acc_73_Pre_65_Rec_100,Acc_66_Pre_60_Rec_100,Acc_59_Pre_55_Rec_100,Acc_50_Pre_50_Rec_100
0,False,False,False,True,False,False,True,False,False,False,...,False,False,False,True,False,False,False,False,False,True
1,False,False,False,False,True,True,False,False,True,True,...,False,False,False,False,True,False,False,False,True,True
2,False,False,True,False,False,False,False,False,True,True,...,False,False,True,True,False,True,True,True,True,True
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,True,True,True,True
4,False,False,False,False,False,False,False,False,True,True,...,False,True,False,False,False,False,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,True,True,True,False,True,True,False,False,True,False,...,True,True,True,True,True,True,True,True,True,True
99996,True,True,True,True,False,True,True,False,True,True,...,True,True,True,True,True,True,True,True,True,True
99997,True,True,True,True,True,False,True,True,False,False,...,True,True,True,True,True,True,True,True,True,True
99998,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [6]:
for x in range (100, 45, -5):
    a = int((300*x-10000)/(2*x))
    p = int(x)
    r = 100
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    C = confusion_matrix(data['y'],data[s])
    acc = round(accuracy_score(data['y'], data[s])*100,2)
    rec = round(recall_score(data['y'], data[s])*100,2)
    pre = round(precision_score(data['y'], data[s])*100,2)

    print (s)
    print (C/n*100)
    print(acc, pre, rec)
    print ()

100 100 100 100
Acc_100_Pre_100_Rec_100
[[50.  0.]
 [ 0. 50.]]
100.0 100.0 100.0

95 97 95 100
Acc_97_Pre_95_Rec_100
[[47.372  2.628]
 [ 0.    50.   ]]
97.37 95.01 100.0

90 94 90 100
Acc_94_Pre_90_Rec_100
[[44.676  5.324]
 [ 0.    50.   ]]
94.68 90.38 100.0

85 91 85 100
Acc_91_Pre_85_Rec_100
[[41.188  8.812]
 [ 0.    50.   ]]
91.19 85.02 100.0

80 87 80 100
Acc_87_Pre_80_Rec_100
[[37.537 12.463]
 [ 0.    50.   ]]
87.54 80.05 100.0

75 83 75 100
Acc_83_Pre_75_Rec_100
[[33.456 16.544]
 [ 0.    50.   ]]
83.46 75.14 100.0

70 78 70 100
Acc_78_Pre_70_Rec_100
[[28.631 21.369]
 [ 0.    50.   ]]
78.63 70.06 100.0

65 73 65 100
Acc_73_Pre_65_Rec_100
[[23.167 26.833]
 [ 0.    50.   ]]
73.17 65.08 100.0

60 66 60 100
Acc_66_Pre_60_Rec_100
[[16.528 33.472]
 [ 0.    50.   ]]
66.53 59.9 100.0

55 59 55 100
Acc_59_Pre_55_Rec_100
[[ 9.126 40.874]
 [ 0.    50.   ]]
59.13 55.02 100.0

50 50 50 100
Acc_50_Pre_50_Rec_100
[[ 0. 50.]
 [ 0. 50.]]
50.0 50.0 100.0



# Create x-Features with Perfect Precision but Decreasing Recall by Steps

- We start with a feature $y$ with $n$ rows, half True and half False.
- We have a list, $noise$, of random numbers in $[0,1)$.  
- We create a new feature for each value of $x \in \{100,90,\dots, 50\}$ that is $y$, except negating the values if $noise>x/100$ and $y$ is True.
- The confusion matrix (as percentages) for $y$ on itself is:
$$
\left[
\begin{matrix}
    50 & 0 \cr
    0 & 50 \cr
\end{matrix}
\right]
$$
- The confusion martrix for the new feature is:
$$
\left[
\begin{matrix}
    \displaystyle 50 & 0 \cr
    \displaystyle 50\left(1 - \frac{x}{100}\right) & \displaystyle 50 \times \frac{x}{100} \cr
\end{matrix}
\right]
$$
- Accuracy is now:
$$\frac{100-x}{200}$$
- Precision is still 100\%.
- And Recall is now:
$$\frac{x}{100}$$



In [7]:
for x in range (100, 45, -5):
    a = int((100+x)/2)
    p = 100
    r = int(x)
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    noise = np.random.random(n)
    data[s] = np.where ( (noise > x/100) & (data['y']==True), np.logical_not(data['y']), data['y'] )
data

100 100 100 100
95 97 100 95
90 95 100 90
85 92 100 85
80 90 100 80
75 87 100 75
70 85 100 70
65 82 100 65
60 80 100 60
55 77 100 55
50 75 100 50


Unnamed: 0,y,Acc_100_Pre_100_Rec_100,Acc_95_Pre_95_Rec_95,Acc_90_Pre_90_Rec_90,Acc_85_Pre_85_Rec_85,Acc_80_Pre_80_Rec_80,Acc_75_Pre_75_Rec_75,Acc_70_Pre_70_Rec_70,Acc_65_Pre_65_Rec_65,Acc_60_Pre_60_Rec_60,...,Acc_97_Pre_100_Rec_95,Acc_95_Pre_100_Rec_90,Acc_92_Pre_100_Rec_85,Acc_90_Pre_100_Rec_80,Acc_87_Pre_100_Rec_75,Acc_85_Pre_100_Rec_70,Acc_82_Pre_100_Rec_65,Acc_80_Pre_100_Rec_60,Acc_77_Pre_100_Rec_55,Acc_75_Pre_100_Rec_50
0,False,False,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,True,True,False,False,True,True,...,False,False,False,False,False,False,False,False,False,False
2,False,False,True,False,False,False,False,False,True,True,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,True,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,True,True,True,False,True,True,False,False,True,False,...,True,True,False,True,True,True,False,True,True,True
99996,True,True,True,True,False,True,True,False,True,True,...,True,True,True,True,True,False,True,False,False,True
99997,True,True,True,True,True,False,True,True,False,False,...,True,True,True,True,True,True,True,True,False,True
99998,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,False,True,False


In [8]:
for x in range (100, 45, -5):
    a = int((100+x)/2)
    p = 100
    r = int(x)
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    C = confusion_matrix(data['y'],data[s])
    acc = round(accuracy_score(data['y'], data[s])*100,2)
    rec = round(recall_score(data['y'], data[s])*100,2)
    pre = round(precision_score(data['y'], data[s])*100,2)

    print (s)
    print (C/n*100)
    print(acc, pre, rec)
    print ()

100 100 100 100
Acc_100_Pre_100_Rec_100
[[50.  0.]
 [ 0. 50.]]
100.0 100.0 100.0

95 97 100 95
Acc_97_Pre_100_Rec_95
[[50.     0.   ]
 [ 2.635 47.365]]
97.36 100.0 94.73

90 95 100 90
Acc_95_Pre_100_Rec_90
[[50.    0.  ]
 [ 5.05 44.95]]
94.95 100.0 89.9

85 92 100 85
Acc_92_Pre_100_Rec_85
[[50.     0.   ]
 [ 7.416 42.584]]
92.58 100.0 85.17

80 90 100 80
Acc_90_Pre_100_Rec_80
[[50.     0.   ]
 [ 9.864 40.136]]
90.14 100.0 80.27

75 87 100 75
Acc_87_Pre_100_Rec_75
[[50.     0.   ]
 [12.447 37.553]]
87.55 100.0 75.11

70 85 100 70
Acc_85_Pre_100_Rec_70
[[50.     0.   ]
 [14.937 35.063]]
85.06 100.0 70.13

65 82 100 65
Acc_82_Pre_100_Rec_65
[[50.     0.   ]
 [17.608 32.392]]
82.39 100.0 64.78

60 80 100 60
Acc_80_Pre_100_Rec_60
[[50.     0.   ]
 [20.173 29.827]]
79.83 100.0 59.65

55 77 100 55
Acc_77_Pre_100_Rec_55
[[50.     0.   ]
 [22.467 27.533]]
77.53 100.0 55.07

50 75 100 50
Acc_75_Pre_100_Rec_50
[[50.     0.   ]
 [25.087 24.913]]
74.91 100.0 49.83



# Create x-Features with Perfect Recall but Decreasing Accuracy by Steps

- We start with a feature $y$ with $n$ rows, half True and half False.
- We have a list, $noise$, of random numbers in $[0,1)$.  
- We create a new feature for each value of $x \in \{100,90,\dots, 50\}$ that is $y$, except negating the values if $noise>(2x-100)/100$ and $y$ is False.
- The confusion matrix (as percentages) for $y$ on itself is:
$$
\left[
\begin{matrix}
    50 & 0 \cr
    0 & 50 \cr
\end{matrix}
\right]
$$
- The confusion martrix for the new feature is:
$$
\left[
\begin{matrix}
    \displaystyle 50\times \frac{2x-100}{100} & \displaystyle 50 \left( 1 -  \frac{2x-100}{100}\right) \cr
    0 & \displaystyle 50  \cr
\end{matrix}
\right]
$$
- Accuracy is now:
$$\frac{x}{100}$$
- Precision is now:
$$\frac{50}{150-x}$$
- And Recall is still 100\%.




In [9]:
for x in range (100, 45, -5):
    a = int(x)
    p = int(5000/(150-x))
    r = 100
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    noise = np.random.random(n)
    data[s] = np.where ( (noise > (2*x-100)/100 ) & (data['y']==False), np.logical_not(data['y']), data['y'] )
data

100 100 100 100
95 95 90 100
90 90 83 100
85 85 76 100
80 80 71 100
75 75 66 100
70 70 62 100
65 65 58 100
60 60 55 100
55 55 52 100
50 50 50 100


Unnamed: 0,y,Acc_100_Pre_100_Rec_100,Acc_95_Pre_95_Rec_95,Acc_90_Pre_90_Rec_90,Acc_85_Pre_85_Rec_85,Acc_80_Pre_80_Rec_80,Acc_75_Pre_75_Rec_75,Acc_70_Pre_70_Rec_70,Acc_65_Pre_65_Rec_65,Acc_60_Pre_60_Rec_60,...,Acc_75_Pre_100_Rec_50,Acc_95_Pre_90_Rec_100,Acc_90_Pre_83_Rec_100,Acc_85_Pre_76_Rec_100,Acc_80_Pre_71_Rec_100,Acc_75_Pre_66_Rec_100,Acc_70_Pre_62_Rec_100,Acc_65_Pre_58_Rec_100,Acc_60_Pre_55_Rec_100,Acc_55_Pre_52_Rec_100
0,False,False,False,True,False,False,True,False,False,False,...,False,False,True,False,False,False,True,False,True,True
1,False,False,False,False,True,True,False,False,True,True,...,False,False,False,True,False,False,True,True,True,True
2,False,False,True,False,False,False,False,False,True,True,...,False,True,True,True,False,True,False,False,True,True
3,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,True,False,True,False,True
4,False,False,False,False,False,False,False,False,True,True,...,False,False,False,False,False,False,True,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,True,True,True,False,True,True,False,False,True,False,...,True,True,True,True,True,True,True,True,True,True
99996,True,True,True,True,False,True,True,False,True,True,...,True,True,True,True,True,True,True,True,True,True
99997,True,True,True,True,True,False,True,True,False,False,...,True,True,True,True,True,True,True,True,True,True
99998,True,True,True,True,True,True,True,True,True,True,...,False,True,True,True,True,True,True,True,True,True


In [10]:
for x in range (100, 45, -5):
    a = int(x)
    p = int(5000/(150-x))
    r = 100
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    C = confusion_matrix(data['y'],data[s])
    acc = round(accuracy_score(data['y'], data[s])*100,2)
    rec = round(recall_score(data['y'], data[s])*100,2)
    pre = round(precision_score(data['y'], data[s])*100,2)

    print (s)
    print (C/n*100)
    print(acc, pre, rec)
    print ()

100 100 100 100
Acc_100_Pre_100_Rec_100
[[50.  0.]
 [ 0. 50.]]
100.0 100.0 100.0

95 95 90 100
Acc_95_Pre_90_Rec_100
[[45.034  4.966]
 [ 0.    50.   ]]
95.03 90.97 100.0

90 90 83 100
Acc_90_Pre_83_Rec_100
[[40.042  9.958]
 [ 0.    50.   ]]
90.04 83.39 100.0

85 85 76 100
Acc_85_Pre_76_Rec_100
[[34.943 15.057]
 [ 0.    50.   ]]
84.94 76.86 100.0

80 80 71 100
Acc_80_Pre_71_Rec_100
[[29.929 20.071]
 [ 0.    50.   ]]
79.93 71.36 100.0

75 75 66 100
Acc_75_Pre_66_Rec_100
[[25.008 24.992]
 [ 0.    50.   ]]
75.01 66.67 100.0

70 70 62 100
Acc_70_Pre_62_Rec_100
[[20.114 29.886]
 [ 0.    50.   ]]
70.11 62.59 100.0

65 65 58 100
Acc_65_Pre_58_Rec_100
[[15.021 34.979]
 [ 0.    50.   ]]
65.02 58.84 100.0

60 60 55 100
Acc_60_Pre_55_Rec_100
[[10.084 39.916]
 [ 0.    50.   ]]
60.08 55.61 100.0

55 55 52 100
Acc_55_Pre_52_Rec_100
[[ 4.961 45.039]
 [ 0.    50.   ]]
54.96 52.61 100.0

50 50 50 100
Acc_50_Pre_50_Rec_100
[[ 0. 50.]
 [ 0. 50.]]
50.0 50.0 100.0



# Create x-Features with Perfect Precision but Decreasing Accuracy by Steps

- We start with a feature $y$ with $n$ rows, half True and half False.
- We have a list, $noise$, of random numbers in $[0,1)$.  
- We create a new feature for each value of $x \in \{100,90,\dots, 50\}$ that is $y$, except negating the values if $noise>(2x-100)/100$ and $y$ is True.
- The confusion matrix (as percentages) for $y$ on itself is:
$$
\left[
\begin{matrix}
    50 & 0 \cr
    0 & 50 \cr
\end{matrix}
\right]
$$
- The confusion martrix for the new feature is:
$$
\left[
\begin{matrix}
    50 & 0 \cr
    \displaystyle 50 \left( 1 -  \frac{2x-100}{100}\right) & \displaystyle 50\times \frac{2x-100}{100}  \cr
\end{matrix}
\right]
$$
- Accuracy is now:
$$\frac{x}{100}$$
- Precision is still 100%.
- And Recall is now:
$$\frac{2x-100}{100} = \frac{x-50}{50}$$





In [11]:
for x in range (100, 50, -5):
    a = int(x)
    p = 100
    r = int(100*(x-50)/50)
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    noise = np.random.random(n)
    data[s] = np.where ( (noise > (2*x-100)/100 ) & (data['y']==True), np.logical_not(data['y']), data['y'] )
data

100 100 100 100
95 95 100 90
90 90 100 80
85 85 100 70
80 80 100 60
75 75 100 50
70 70 100 40
65 65 100 30
60 60 100 20
55 55 100 10


Unnamed: 0,y,Acc_100_Pre_100_Rec_100,Acc_95_Pre_95_Rec_95,Acc_90_Pre_90_Rec_90,Acc_85_Pre_85_Rec_85,Acc_80_Pre_80_Rec_80,Acc_75_Pre_75_Rec_75,Acc_70_Pre_70_Rec_70,Acc_65_Pre_65_Rec_65,Acc_60_Pre_60_Rec_60,...,Acc_80_Pre_71_Rec_100,Acc_75_Pre_66_Rec_100,Acc_70_Pre_62_Rec_100,Acc_65_Pre_58_Rec_100,Acc_60_Pre_55_Rec_100,Acc_55_Pre_52_Rec_100,Acc_70_Pre_100_Rec_40,Acc_65_Pre_100_Rec_30,Acc_60_Pre_100_Rec_20,Acc_55_Pre_100_Rec_10
0,False,False,False,True,False,False,True,False,False,False,...,False,False,True,False,True,True,False,False,False,False
1,False,False,False,False,True,True,False,False,True,True,...,False,False,True,True,True,True,False,False,False,False
2,False,False,True,False,False,False,False,False,True,True,...,False,True,False,False,True,True,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,True,False,True,False,True,False,False,False,False
4,False,False,False,False,False,False,False,False,True,True,...,False,False,True,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,True,True,True,False,True,True,False,False,True,False,...,True,True,True,True,True,True,False,False,False,False
99996,True,True,True,True,False,True,True,False,True,True,...,True,True,True,True,True,True,True,False,True,False
99997,True,True,True,True,True,False,True,True,False,False,...,True,True,True,True,True,True,True,True,False,True
99998,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,False,True,False,True


In [12]:
for x in range (100, 50, -5):
    a = int(x)
    p = 100
    r = int(100*(x-50)/50)
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    C = confusion_matrix(data['y'],data[s])
    acc = round(accuracy_score(data['y'], data[s])*100,2)
    rec = round(recall_score(data['y'], data[s])*100,2)
    pre = round(precision_score(data['y'], data[s])*100,2)

    print (s)
    print (C/n*100)
    print(acc, pre, rec)
    print ()

100 100 100 100
Acc_100_Pre_100_Rec_100
[[50.  0.]
 [ 0. 50.]]
100.0 100.0 100.0

95 95 100 90
Acc_95_Pre_100_Rec_90
[[50.     0.   ]
 [ 5.134 44.866]]
94.87 100.0 89.73

90 90 100 80
Acc_90_Pre_100_Rec_80
[[50.     0.   ]
 [10.098 39.902]]
89.9 100.0 79.8

85 85 100 70
Acc_85_Pre_100_Rec_70
[[50.     0.   ]
 [15.004 34.996]]
85.0 100.0 69.99

80 80 100 60
Acc_80_Pre_100_Rec_60
[[50.     0.   ]
 [20.098 29.902]]
79.9 100.0 59.8

75 75 100 50
Acc_75_Pre_100_Rec_50
[[50.     0.   ]
 [25.142 24.858]]
74.86 100.0 49.72

70 70 100 40
Acc_70_Pre_100_Rec_40
[[50.     0.   ]
 [29.746 20.254]]
70.25 100.0 40.51

65 65 100 30
Acc_65_Pre_100_Rec_30
[[50.     0.   ]
 [34.812 15.188]]
65.19 100.0 30.38

60 60 100 20
Acc_60_Pre_100_Rec_20
[[50.    0.  ]
 [39.94 10.06]]
60.06 100.0 20.12

55 55 100 10
Acc_55_Pre_100_Rec_10
[[50.     0.   ]
 [44.962  5.038]]
55.04 100.0 10.08



In [13]:
for row in data:
    print (row)

y
Acc_100_Pre_100_Rec_100
Acc_95_Pre_95_Rec_95
Acc_90_Pre_90_Rec_90
Acc_85_Pre_85_Rec_85
Acc_80_Pre_80_Rec_80
Acc_75_Pre_75_Rec_75
Acc_70_Pre_70_Rec_70
Acc_65_Pre_65_Rec_65
Acc_60_Pre_60_Rec_60
Acc_55_Pre_55_Rec_55
Acc_50_Pre_50_Rec_50
Acc_97_Pre_95_Rec_100
Acc_94_Pre_90_Rec_100
Acc_91_Pre_85_Rec_100
Acc_87_Pre_80_Rec_100
Acc_83_Pre_75_Rec_100
Acc_78_Pre_70_Rec_100
Acc_73_Pre_65_Rec_100
Acc_66_Pre_60_Rec_100
Acc_59_Pre_55_Rec_100
Acc_50_Pre_50_Rec_100
Acc_97_Pre_100_Rec_95
Acc_95_Pre_100_Rec_90
Acc_92_Pre_100_Rec_85
Acc_90_Pre_100_Rec_80
Acc_87_Pre_100_Rec_75
Acc_85_Pre_100_Rec_70
Acc_82_Pre_100_Rec_65
Acc_80_Pre_100_Rec_60
Acc_77_Pre_100_Rec_55
Acc_75_Pre_100_Rec_50
Acc_95_Pre_90_Rec_100
Acc_90_Pre_83_Rec_100
Acc_85_Pre_76_Rec_100
Acc_80_Pre_71_Rec_100
Acc_75_Pre_66_Rec_100
Acc_70_Pre_62_Rec_100
Acc_65_Pre_58_Rec_100
Acc_60_Pre_55_Rec_100
Acc_55_Pre_52_Rec_100
Acc_70_Pre_100_Rec_40
Acc_65_Pre_100_Rec_30
Acc_60_Pre_100_Rec_20
Acc_55_Pre_100_Rec_10


# Feature Selection

## Variance Threshold
- I don't think that variance is at all relevant to our situation.  

In [14]:
V = []
for row in data:
    v = data[row].var()
    v = round(v,4)
    V.append([v, row])
V = sorted(V, key=lambda x:x[0], reverse=True)
for row in V:
    print (row)

[0.25, 'y']
[0.25, 'Acc_100_Pre_100_Rec_100']
[0.25, 'Acc_95_Pre_95_Rec_95']
[0.25, 'Acc_90_Pre_90_Rec_90']
[0.25, 'Acc_85_Pre_85_Rec_85']
[0.25, 'Acc_80_Pre_80_Rec_80']
[0.25, 'Acc_75_Pre_75_Rec_75']
[0.25, 'Acc_70_Pre_70_Rec_70']
[0.25, 'Acc_65_Pre_65_Rec_65']
[0.25, 'Acc_60_Pre_60_Rec_60']
[0.25, 'Acc_55_Pre_55_Rec_55']
[0.25, 'Acc_50_Pre_50_Rec_50']
[0.2493, 'Acc_97_Pre_95_Rec_100']
[0.2493, 'Acc_97_Pre_100_Rec_95']
[0.2475, 'Acc_95_Pre_90_Rec_100']
[0.2474, 'Acc_95_Pre_100_Rec_90']
[0.2472, 'Acc_94_Pre_90_Rec_100']
[0.2445, 'Acc_92_Pre_100_Rec_85']
[0.2422, 'Acc_91_Pre_85_Rec_100']
[0.2401, 'Acc_90_Pre_83_Rec_100']
[0.2398, 'Acc_90_Pre_100_Rec_80']
[0.2345, 'Acc_87_Pre_80_Rec_100']
[0.2345, 'Acc_87_Pre_100_Rec_75']
[0.2275, 'Acc_85_Pre_100_Rec_70']
[0.2273, 'Acc_85_Pre_76_Rec_100']
[0.2226, 'Acc_83_Pre_75_Rec_100']
[0.219, 'Acc_82_Pre_100_Rec_65']
[0.2097, 'Acc_80_Pre_71_Rec_100']
[0.2096, 'Acc_80_Pre_100_Rec_60']
[0.2043, 'Acc_78_Pre_70_Rec_100']
[0.1995, 'Acc_77_Pre_100_Rec_55']

In [15]:
sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
sel.fit_transform(data)
data_VT = data[data.columns[sel.get_support(indices=True)]]
data_VT.columns.symmetric_difference(data.columns)

Index(['Acc_50_Pre_50_Rec_100', 'Acc_55_Pre_100_Rec_10',
       'Acc_55_Pre_52_Rec_100', 'Acc_59_Pre_55_Rec_100'],
      dtype='object')

## SelectKBest
- Test first with the default scoring function, f_classif,
- Then test with $\chi^2$.

In [16]:
# Create and fit selector
selector = SelectKBest(f_classif, k='all')
selector.fit(data, data['y'])
scores = selector.scores_
Scores = []
for i, feature in enumerate(data):
    Scores.append([round(scores[i],0), feature])
Scores = sorted(Scores, key=lambda x:x[0], reverse=True)
for row in Scores:
    print (row)
# Get columns to keep and create new dataframe with those only
cols = selector.get_support(indices=True)
data_SKB = data.iloc[:,cols]
data_SKB.columns.symmetric_difference(data.columns)

[inf, 'y']
[inf, 'Acc_100_Pre_100_Rec_100']
[901276.0, 'Acc_97_Pre_95_Rec_100']
[nan, 'Acc_50_Pre_50_Rec_100']
[898749.0, 'Acc_97_Pre_100_Rec_95']
[453414.0, 'Acc_95_Pre_90_Rec_100']
[436941.0, 'Acc_95_Pre_100_Rec_90']
[420592.0, 'Acc_95_Pre_95_Rec_95']
[419563.0, 'Acc_94_Pre_90_Rec_100']
[287103.0, 'Acc_92_Pre_100_Rec_85']
[233699.0, 'Acc_91_Pre_85_Rec_100']
[201050.0, 'Acc_90_Pre_83_Rec_100']
[197570.0, 'Acc_90_Pre_100_Rec_80']
[174347.0, 'Acc_90_Pre_90_Rec_90']
[150849.0, 'Acc_87_Pre_100_Rec_75']
[150591.0, 'Acc_87_Pre_80_Rec_100']
[116620.0, 'Acc_85_Pre_100_Rec_70']
[116033.0, 'Acc_85_Pre_76_Rec_100']
[101110.0, 'Acc_83_Pre_75_Rec_100']
[95690.0, 'Acc_85_Pre_85_Rec_85']
[91979.0, 'Acc_82_Pre_100_Rec_65']
[74556.0, 'Acc_80_Pre_71_Rec_100']
[74389.0, 'Acc_80_Pre_100_Rec_60']
[66991.0, 'Acc_78_Pre_70_Rec_100']
[61273.0, 'Acc_77_Pre_100_Rec_55']
[56003.0, 'Acc_80_Pre_80_Rec_80']
[50031.0, 'Acc_75_Pre_66_Rec_100']
[49434.0, 'Acc_75_Pre_100_Rec_50']
[43168.0, 'Acc_73_Pre_65_Rec_100']
[34

  f = msb / msw
  f = msb / msw


Index([], dtype='object')

In [17]:
# Create and fit selector
selector = SelectKBest(chi2, k='all')
selector.fit(data, data['y'])
scores = selector.scores_
Scores = []
for i, feature in enumerate(data):
    Scores.append([round(scores[i],0), feature])
Scores = sorted(Scores, key=lambda x:x[0], reverse=True)
for row in Scores:
    print (row)
# Get columns to keep and create new dataframe with those only
cols = selector.get_support(indices=True)
data_SKB = data.iloc[:,cols]
data_SKB.columns.symmetric_difference(data.columns)

[50000.0, 'y']
[50000.0, 'Acc_100_Pre_100_Rec_100']
[47365.0, 'Acc_97_Pre_100_Rec_95']
[44866.0, 'Acc_95_Pre_100_Rec_90']
[42641.0, 'Acc_97_Pre_95_Rec_100']
[42584.0, 'Acc_92_Pre_100_Rec_85']
[40438.0, 'Acc_95_Pre_95_Rec_95']
[39902.0, 'Acc_90_Pre_100_Rec_80']
[37553.0, 'Acc_87_Pre_100_Rec_75']
[36897.0, 'Acc_95_Pre_90_Rec_100']
[36077.0, 'Acc_94_Pre_90_Rec_100']
[34996.0, 'Acc_85_Pre_100_Rec_70']
[32392.0, 'Acc_82_Pre_100_Rec_65']
[31881.0, 'Acc_90_Pre_90_Rec_90']
[29902.0, 'Acc_80_Pre_100_Rec_60']
[28845.0, 'Acc_91_Pre_85_Rec_100']
[27533.0, 'Acc_77_Pre_100_Rec_55']
[26741.0, 'Acc_90_Pre_83_Rec_100']
[24858.0, 'Acc_75_Pre_100_Rec_50']
[24422.0, 'Acc_85_Pre_85_Rec_85']
[22558.0, 'Acc_87_Pre_80_Rec_100']
[20254.0, 'Acc_70_Pre_100_Rec_40']
[18768.0, 'Acc_85_Pre_76_Rec_100']
[17927.0, 'Acc_80_Pre_80_Rec_80']
[16821.0, 'Acc_83_Pre_75_Rec_100']
[15188.0, 'Acc_65_Pre_100_Rec_30']
[12783.0, 'Acc_80_Pre_71_Rec_100']
[12607.0, 'Acc_75_Pre_75_Rec_75']
[11486.0, 'Acc_78_Pre_70_Rec_100']
[10060.0

Index([], dtype='object')

In [18]:
# Create and fit selector
selector = SelectKBest(mutual_info_classif, k='all')
selector.fit(data, data['y'])
scores = selector.scores_
Scores = []
for i, feature in enumerate(data):
    Scores.append([round(scores[i],0), feature])
Scores = sorted(Scores, key=lambda x:x[0], reverse=True)
for row in Scores:
    print (row)
# Get columns to keep and create new dataframe with those only
cols = selector.get_support(indices=True)
data_SKB = data.iloc[:,cols]
data_SKB.columns.symmetric_difference(data.columns)

[1.0, 'y']
[1.0, 'Acc_100_Pre_100_Rec_100']
[1.0, 'Acc_97_Pre_95_Rec_100']
[1.0, 'Acc_94_Pre_90_Rec_100']
[1.0, 'Acc_97_Pre_100_Rec_95']
[1.0, 'Acc_95_Pre_100_Rec_90']
[1.0, 'Acc_95_Pre_90_Rec_100']
[0.0, 'Acc_95_Pre_95_Rec_95']
[0.0, 'Acc_90_Pre_90_Rec_90']
[0.0, 'Acc_85_Pre_85_Rec_85']
[0.0, 'Acc_80_Pre_80_Rec_80']
[0.0, 'Acc_75_Pre_75_Rec_75']
[0.0, 'Acc_70_Pre_70_Rec_70']
[0.0, 'Acc_65_Pre_65_Rec_65']
[0.0, 'Acc_60_Pre_60_Rec_60']
[0.0, 'Acc_55_Pre_55_Rec_55']
[0.0, 'Acc_50_Pre_50_Rec_50']
[0.0, 'Acc_91_Pre_85_Rec_100']
[0.0, 'Acc_87_Pre_80_Rec_100']
[0.0, 'Acc_83_Pre_75_Rec_100']
[0.0, 'Acc_78_Pre_70_Rec_100']
[0.0, 'Acc_73_Pre_65_Rec_100']
[0.0, 'Acc_66_Pre_60_Rec_100']
[0.0, 'Acc_59_Pre_55_Rec_100']
[0.0, 'Acc_50_Pre_50_Rec_100']
[0.0, 'Acc_92_Pre_100_Rec_85']
[0.0, 'Acc_90_Pre_100_Rec_80']
[0.0, 'Acc_87_Pre_100_Rec_75']
[0.0, 'Acc_85_Pre_100_Rec_70']
[0.0, 'Acc_82_Pre_100_Rec_65']
[0.0, 'Acc_80_Pre_100_Rec_60']
[0.0, 'Acc_77_Pre_100_Rec_55']
[0.0, 'Acc_75_Pre_100_Rec_50']
[0.0,

Index([], dtype='object')