# Import Libraries

In [20]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Create y-values

In [2]:
n = 1000000
A = [int(x/(n/2)) for x in range (n)]
D = {'y':A}
data = pd.DataFrame(D).astype(bool)
data

Unnamed: 0,y
0,False
1,False
2,False
3,False
4,False
...,...
999995,True
999996,True
999997,True
999998,True


# Create x-features with Decreasing Levels of Accuracy, Keeping Recall and Precision Balanced

In [3]:
for x in range (100, 45, -5):
    s = 'Acc_' + str(x) + '_Pre_' + str(x) + '_Rec_' + str(x)
    noise = np.random.random(n)
    data[s] = np.where ( noise > x/100, np.logical_not(data['y']), data['y'] )
data

Unnamed: 0,y,Acc_100_Pre_100_Rec_100,Acc_95_Pre_95_Rec_95,Acc_90_Pre_90_Rec_90,Acc_85_Pre_85_Rec_85,Acc_80_Pre_80_Rec_80,Acc_75_Pre_75_Rec_75,Acc_70_Pre_70_Rec_70,Acc_65_Pre_65_Rec_65,Acc_60_Pre_60_Rec_60,Acc_55_Pre_55_Rec_55,Acc_50_Pre_50_Rec_50
0,False,False,False,False,True,False,False,False,False,True,True,False
1,False,False,False,False,True,False,True,False,False,False,False,False
2,False,False,False,False,False,False,False,False,True,False,True,True
3,False,False,True,False,False,True,False,True,False,False,False,False
4,False,False,False,False,True,False,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
999995,True,True,True,True,True,False,True,True,False,True,False,True
999996,True,True,True,True,True,True,True,True,True,True,True,True
999997,True,True,True,False,True,True,True,False,True,False,False,True
999998,True,True,True,True,True,True,False,True,True,False,False,False


In [4]:
for x in range (100, 45, -5):
    s = 'Acc_' + str(x) + '_Pre_' + str(x) + '_Rec_' + str(x)
    C = confusion_matrix(data['y'],data[s])
    acc = round(accuracy_score(data['y'], data[s])*100,2)
    rec = round(recall_score(data['y'], data[s])*100,2)
    pre = round(precision_score(data['y'], data[s])*100,2)

    print (s)
    print (C/n*100)
    print(acc, rec, pre)
    print ()

Acc_100_Pre_100_Rec_100
[[50.  0.]
 [ 0. 50.]]
100.0 100.0 100.0

Acc_95_Pre_95_Rec_95
[[47.4999  2.5001]
 [ 2.5238 47.4762]]
94.98 94.95 95.0

Acc_90_Pre_90_Rec_90
[[44.9859  5.0141]
 [ 4.9993 45.0007]]
89.99 90.0 89.97

Acc_85_Pre_85_Rec_85
[[42.519   7.481 ]
 [ 7.4934 42.5066]]
85.03 85.01 85.03

Acc_80_Pre_80_Rec_80
[[40.016   9.984 ]
 [ 9.9718 40.0282]]
80.04 80.06 80.04

Acc_75_Pre_75_Rec_75
[[37.5553 12.4447]
 [12.5344 37.4656]]
75.02 74.93 75.07

Acc_70_Pre_70_Rec_70
[[34.9819 15.0181]
 [14.9525 35.0475]]
70.03 70.1 70.0

Acc_65_Pre_65_Rec_65
[[32.4585 17.5415]
 [17.5367 32.4633]]
64.92 64.93 64.92

Acc_60_Pre_60_Rec_60
[[30.0286 19.9714]
 [19.9974 30.0026]]
60.03 60.01 60.04

Acc_55_Pre_55_Rec_55
[[27.5358 22.4642]
 [22.4869 27.5131]]
55.05 55.03 55.05

Acc_50_Pre_50_Rec_50
[[24.9244 25.0756]
 [24.9755 25.0245]]
49.95 50.05 49.95



# Create x-Features with Perfect Recall but Decreasing Precision by Steps

- We start with a feature $y$ with $n$ rows, half True and half False.
- We have a list, $noise$, of random numbers in $[0,1)$.  
- We create a new feature for each value of $x \in \{100,90,\dots, 50\}$ that is $y$, except negating the values if $noise>\frac{2x-100}{x}$ and $y$ is False.
- The confusion matrix (as percentages) for $y$ on itself is:
$$
\left[
\begin{matrix}
    50 & 0 \cr
    0 & 50 \cr
\end{matrix}
\right]
$$
- The confusion martrix for the new feature is:
$$
\left[
\begin{matrix}
    \displaystyle 50 \times \frac{2x-100}{x} & \displaystyle 50\left(1 - \frac{2x-100}{x}\right) \cr
    0 & 50 \cr
\end{matrix}
\right]
$$
- Accuracy is now:
$$\frac{3x-100}{2x}$$
- Precision is now:
$$\frac{x}{100}$$
- And Recall is still 100\%.


In [5]:
for x in range (100, 45, -5):
    a = int((300*x-10000)/(2*x))
    p = int(x)
    r = 100
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    noise = np.random.random(n)
    data[s] = np.where ( (noise > (2*x-100)/x ) & (data['y']==False), np.logical_not(data['y']), data['y'] )
data

100 100 100 100
95 97 95 100
90 94 90 100
85 91 85 100
80 87 80 100
75 83 75 100
70 78 70 100
65 73 65 100
60 66 60 100
55 59 55 100
50 50 50 100


Unnamed: 0,y,Acc_100_Pre_100_Rec_100,Acc_95_Pre_95_Rec_95,Acc_90_Pre_90_Rec_90,Acc_85_Pre_85_Rec_85,Acc_80_Pre_80_Rec_80,Acc_75_Pre_75_Rec_75,Acc_70_Pre_70_Rec_70,Acc_65_Pre_65_Rec_65,Acc_60_Pre_60_Rec_60,...,Acc_97_Pre_95_Rec_100,Acc_94_Pre_90_Rec_100,Acc_91_Pre_85_Rec_100,Acc_87_Pre_80_Rec_100,Acc_83_Pre_75_Rec_100,Acc_78_Pre_70_Rec_100,Acc_73_Pre_65_Rec_100,Acc_66_Pre_60_Rec_100,Acc_59_Pre_55_Rec_100,Acc_50_Pre_50_Rec_100
0,False,False,False,False,True,False,False,False,False,True,...,False,True,False,False,True,False,True,False,True,True
1,False,False,False,False,True,False,True,False,False,False,...,False,True,False,False,False,False,False,True,False,True
2,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,True,True
3,False,False,True,False,False,True,False,True,False,False,...,False,False,False,True,False,False,True,True,False,True
4,False,False,False,False,True,False,False,False,True,True,...,False,False,False,False,False,False,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,True,True,True,True,True,False,True,True,False,True,...,True,True,True,True,True,True,True,True,True,True
999996,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
999997,True,True,True,False,True,True,True,False,True,False,...,True,True,True,True,True,True,True,True,True,True
999998,True,True,True,True,True,True,False,True,True,False,...,True,True,True,True,True,True,True,True,True,True


In [6]:
for x in range (100, 45, -5):
    a = int((300*x-10000)/(2*x))
    p = int(x)
    r = 100
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    C = confusion_matrix(data['y'],data[s])
    acc = round(accuracy_score(data['y'], data[s])*100,2)
    rec = round(recall_score(data['y'], data[s])*100,2)
    pre = round(precision_score(data['y'], data[s])*100,2)

    print (s)
    print (C/n*100)
    print(acc, pre, rec)
    print ()

100 100 100 100
Acc_100_Pre_100_Rec_100
[[50.  0.]
 [ 0. 50.]]
100.0 100.0 100.0

95 97 95 100
Acc_97_Pre_95_Rec_100
[[47.3609  2.6391]
 [ 0.     50.    ]]
97.36 94.99 100.0

90 94 90 100
Acc_94_Pre_90_Rec_100
[[44.4476  5.5524]
 [ 0.     50.    ]]
94.45 90.01 100.0

85 91 85 100
Acc_91_Pre_85_Rec_100
[[41.1962  8.8038]
 [ 0.     50.    ]]
91.2 85.03 100.0

80 87 80 100
Acc_87_Pre_80_Rec_100
[[37.5205 12.4795]
 [ 0.     50.    ]]
87.52 80.03 100.0

75 83 75 100
Acc_83_Pre_75_Rec_100
[[33.3501 16.6499]
 [ 0.     50.    ]]
83.35 75.02 100.0

70 78 70 100
Acc_78_Pre_70_Rec_100
[[28.5964 21.4036]
 [ 0.     50.    ]]
78.6 70.02 100.0

65 73 65 100
Acc_73_Pre_65_Rec_100
[[23.1331 26.8669]
 [ 0.     50.    ]]
73.13 65.05 100.0

60 66 60 100
Acc_66_Pre_60_Rec_100
[[16.6714 33.3286]
 [ 0.     50.    ]]
66.67 60.0 100.0

55 59 55 100
Acc_59_Pre_55_Rec_100
[[ 9.0986 40.9014]
 [ 0.     50.    ]]
59.1 55.0 100.0

50 50 50 100
Acc_50_Pre_50_Rec_100
[[ 0. 50.]
 [ 0. 50.]]
50.0 50.0 100.0



# Create x-Features with Perfect Precision but Decreasing Recall by Steps

- We start with a feature $y$ with $n$ rows, half True and half False.
- We have a list, $noise$, of random numbers in $[0,1)$.  
- We create a new feature for each value of $x \in \{100,90,\dots, 50\}$ that is $y$, except negating the values if $noise>x/100$ and $y$ is True.
- The confusion matrix (as percentages) for $y$ on itself is:
$$
\left[
\begin{matrix}
    50 & 0 \cr
    0 & 50 \cr
\end{matrix}
\right]
$$
- The confusion martrix for the new feature is:
$$
\left[
\begin{matrix}
    \displaystyle 50 & 0 \cr
    \displaystyle 50\left(1 - \frac{x}{100}\right) & \displaystyle 50 \times \frac{x}{100} \cr
\end{matrix}
\right]
$$
- Accuracy is now:
$$\frac{100-x}{200}$$
- Precision is still 100\%.
- And Recall is now:
$$\frac{x}{100}$$



In [7]:
for x in range (100, 45, -5):
    a = int((100+x)/2)
    p = 100
    r = int(x)
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    noise = np.random.random(n)
    data[s] = np.where ( (noise > x/100) & (data['y']==True), np.logical_not(data['y']), data['y'] )
data

100 100 100 100
95 97 100 95
90 95 100 90
85 92 100 85
80 90 100 80
75 87 100 75
70 85 100 70
65 82 100 65
60 80 100 60
55 77 100 55
50 75 100 50


Unnamed: 0,y,Acc_100_Pre_100_Rec_100,Acc_95_Pre_95_Rec_95,Acc_90_Pre_90_Rec_90,Acc_85_Pre_85_Rec_85,Acc_80_Pre_80_Rec_80,Acc_75_Pre_75_Rec_75,Acc_70_Pre_70_Rec_70,Acc_65_Pre_65_Rec_65,Acc_60_Pre_60_Rec_60,...,Acc_97_Pre_100_Rec_95,Acc_95_Pre_100_Rec_90,Acc_92_Pre_100_Rec_85,Acc_90_Pre_100_Rec_80,Acc_87_Pre_100_Rec_75,Acc_85_Pre_100_Rec_70,Acc_82_Pre_100_Rec_65,Acc_80_Pre_100_Rec_60,Acc_77_Pre_100_Rec_55,Acc_75_Pre_100_Rec_50
0,False,False,False,False,True,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,True,False,False,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,True,False,False,False,True,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,True,True,True,True,True,False,True,True,False,True,...,True,True,True,True,False,True,True,True,True,True
999996,True,True,True,True,True,True,True,True,True,True,...,True,False,True,True,True,True,False,False,False,False
999997,True,True,True,False,True,True,True,False,True,False,...,True,True,False,True,True,True,True,False,False,True
999998,True,True,True,True,True,True,False,True,True,False,...,True,True,True,True,False,True,True,False,True,False


In [8]:
for x in range (100, 45, -5):
    a = int((100+x)/2)
    p = 100
    r = int(x)
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    C = confusion_matrix(data['y'],data[s])
    acc = round(accuracy_score(data['y'], data[s])*100,2)
    rec = round(recall_score(data['y'], data[s])*100,2)
    pre = round(precision_score(data['y'], data[s])*100,2)

    print (s)
    print (C/n*100)
    print(acc, pre, rec)
    print ()

100 100 100 100
Acc_100_Pre_100_Rec_100
[[50.  0.]
 [ 0. 50.]]
100.0 100.0 100.0

95 97 100 95
Acc_97_Pre_100_Rec_95
[[50.      0.    ]
 [ 2.4938 47.5062]]
97.51 100.0 95.01

90 95 100 90
Acc_95_Pre_100_Rec_90
[[50.      0.    ]
 [ 4.9778 45.0222]]
95.02 100.0 90.04

85 92 100 85
Acc_92_Pre_100_Rec_85
[[50.      0.    ]
 [ 7.4685 42.5315]]
92.53 100.0 85.06

80 90 100 80
Acc_90_Pre_100_Rec_80
[[50.      0.    ]
 [ 9.9881 40.0119]]
90.01 100.0 80.02

75 87 100 75
Acc_87_Pre_100_Rec_75
[[50.      0.    ]
 [12.4926 37.5074]]
87.51 100.0 75.01

70 85 100 70
Acc_85_Pre_100_Rec_70
[[50.      0.    ]
 [15.0423 34.9577]]
84.96 100.0 69.92

65 82 100 65
Acc_82_Pre_100_Rec_65
[[50.      0.    ]
 [17.5041 32.4959]]
82.5 100.0 64.99

60 80 100 60
Acc_80_Pre_100_Rec_60
[[50.      0.    ]
 [19.9986 30.0014]]
80.0 100.0 60.0

55 77 100 55
Acc_77_Pre_100_Rec_55
[[50.      0.    ]
 [22.4957 27.5043]]
77.5 100.0 55.01

50 75 100 50
Acc_75_Pre_100_Rec_50
[[50.      0.    ]
 [25.0045 24.9955]]
75.0 100.0 

# Create x-Features with Perfect Recall but Decreasing Accuracy by Steps

- We start with a feature $y$ with $n$ rows, half True and half False.
- We have a list, $noise$, of random numbers in $[0,1)$.  
- We create a new feature for each value of $x \in \{100,90,\dots, 50\}$ that is $y$, except negating the values if $noise>(2x-100)/100$ and $y$ is False.
- The confusion matrix (as percentages) for $y$ on itself is:
$$
\left[
\begin{matrix}
    50 & 0 \cr
    0 & 50 \cr
\end{matrix}
\right]
$$
- The confusion martrix for the new feature is:
$$
\left[
\begin{matrix}
    \displaystyle 50\times \frac{2x-100}{100} & \displaystyle 50 \left( 1 -  \frac{2x-100}{100}\right) \cr
    0 & \displaystyle 50  \cr
\end{matrix}
\right]
$$
- Accuracy is now:
$$\frac{x}{100}$$
- Precision is now:
$$\frac{50}{150-x}$$
- And Recall is still 100\%.




In [9]:
for x in range (100, 45, -5):
    a = int(x)
    p = int(5000/(150-x))
    r = 100
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    noise = np.random.random(n)
    data[s] = np.where ( (noise > (2*x-100)/100 ) & (data['y']==False), np.logical_not(data['y']), data['y'] )
data

100 100 100 100
95 95 90 100
90 90 83 100
85 85 76 100
80 80 71 100
75 75 66 100
70 70 62 100
65 65 58 100
60 60 55 100
55 55 52 100
50 50 50 100


Unnamed: 0,y,Acc_100_Pre_100_Rec_100,Acc_95_Pre_95_Rec_95,Acc_90_Pre_90_Rec_90,Acc_85_Pre_85_Rec_85,Acc_80_Pre_80_Rec_80,Acc_75_Pre_75_Rec_75,Acc_70_Pre_70_Rec_70,Acc_65_Pre_65_Rec_65,Acc_60_Pre_60_Rec_60,...,Acc_75_Pre_100_Rec_50,Acc_95_Pre_90_Rec_100,Acc_90_Pre_83_Rec_100,Acc_85_Pre_76_Rec_100,Acc_80_Pre_71_Rec_100,Acc_75_Pre_66_Rec_100,Acc_70_Pre_62_Rec_100,Acc_65_Pre_58_Rec_100,Acc_60_Pre_55_Rec_100,Acc_55_Pre_52_Rec_100
0,False,False,False,False,True,False,False,False,False,True,...,False,False,False,False,True,True,True,True,False,True
1,False,False,False,False,True,False,True,False,False,False,...,False,False,True,False,True,False,True,False,True,True
2,False,False,False,False,False,False,False,False,True,False,...,False,False,False,True,True,True,False,False,False,True
3,False,False,True,False,False,True,False,True,False,False,...,False,False,True,False,False,False,False,True,True,False
4,False,False,False,False,True,False,False,False,True,True,...,False,False,False,False,False,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,True,True,True,True,True,False,True,True,False,True,...,True,True,True,True,True,True,True,True,True,True
999996,True,True,True,True,True,True,True,True,True,True,...,False,True,True,True,True,True,True,True,True,True
999997,True,True,True,False,True,True,True,False,True,False,...,True,True,True,True,True,True,True,True,True,True
999998,True,True,True,True,True,True,False,True,True,False,...,False,True,True,True,True,True,True,True,True,True


In [10]:
for x in range (100, 45, -5):
    a = int(x)
    p = int(5000/(150-x))
    r = 100
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    C = confusion_matrix(data['y'],data[s])
    acc = round(accuracy_score(data['y'], data[s])*100,2)
    rec = round(recall_score(data['y'], data[s])*100,2)
    pre = round(precision_score(data['y'], data[s])*100,2)

    print (s)
    print (C/n*100)
    print(acc, pre, rec)
    print ()

100 100 100 100
Acc_100_Pre_100_Rec_100
[[50.  0.]
 [ 0. 50.]]
100.0 100.0 100.0

95 95 90 100
Acc_95_Pre_90_Rec_100
[[44.999  5.001]
 [ 0.    50.   ]]
95.0 90.91 100.0

90 90 83 100
Acc_90_Pre_83_Rec_100
[[39.988 10.012]
 [ 0.    50.   ]]
89.99 83.32 100.0

85 85 76 100
Acc_85_Pre_76_Rec_100
[[35.0439 14.9561]
 [ 0.     50.    ]]
85.04 76.98 100.0

80 80 71 100
Acc_80_Pre_71_Rec_100
[[29.9985 20.0015]
 [ 0.     50.    ]]
80.0 71.43 100.0

75 75 66 100
Acc_75_Pre_66_Rec_100
[[25.0069 24.9931]
 [ 0.     50.    ]]
75.01 66.67 100.0

70 70 62 100
Acc_70_Pre_62_Rec_100
[[19.9794 30.0206]
 [ 0.     50.    ]]
69.98 62.48 100.0

65 65 58 100
Acc_65_Pre_58_Rec_100
[[14.9423 35.0577]
 [ 0.     50.    ]]
64.94 58.78 100.0

60 60 55 100
Acc_60_Pre_55_Rec_100
[[10.0183 39.9817]
 [ 0.     50.    ]]
60.02 55.57 100.0

55 55 52 100
Acc_55_Pre_52_Rec_100
[[ 4.9823 45.0177]
 [ 0.     50.    ]]
54.98 52.62 100.0

50 50 50 100
Acc_50_Pre_50_Rec_100
[[ 0. 50.]
 [ 0. 50.]]
50.0 50.0 100.0



# Create x-Features with Perfect Precision but Decreasing Accuracy by Steps

- We start with a feature $y$ with $n$ rows, half True and half False.
- We have a list, $noise$, of random numbers in $[0,1)$.  
- We create a new feature for each value of $x \in \{100,90,\dots, 50\}$ that is $y$, except negating the values if $noise>(2x-100)/100$ and $y$ is True.
- The confusion matrix (as percentages) for $y$ on itself is:
$$
\left[
\begin{matrix}
    50 & 0 \cr
    0 & 50 \cr
\end{matrix}
\right]
$$
- The confusion martrix for the new feature is:
$$
\left[
\begin{matrix}
    50 & 0 \cr
    \displaystyle 50 \left( 1 -  \frac{2x-100}{100}\right) & \displaystyle 50\times \frac{2x-100}{100}  \cr
\end{matrix}
\right]
$$
- Accuracy is now:
$$\frac{x}{100}$$
- Precision is still 100%.
- And Recall is now:
$$\frac{2x-100}{100} = \frac{x-50}{50}$$





In [11]:
for x in range (100, 50, -5):
    a = int(x)
    p = 100
    r = int(100*(x-50)/50)
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    noise = np.random.random(n)
    data[s] = np.where ( (noise > (2*x-100)/100 ) & (data['y']==True), np.logical_not(data['y']), data['y'] )
data

100 100 100 100
95 95 100 90
90 90 100 80
85 85 100 70
80 80 100 60
75 75 100 50
70 70 100 40
65 65 100 30
60 60 100 20
55 55 100 10


Unnamed: 0,y,Acc_100_Pre_100_Rec_100,Acc_95_Pre_95_Rec_95,Acc_90_Pre_90_Rec_90,Acc_85_Pre_85_Rec_85,Acc_80_Pre_80_Rec_80,Acc_75_Pre_75_Rec_75,Acc_70_Pre_70_Rec_70,Acc_65_Pre_65_Rec_65,Acc_60_Pre_60_Rec_60,...,Acc_80_Pre_71_Rec_100,Acc_75_Pre_66_Rec_100,Acc_70_Pre_62_Rec_100,Acc_65_Pre_58_Rec_100,Acc_60_Pre_55_Rec_100,Acc_55_Pre_52_Rec_100,Acc_70_Pre_100_Rec_40,Acc_65_Pre_100_Rec_30,Acc_60_Pre_100_Rec_20,Acc_55_Pre_100_Rec_10
0,False,False,False,False,True,False,False,False,False,True,...,True,True,True,True,False,True,False,False,False,False
1,False,False,False,False,True,False,True,False,False,False,...,True,False,True,False,True,True,False,False,False,False
2,False,False,False,False,False,False,False,False,True,False,...,True,True,False,False,False,True,False,False,False,False
3,False,False,True,False,False,True,False,True,False,False,...,False,False,False,True,True,False,False,False,False,False
4,False,False,False,False,True,False,False,False,True,True,...,False,True,True,True,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,True,True,True,True,True,False,True,True,False,True,...,True,True,True,True,True,True,False,True,False,False
999996,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,False,False,True,False
999997,True,True,True,False,True,True,True,False,True,False,...,True,True,True,True,True,True,True,False,False,False
999998,True,True,True,True,True,True,False,True,True,False,...,True,True,True,True,True,True,True,True,True,False


In [12]:
for x in range (100, 50, -5):
    a = int(x)
    p = 100
    r = int(100*(x-50)/50)
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    C = confusion_matrix(data['y'],data[s])
    acc = round(accuracy_score(data['y'], data[s])*100,2)
    rec = round(recall_score(data['y'], data[s])*100,2)
    pre = round(precision_score(data['y'], data[s])*100,2)

    print (s)
    print (C/n*100)
    print(acc, pre, rec)
    print ()

100 100 100 100
Acc_100_Pre_100_Rec_100
[[50.  0.]
 [ 0. 50.]]
100.0 100.0 100.0

95 95 100 90
Acc_95_Pre_100_Rec_90
[[50.      0.    ]
 [ 5.0254 44.9746]]
94.97 100.0 89.95

90 90 100 80
Acc_90_Pre_100_Rec_80
[[50.      0.    ]
 [10.0302 39.9698]]
89.97 100.0 79.94

85 85 100 70
Acc_85_Pre_100_Rec_70
[[50.      0.    ]
 [14.9559 35.0441]]
85.04 100.0 70.09

80 80 100 60
Acc_80_Pre_100_Rec_60
[[50.      0.    ]
 [20.0091 29.9909]]
79.99 100.0 59.98

75 75 100 50
Acc_75_Pre_100_Rec_50
[[50.      0.    ]
 [25.0255 24.9745]]
74.97 100.0 49.95

70 70 100 40
Acc_70_Pre_100_Rec_40
[[50.      0.    ]
 [29.9786 20.0214]]
70.02 100.0 40.04

65 65 100 30
Acc_65_Pre_100_Rec_30
[[50.      0.    ]
 [35.0757 14.9243]]
64.92 100.0 29.85

60 60 100 20
Acc_60_Pre_100_Rec_20
[[50.      0.    ]
 [40.0011  9.9989]]
60.0 100.0 20.0

55 55 100 10
Acc_55_Pre_100_Rec_10
[[50.      0.    ]
 [44.9814  5.0186]]
55.02 100.0 10.04



In [13]:
for row in data:
    print (row)

y
Acc_100_Pre_100_Rec_100
Acc_95_Pre_95_Rec_95
Acc_90_Pre_90_Rec_90
Acc_85_Pre_85_Rec_85
Acc_80_Pre_80_Rec_80
Acc_75_Pre_75_Rec_75
Acc_70_Pre_70_Rec_70
Acc_65_Pre_65_Rec_65
Acc_60_Pre_60_Rec_60
Acc_55_Pre_55_Rec_55
Acc_50_Pre_50_Rec_50
Acc_97_Pre_95_Rec_100
Acc_94_Pre_90_Rec_100
Acc_91_Pre_85_Rec_100
Acc_87_Pre_80_Rec_100
Acc_83_Pre_75_Rec_100
Acc_78_Pre_70_Rec_100
Acc_73_Pre_65_Rec_100
Acc_66_Pre_60_Rec_100
Acc_59_Pre_55_Rec_100
Acc_50_Pre_50_Rec_100
Acc_97_Pre_100_Rec_95
Acc_95_Pre_100_Rec_90
Acc_92_Pre_100_Rec_85
Acc_90_Pre_100_Rec_80
Acc_87_Pre_100_Rec_75
Acc_85_Pre_100_Rec_70
Acc_82_Pre_100_Rec_65
Acc_80_Pre_100_Rec_60
Acc_77_Pre_100_Rec_55
Acc_75_Pre_100_Rec_50
Acc_95_Pre_90_Rec_100
Acc_90_Pre_83_Rec_100
Acc_85_Pre_76_Rec_100
Acc_80_Pre_71_Rec_100
Acc_75_Pre_66_Rec_100
Acc_70_Pre_62_Rec_100
Acc_65_Pre_58_Rec_100
Acc_60_Pre_55_Rec_100
Acc_55_Pre_52_Rec_100
Acc_70_Pre_100_Rec_40
Acc_65_Pre_100_Rec_30
Acc_60_Pre_100_Rec_20
Acc_55_Pre_100_Rec_10


# Feature Selection

## Variance Threshold


In [21]:
sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
sel.fit_transform(data)
data_VT = data[data.columns[sel.get_support(indices=True)]]
data_VT.columns.symmetric_difference(data.columns)

Index(['Acc_50_Pre_50_Rec_100', 'Acc_55_Pre_100_Rec_10',
       'Acc_55_Pre_52_Rec_100', 'Acc_59_Pre_55_Rec_100',
       'Acc_60_Pre_100_Rec_20'],
      dtype='object')

## SelectKBest

In [29]:
# Create and fit selector
selector = SelectKBest(chi2, k=35)
selector.fit(data, data['y'])
# Get columns to keep and create new dataframe with those only
cols = selector.get_support(indices=True)
data_SKB = data.iloc[:,cols]
data_SKB.columns.symmetric_difference(data.columns)

Index(['Acc_50_Pre_50_Rec_100', 'Acc_50_Pre_50_Rec_50',
       'Acc_55_Pre_52_Rec_100', 'Acc_55_Pre_55_Rec_55',
       'Acc_59_Pre_55_Rec_100', 'Acc_60_Pre_55_Rec_100',
       'Acc_60_Pre_60_Rec_60', 'Acc_65_Pre_58_Rec_100', 'Acc_65_Pre_65_Rec_65',
       'Acc_66_Pre_60_Rec_100'],
      dtype='object')