# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency


import random


from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif

# Create y-values

In [19]:
n = 1000
# If imbalance == 0.5, the set is balanced.
# if imbalance == 0.1, then the set is 90% zeroes and 10% ones.
imbalance = 0.5 
A = [int(x/(imbalance * n)) for x in range (n)] # Balanced Data Set
D = {'y':A}
data = pd.DataFrame(D).astype(bool)
data

Unnamed: 0,y
0,False
1,False
2,False
3,False
4,False
...,...
995,True
996,True
997,True
998,True


# Create x-features with Decreasing Levels of Accuracy, Keeping Recall and Precision Balanced

In [3]:
for x in range (100, 45, -5):
    s = 'Acc_' + str(x) + '_Pre_' + str(x) + '_Rec_' + str(x)
    noise = np.random.random(n)
    data[s] = np.where ( noise > x/100, np.logical_not(data['y']), data['y'] )
data

Unnamed: 0,y,Acc_100_Pre_100_Rec_100,Acc_95_Pre_95_Rec_95,Acc_90_Pre_90_Rec_90,Acc_85_Pre_85_Rec_85,Acc_80_Pre_80_Rec_80,Acc_75_Pre_75_Rec_75,Acc_70_Pre_70_Rec_70,Acc_65_Pre_65_Rec_65,Acc_60_Pre_60_Rec_60,Acc_55_Pre_55_Rec_55,Acc_50_Pre_50_Rec_50
0,False,False,False,False,False,False,False,False,False,False,True,True
1,False,False,False,False,True,False,False,True,False,True,False,True
2,False,False,False,False,False,True,False,False,False,True,True,False
3,False,False,False,False,False,True,False,True,True,True,False,False
4,False,False,False,True,False,False,True,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
995,True,True,True,True,False,True,False,False,True,False,False,False
996,True,True,False,True,True,False,False,True,True,False,True,True
997,True,True,True,True,True,True,True,True,False,False,False,False
998,True,True,True,True,True,True,False,False,False,False,True,False


In [4]:
for x in range (100, 45, -5):
    s = 'Acc_' + str(x) + '_Pre_' + str(x) + '_Rec_' + str(x)
    C = confusion_matrix(data['y'],data[s])
    acc = round(accuracy_score(data['y'], data[s])*100,2)
    rec = round(recall_score(data['y'], data[s])*100,2)
    pre = round(precision_score(data['y'], data[s])*100,2)

    print (s)
    print (C/n*100)
    print(acc, rec, pre)
    print ()

Acc_100_Pre_100_Rec_100
[[50.  0.]
 [ 0. 50.]]
100.0 100.0 100.0

Acc_95_Pre_95_Rec_95
[[47.9  2.1]
 [ 2.  48. ]]
95.9 96.0 95.81

Acc_90_Pre_90_Rec_90
[[44.9  5.1]
 [ 3.8 46.2]]
91.1 92.4 90.06

Acc_85_Pre_85_Rec_85
[[43.7  6.3]
 [ 7.4 42.6]]
86.3 85.2 87.12

Acc_80_Pre_80_Rec_80
[[41.4  8.6]
 [11.  39. ]]
80.4 78.0 81.93

Acc_75_Pre_75_Rec_75
[[36.5 13.5]
 [13.3 36.7]]
73.2 73.4 73.11

Acc_70_Pre_70_Rec_70
[[34.8 15.2]
 [15.  35. ]]
69.8 70.0 69.72

Acc_65_Pre_65_Rec_65
[[33.  17. ]
 [19.7 30.3]]
63.3 60.6 64.06

Acc_60_Pre_60_Rec_60
[[29.1 20.9]
 [17.6 32.4]]
61.5 64.8 60.79

Acc_55_Pre_55_Rec_55
[[27.4 22.6]
 [19.1 30.9]]
58.3 61.8 57.76

Acc_50_Pre_50_Rec_50
[[23.4 26.6]
 [24.6 25.4]]
48.8 50.8 48.85



# Create x-Features with Perfect Recall but Decreasing Precision by Steps

- We start with a feature $y$ with $n$ rows, half True and half False.
- We have a list, $noise$, of random numbers in $[0,1)$.  
- We create a new feature for each value of $x \in \{100,90,\dots, 50\}$ that is $y$, except negating the values if $noise>\frac{2x-100}{x}$ and $y$ is False.
- The confusion matrix (as percentages) for $y$ on itself is:
$$
\left[
\begin{matrix}
    50 & 0 \cr
    0 & 50 \cr
\end{matrix}
\right]
$$
- The confusion martrix for the new feature is:
$$
\left[
\begin{matrix}
    \displaystyle 50 \times \frac{2x-100}{x} & \displaystyle 50\left(1 - \frac{2x-100}{x}\right) \cr
    0 & 50 \cr
\end{matrix}
\right]
$$
- Accuracy is now:
$$\frac{3x-100}{2x}$$
- Precision is now:
$$\frac{x}{100}$$
- And Recall is still 100\%.


In [5]:
for x in range (100, 45, -5):
    a = int((300*x-10000)/(2*x))
    p = int(x)
    r = 100
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    noise = np.random.random(n)
    data[s] = np.where ( (noise > (2*x-100)/x ) & (data['y']==False), np.logical_not(data['y']), data['y'] )
data

100 100 100 100
95 97 95 100
90 94 90 100
85 91 85 100
80 87 80 100
75 83 75 100
70 78 70 100
65 73 65 100
60 66 60 100
55 59 55 100
50 50 50 100


Unnamed: 0,y,Acc_100_Pre_100_Rec_100,Acc_95_Pre_95_Rec_95,Acc_90_Pre_90_Rec_90,Acc_85_Pre_85_Rec_85,Acc_80_Pre_80_Rec_80,Acc_75_Pre_75_Rec_75,Acc_70_Pre_70_Rec_70,Acc_65_Pre_65_Rec_65,Acc_60_Pre_60_Rec_60,...,Acc_97_Pre_95_Rec_100,Acc_94_Pre_90_Rec_100,Acc_91_Pre_85_Rec_100,Acc_87_Pre_80_Rec_100,Acc_83_Pre_75_Rec_100,Acc_78_Pre_70_Rec_100,Acc_73_Pre_65_Rec_100,Acc_66_Pre_60_Rec_100,Acc_59_Pre_55_Rec_100,Acc_50_Pre_50_Rec_100
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
1,False,False,False,False,True,False,False,True,False,True,...,False,False,False,False,False,True,True,True,True,True
2,False,False,False,False,False,True,False,False,False,True,...,True,True,True,False,False,True,True,True,True,True
3,False,False,False,False,False,True,False,True,True,True,...,False,True,True,False,True,False,False,False,True,True
4,False,False,False,True,False,False,True,False,False,True,...,False,False,False,False,False,True,True,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,True,True,True,True,False,True,False,False,True,False,...,True,True,True,True,True,True,True,True,True,True
996,True,True,False,True,True,False,False,True,True,False,...,True,True,True,True,True,True,True,True,True,True
997,True,True,True,True,True,True,True,True,False,False,...,True,True,True,True,True,True,True,True,True,True
998,True,True,True,True,True,True,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True


In [6]:
for x in range (100, 45, -5):
    a = int((300*x-10000)/(2*x))
    p = int(x)
    r = 100
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    C = confusion_matrix(data['y'],data[s])
    acc = round(accuracy_score(data['y'], data[s])*100,2)
    rec = round(recall_score(data['y'], data[s])*100,2)
    pre = round(precision_score(data['y'], data[s])*100,2)

    print (s)
    print (C/n*100)
    print(acc, pre, rec)
    print ()

100 100 100 100
Acc_100_Pre_100_Rec_100
[[50.  0.]
 [ 0. 50.]]
100.0 100.0 100.0

95 97 95 100
Acc_97_Pre_95_Rec_100
[[47.2  2.8]
 [ 0.  50. ]]
97.2 94.7 100.0

90 94 90 100
Acc_94_Pre_90_Rec_100
[[45.3  4.7]
 [ 0.  50. ]]
95.3 91.41 100.0

85 91 85 100
Acc_91_Pre_85_Rec_100
[[40.8  9.2]
 [ 0.  50. ]]
90.8 84.46 100.0

80 87 80 100
Acc_87_Pre_80_Rec_100
[[36.2 13.8]
 [ 0.  50. ]]
86.2 78.37 100.0

75 83 75 100
Acc_83_Pre_75_Rec_100
[[33. 17.]
 [ 0. 50.]]
83.0 74.63 100.0

70 78 70 100
Acc_78_Pre_70_Rec_100
[[29.2 20.8]
 [ 0.  50. ]]
79.2 70.62 100.0

65 73 65 100
Acc_73_Pre_65_Rec_100
[[20.9 29.1]
 [ 0.  50. ]]
70.9 63.21 100.0

60 66 60 100
Acc_66_Pre_60_Rec_100
[[15.9 34.1]
 [ 0.  50. ]]
65.9 59.45 100.0

55 59 55 100
Acc_59_Pre_55_Rec_100
[[10.9 39.1]
 [ 0.  50. ]]
60.9 56.12 100.0

50 50 50 100
Acc_50_Pre_50_Rec_100
[[ 0. 50.]
 [ 0. 50.]]
50.0 50.0 100.0



# Create x-Features with Perfect Precision but Decreasing Recall by Steps

- We start with a feature $y$ with $n$ rows, half True and half False.
- We have a list, $noise$, of random numbers in $[0,1)$.  
- We create a new feature for each value of $x \in \{100,90,\dots, 50\}$ that is $y$, except negating the values if $noise>x/100$ and $y$ is True.
- The confusion matrix (as percentages) for $y$ on itself is:
$$
\left[
\begin{matrix}
    50 & 0 \cr
    0 & 50 \cr
\end{matrix}
\right]
$$
- The confusion martrix for the new feature is:
$$
\left[
\begin{matrix}
    \displaystyle 50 & 0 \cr
    \displaystyle 50\left(1 - \frac{x}{100}\right) & \displaystyle 50 \times \frac{x}{100} \cr
\end{matrix}
\right]
$$
- Accuracy is now:
$$\frac{100-x}{200}$$
- Precision is still 100\%.
- And Recall is now:
$$\frac{x}{100}$$



In [7]:
for x in range (100, 45, -5):
    a = int((100+x)/2)
    p = 100
    r = int(x)
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    noise = np.random.random(n)
    data[s] = np.where ( (noise > x/100) & (data['y']==True), np.logical_not(data['y']), data['y'] )
data

100 100 100 100
95 97 100 95
90 95 100 90
85 92 100 85
80 90 100 80
75 87 100 75
70 85 100 70
65 82 100 65
60 80 100 60
55 77 100 55
50 75 100 50


Unnamed: 0,y,Acc_100_Pre_100_Rec_100,Acc_95_Pre_95_Rec_95,Acc_90_Pre_90_Rec_90,Acc_85_Pre_85_Rec_85,Acc_80_Pre_80_Rec_80,Acc_75_Pre_75_Rec_75,Acc_70_Pre_70_Rec_70,Acc_65_Pre_65_Rec_65,Acc_60_Pre_60_Rec_60,...,Acc_97_Pre_100_Rec_95,Acc_95_Pre_100_Rec_90,Acc_92_Pre_100_Rec_85,Acc_90_Pre_100_Rec_80,Acc_87_Pre_100_Rec_75,Acc_85_Pre_100_Rec_70,Acc_82_Pre_100_Rec_65,Acc_80_Pre_100_Rec_60,Acc_77_Pre_100_Rec_55,Acc_75_Pre_100_Rec_50
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,True,False,False,True,False,True,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,True,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,True,False,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,True,True,True,True,False,True,False,False,True,False,...,True,True,True,True,True,False,True,True,True,True
996,True,True,False,True,True,False,False,True,True,False,...,True,True,True,True,True,True,True,True,False,False
997,True,True,True,True,True,True,True,True,False,False,...,True,True,True,True,False,False,True,True,False,True
998,True,True,True,True,True,True,False,False,False,False,...,True,True,True,True,True,True,True,True,False,False


In [8]:
for x in range (100, 45, -5):
    a = int((100+x)/2)
    p = 100
    r = int(x)
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    C = confusion_matrix(data['y'],data[s])
    acc = round(accuracy_score(data['y'], data[s])*100,2)
    rec = round(recall_score(data['y'], data[s])*100,2)
    pre = round(precision_score(data['y'], data[s])*100,2)

    print (s)
    print (C/n*100)
    print(acc, pre, rec)
    print ()

100 100 100 100
Acc_100_Pre_100_Rec_100
[[50.  0.]
 [ 0. 50.]]
100.0 100.0 100.0

95 97 100 95
Acc_97_Pre_100_Rec_95
[[50.   0. ]
 [ 2.6 47.4]]
97.4 100.0 94.8

90 95 100 90
Acc_95_Pre_100_Rec_90
[[50.  0.]
 [ 4. 46.]]
96.0 100.0 92.0

85 92 100 85
Acc_92_Pre_100_Rec_85
[[50.   0. ]
 [ 7.3 42.7]]
92.7 100.0 85.4

80 90 100 80
Acc_90_Pre_100_Rec_80
[[50.   0. ]
 [ 9.6 40.4]]
90.4 100.0 80.8

75 87 100 75
Acc_87_Pre_100_Rec_75
[[50.   0. ]
 [13.3 36.7]]
86.7 100.0 73.4

70 85 100 70
Acc_85_Pre_100_Rec_70
[[50.   0. ]
 [15.8 34.2]]
84.2 100.0 68.4

65 82 100 65
Acc_82_Pre_100_Rec_65
[[50.   0. ]
 [18.8 31.2]]
81.2 100.0 62.4

60 80 100 60
Acc_80_Pre_100_Rec_60
[[50.   0. ]
 [18.6 31.4]]
81.4 100.0 62.8

55 77 100 55
Acc_77_Pre_100_Rec_55
[[50.   0. ]
 [23.1 26.9]]
76.9 100.0 53.8

50 75 100 50
Acc_75_Pre_100_Rec_50
[[50.   0. ]
 [23.8 26.2]]
76.2 100.0 52.4



# Create x-Features with Perfect Recall but Decreasing Accuracy by Steps

- We start with a feature $y$ with $n$ rows, half True and half False.
- We have a list, $noise$, of random numbers in $[0,1)$.  
- We create a new feature for each value of $x \in \{100,90,\dots, 50\}$ that is $y$, except negating the values if $noise>(2x-100)/100$ and $y$ is False.
- The confusion matrix (as percentages) for $y$ on itself is:
$$
\left[
\begin{matrix}
    50 & 0 \cr
    0 & 50 \cr
\end{matrix}
\right]
$$
- The confusion martrix for the new feature is:
$$
\left[
\begin{matrix}
    \displaystyle 50\times \frac{2x-100}{100} & \displaystyle 50 \left( 1 -  \frac{2x-100}{100}\right) \cr
    0 & \displaystyle 50  \cr
\end{matrix}
\right]
$$
- Accuracy is now:
$$\frac{x}{100}$$
- Precision is now:
$$\frac{50}{150-x}$$
- And Recall is still 100\%.




In [9]:
for x in range (100, 45, -5):
    a = int(x)
    p = int(5000/(150-x))
    r = 100
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    noise = np.random.random(n)
    data[s] = np.where ( (noise > (2*x-100)/100 ) & (data['y']==False), np.logical_not(data['y']), data['y'] )
data

100 100 100 100
95 95 90 100
90 90 83 100
85 85 76 100
80 80 71 100
75 75 66 100
70 70 62 100
65 65 58 100
60 60 55 100
55 55 52 100
50 50 50 100


Unnamed: 0,y,Acc_100_Pre_100_Rec_100,Acc_95_Pre_95_Rec_95,Acc_90_Pre_90_Rec_90,Acc_85_Pre_85_Rec_85,Acc_80_Pre_80_Rec_80,Acc_75_Pre_75_Rec_75,Acc_70_Pre_70_Rec_70,Acc_65_Pre_65_Rec_65,Acc_60_Pre_60_Rec_60,...,Acc_75_Pre_100_Rec_50,Acc_95_Pre_90_Rec_100,Acc_90_Pre_83_Rec_100,Acc_85_Pre_76_Rec_100,Acc_80_Pre_71_Rec_100,Acc_75_Pre_66_Rec_100,Acc_70_Pre_62_Rec_100,Acc_65_Pre_58_Rec_100,Acc_60_Pre_55_Rec_100,Acc_55_Pre_52_Rec_100
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,True,True
1,False,False,False,False,True,False,False,True,False,True,...,False,False,False,False,False,True,True,True,True,True
2,False,False,False,False,False,True,False,False,False,True,...,False,False,False,True,True,True,False,True,False,True
3,False,False,False,False,False,True,False,True,True,True,...,False,False,False,False,True,False,False,True,False,True
4,False,False,False,True,False,False,True,False,False,True,...,False,False,True,True,False,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,True,True,True,True,False,True,False,False,True,False,...,True,True,True,True,True,True,True,True,True,True
996,True,True,False,True,True,False,False,True,True,False,...,False,True,True,True,True,True,True,True,True,True
997,True,True,True,True,True,True,True,True,False,False,...,True,True,True,True,True,True,True,True,True,True
998,True,True,True,True,True,True,False,False,False,False,...,False,True,True,True,True,True,True,True,True,True


In [10]:
for x in range (100, 45, -5):
    a = int(x)
    p = int(5000/(150-x))
    r = 100
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    C = confusion_matrix(data['y'],data[s])
    acc = round(accuracy_score(data['y'], data[s])*100,2)
    rec = round(recall_score(data['y'], data[s])*100,2)
    pre = round(precision_score(data['y'], data[s])*100,2)

    print (s)
    print (C/n*100)
    print(acc, pre, rec)
    print ()

100 100 100 100
Acc_100_Pre_100_Rec_100
[[50.  0.]
 [ 0. 50.]]
100.0 100.0 100.0

95 95 90 100
Acc_95_Pre_90_Rec_100
[[44.4  5.6]
 [ 0.  50. ]]
94.4 89.93 100.0

90 90 83 100
Acc_90_Pre_83_Rec_100
[[40.2  9.8]
 [ 0.  50. ]]
90.2 83.61 100.0

85 85 76 100
Acc_85_Pre_76_Rec_100
[[35.3 14.7]
 [ 0.  50. ]]
85.3 77.28 100.0

80 80 71 100
Acc_80_Pre_71_Rec_100
[[29.8 20.2]
 [ 0.  50. ]]
79.8 71.23 100.0

75 75 66 100
Acc_75_Pre_66_Rec_100
[[25.4 24.6]
 [ 0.  50. ]]
75.4 67.02 100.0

70 70 62 100
Acc_70_Pre_62_Rec_100
[[20.3 29.7]
 [ 0.  50. ]]
70.3 62.74 100.0

65 65 58 100
Acc_65_Pre_58_Rec_100
[[14.1 35.9]
 [ 0.  50. ]]
64.1 58.21 100.0

60 60 55 100
Acc_60_Pre_55_Rec_100
[[11.5 38.5]
 [ 0.  50. ]]
61.5 56.5 100.0

55 55 52 100
Acc_55_Pre_52_Rec_100
[[ 5.2 44.8]
 [ 0.  50. ]]
55.2 52.74 100.0

50 50 50 100
Acc_50_Pre_50_Rec_100
[[ 0. 50.]
 [ 0. 50.]]
50.0 50.0 100.0



# Create x-Features with Perfect Precision but Decreasing Accuracy by Steps

- We start with a feature $y$ with $n$ rows, half True and half False.
- We have a list, $noise$, of random numbers in $[0,1)$.  
- We create a new feature for each value of $x \in \{100,90,\dots, 50\}$ that is $y$, except negating the values if $noise>(2x-100)/100$ and $y$ is True.
- The confusion matrix (as percentages) for $y$ on itself is:
$$
\left[
\begin{matrix}
    50 & 0 \cr
    0 & 50 \cr
\end{matrix}
\right]
$$
- The confusion martrix for the new feature is:
$$
\left[
\begin{matrix}
    50 & 0 \cr
    \displaystyle 50 \left( 1 -  \frac{2x-100}{100}\right) & \displaystyle 50\times \frac{2x-100}{100}  \cr
\end{matrix}
\right]
$$
- Accuracy is now:
$$\frac{x}{100}$$
- Precision is still 100%.
- And Recall is now:
$$\frac{2x-100}{100} = \frac{x-50}{50}$$





In [11]:
for x in range (100, 50, -5):
    a = int(x)
    p = 100
    r = int(100*(x-50)/50)
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    noise = np.random.random(n)
    data[s] = np.where ( (noise > (2*x-100)/100 ) & (data['y']==True), np.logical_not(data['y']), data['y'] )
data

100 100 100 100
95 95 100 90
90 90 100 80
85 85 100 70
80 80 100 60
75 75 100 50
70 70 100 40
65 65 100 30
60 60 100 20
55 55 100 10


Unnamed: 0,y,Acc_100_Pre_100_Rec_100,Acc_95_Pre_95_Rec_95,Acc_90_Pre_90_Rec_90,Acc_85_Pre_85_Rec_85,Acc_80_Pre_80_Rec_80,Acc_75_Pre_75_Rec_75,Acc_70_Pre_70_Rec_70,Acc_65_Pre_65_Rec_65,Acc_60_Pre_60_Rec_60,...,Acc_80_Pre_71_Rec_100,Acc_75_Pre_66_Rec_100,Acc_70_Pre_62_Rec_100,Acc_65_Pre_58_Rec_100,Acc_60_Pre_55_Rec_100,Acc_55_Pre_52_Rec_100,Acc_70_Pre_100_Rec_40,Acc_65_Pre_100_Rec_30,Acc_60_Pre_100_Rec_20,Acc_55_Pre_100_Rec_10
0,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,True,True,False,False,False,False
1,False,False,False,False,True,False,False,True,False,True,...,False,True,True,True,True,True,False,False,False,False
2,False,False,False,False,False,True,False,False,False,True,...,True,True,False,True,False,True,False,False,False,False
3,False,False,False,False,False,True,False,True,True,True,...,True,False,False,True,False,True,False,False,False,False
4,False,False,False,True,False,False,True,False,False,True,...,False,True,True,True,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,True,True,True,True,False,True,False,False,True,False,...,True,True,True,True,True,True,False,False,False,False
996,True,True,False,True,True,False,False,True,True,False,...,True,True,True,True,True,True,False,False,False,False
997,True,True,True,True,True,True,True,True,False,False,...,True,True,True,True,True,True,False,False,False,False
998,True,True,True,True,True,True,False,False,False,False,...,True,True,True,True,True,True,False,True,False,True


In [12]:
for x in range (100, 50, -5):
    a = int(x)
    p = 100
    r = int(100*(x-50)/50)
    print (x, a, p, r)
    s = 'Acc_' + str(a) + '_Pre_' + str(p) + '_Rec_' + str(r)
    C = confusion_matrix(data['y'],data[s])
    acc = round(accuracy_score(data['y'], data[s])*100,2)
    rec = round(recall_score(data['y'], data[s])*100,2)
    pre = round(precision_score(data['y'], data[s])*100,2)

    print (s)
    print (C/n*100)
    print(acc, pre, rec)
    print ()

100 100 100 100
Acc_100_Pre_100_Rec_100
[[50.  0.]
 [ 0. 50.]]
100.0 100.0 100.0

95 95 100 90
Acc_95_Pre_100_Rec_90
[[50.   0. ]
 [ 3.8 46.2]]
96.2 100.0 92.4

90 90 100 80
Acc_90_Pre_100_Rec_80
[[50.   0. ]
 [ 9.4 40.6]]
90.6 100.0 81.2

85 85 100 70
Acc_85_Pre_100_Rec_70
[[50.   0. ]
 [15.2 34.8]]
84.8 100.0 69.6

80 80 100 60
Acc_80_Pre_100_Rec_60
[[50.   0. ]
 [20.1 29.9]]
79.9 100.0 59.8

75 75 100 50
Acc_75_Pre_100_Rec_50
[[50.   0. ]
 [25.4 24.6]]
74.6 100.0 49.2

70 70 100 40
Acc_70_Pre_100_Rec_40
[[50.  0.]
 [30. 20.]]
70.0 100.0 40.0

65 65 100 30
Acc_65_Pre_100_Rec_30
[[50.   0. ]
 [35.4 14.6]]
64.6 100.0 29.2

60 60 100 20
Acc_60_Pre_100_Rec_20
[[50.   0. ]
 [39.9 10.1]]
60.1 100.0 20.2

55 55 100 10
Acc_55_Pre_100_Rec_10
[[50.   0. ]
 [44.1  5.9]]
55.9 100.0 11.8



In [13]:
for row in data:
    print (row)

y
Acc_100_Pre_100_Rec_100
Acc_95_Pre_95_Rec_95
Acc_90_Pre_90_Rec_90
Acc_85_Pre_85_Rec_85
Acc_80_Pre_80_Rec_80
Acc_75_Pre_75_Rec_75
Acc_70_Pre_70_Rec_70
Acc_65_Pre_65_Rec_65
Acc_60_Pre_60_Rec_60
Acc_55_Pre_55_Rec_55
Acc_50_Pre_50_Rec_50
Acc_97_Pre_95_Rec_100
Acc_94_Pre_90_Rec_100
Acc_91_Pre_85_Rec_100
Acc_87_Pre_80_Rec_100
Acc_83_Pre_75_Rec_100
Acc_78_Pre_70_Rec_100
Acc_73_Pre_65_Rec_100
Acc_66_Pre_60_Rec_100
Acc_59_Pre_55_Rec_100
Acc_50_Pre_50_Rec_100
Acc_97_Pre_100_Rec_95
Acc_95_Pre_100_Rec_90
Acc_92_Pre_100_Rec_85
Acc_90_Pre_100_Rec_80
Acc_87_Pre_100_Rec_75
Acc_85_Pre_100_Rec_70
Acc_82_Pre_100_Rec_65
Acc_80_Pre_100_Rec_60
Acc_77_Pre_100_Rec_55
Acc_75_Pre_100_Rec_50
Acc_95_Pre_90_Rec_100
Acc_90_Pre_83_Rec_100
Acc_85_Pre_76_Rec_100
Acc_80_Pre_71_Rec_100
Acc_75_Pre_66_Rec_100
Acc_70_Pre_62_Rec_100
Acc_65_Pre_58_Rec_100
Acc_60_Pre_55_Rec_100
Acc_55_Pre_52_Rec_100
Acc_70_Pre_100_Rec_40
Acc_65_Pre_100_Rec_30
Acc_60_Pre_100_Rec_20
Acc_55_Pre_100_Rec_10


# Feature Selection

## Variance Threshold
- I don't think that variance is at all relevant to our situation.  

In [14]:
V = []
for row in data:
    v = data[row].var()
    v = round(v,4)
    V.append([v, row])
V = sorted(V, key=lambda x:x[0], reverse=True)
for row in V:
    print (row)

[0.2503, 'y']
[0.2503, 'Acc_100_Pre_100_Rec_100']
[0.2502, 'Acc_95_Pre_95_Rec_95']
[0.2502, 'Acc_75_Pre_75_Rec_75']
[0.2502, 'Acc_70_Pre_70_Rec_70']
[0.2501, 'Acc_90_Pre_90_Rec_90']
[0.2501, 'Acc_85_Pre_85_Rec_85']
[0.2498, 'Acc_50_Pre_50_Rec_50']
[0.2497, 'Acc_80_Pre_80_Rec_80']
[0.2496, 'Acc_97_Pre_100_Rec_95']
[0.2495, 'Acc_65_Pre_65_Rec_65']
[0.2495, 'Acc_97_Pre_95_Rec_100']
[0.2492, 'Acc_60_Pre_60_Rec_60']
[0.249, 'Acc_55_Pre_55_Rec_55']
[0.2488, 'Acc_95_Pre_100_Rec_90']
[0.248, 'Acc_94_Pre_90_Rec_100']
[0.2471, 'Acc_95_Pre_90_Rec_100']
[0.2449, 'Acc_92_Pre_100_Rec_85']
[0.2418, 'Acc_91_Pre_85_Rec_100']
[0.2414, 'Acc_90_Pre_100_Rec_80']
[0.2406, 'Acc_90_Pre_83_Rec_100']
[0.2325, 'Acc_87_Pre_100_Rec_75']
[0.2312, 'Acc_87_Pre_80_Rec_100']
[0.2286, 'Acc_85_Pre_76_Rec_100']
[0.2271, 'Acc_85_Pre_100_Rec_70']
[0.2213, 'Acc_83_Pre_75_Rec_100']
[0.2149, 'Acc_82_Pre_100_Rec_65']
[0.2098, 'Acc_80_Pre_100_Rec_60']
[0.2094, 'Acc_80_Pre_71_Rec_100']
[0.2069, 'Acc_78_Pre_70_Rec_100']
[0.1968, '

In [15]:
sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
sel.fit_transform(data)
data_VT = data[data.columns[sel.get_support(indices=True)]]
data_VT.columns.symmetric_difference(data.columns)

Index(['Acc_50_Pre_50_Rec_100', 'Acc_55_Pre_100_Rec_10',
       'Acc_55_Pre_52_Rec_100'],
      dtype='object')

## SelectKBest
- Test first with the default scoring function, f_classif,
- Then test with $\chi^2$.

In [16]:
# Create and fit selector
selector = SelectKBest(f_classif, k='all')
selector.fit(data, data['y'])
scores = selector.scores_
Scores = []
for i, feature in enumerate(data):
    Scores.append([round(scores[i],0), feature])
Scores = sorted(Scores, key=lambda x:x[0], reverse=True)
for row in Scores:
    print (row)
# Get columns to keep and create new dataframe with those only
cols = selector.get_support(indices=True)
data_SKB = data.iloc[:,cols]
data_SKB.columns.symmetric_difference(data.columns)

[inf, 'y']
[inf, 'Acc_100_Pre_100_Rec_100']
[nan, 'Acc_50_Pre_50_Rec_100']
[9097.0, 'Acc_97_Pre_100_Rec_95']
[8412.0, 'Acc_97_Pre_95_Rec_100']
[6067.0, 'Acc_95_Pre_100_Rec_90']
[5348.0, 'Acc_95_Pre_95_Rec_95']
[4810.0, 'Acc_94_Pre_90_Rec_100']
[3956.0, 'Acc_95_Pre_90_Rec_100']
[2919.0, 'Acc_92_Pre_100_Rec_85']
[2213.0, 'Acc_91_Pre_85_Rec_100']
[2155.0, 'Acc_90_Pre_100_Rec_80']
[2084.0, 'Acc_90_Pre_90_Rec_90']
[2047.0, 'Acc_90_Pre_83_Rec_100']
[1377.0, 'Acc_87_Pre_100_Rec_75']
[1309.0, 'Acc_87_Pre_80_Rec_100']
[1198.0, 'Acc_85_Pre_76_Rec_100']
[1142.0, 'Acc_85_Pre_100_Rec_70']
[1113.0, 'Acc_85_Pre_85_Rec_85']
[969.0, 'Acc_83_Pre_75_Rec_100']
[828.0, 'Acc_82_Pre_100_Rec_65']
[742.0, 'Acc_80_Pre_100_Rec_60']
[736.0, 'Acc_80_Pre_71_Rec_100']
[701.0, 'Acc_78_Pre_70_Rec_100']
[587.0, 'Acc_80_Pre_80_Rec_80']
[581.0, 'Acc_77_Pre_100_Rec_55']
[515.0, 'Acc_75_Pre_66_Rec_100']
[483.0, 'Acc_75_Pre_100_Rec_50']
[358.0, 'Acc_73_Pre_65_Rec_100']
[341.0, 'Acc_70_Pre_62_Rec_100']
[333.0, 'Acc_70_Pre_10

  f = msb / msw
  f = msb / msw


Index([], dtype='object')

In [17]:
# Create and fit selector
selector = SelectKBest(chi2, k='all')
selector.fit(data, data['y'])
scores = selector.scores_
Scores = []
for i, feature in enumerate(data):
    Scores.append([round(scores[i],0), feature])
Scores = sorted(Scores, key=lambda x:x[0], reverse=True)
for row in Scores:
    print (row)
# Get columns to keep and create new dataframe with those only
cols = selector.get_support(indices=True)
data_SKB = data.iloc[:,cols]
data_SKB.columns.symmetric_difference(data.columns)

[500.0, 'y']
[500.0, 'Acc_100_Pre_100_Rec_100']
[474.0, 'Acc_97_Pre_100_Rec_95']
[462.0, 'Acc_95_Pre_100_Rec_90']
[427.0, 'Acc_92_Pre_100_Rec_85']
[422.0, 'Acc_97_Pre_95_Rec_100']
[421.0, 'Acc_95_Pre_95_Rec_95']
[406.0, 'Acc_90_Pre_100_Rec_80']
[375.0, 'Acc_94_Pre_90_Rec_100']
[367.0, 'Acc_87_Pre_100_Rec_75']
[355.0, 'Acc_95_Pre_90_Rec_100']
[348.0, 'Acc_85_Pre_100_Rec_70']
[329.0, 'Acc_90_Pre_90_Rec_90']
[312.0, 'Acc_82_Pre_100_Rec_65']
[299.0, 'Acc_80_Pre_100_Rec_60']
[281.0, 'Acc_91_Pre_85_Rec_100']
[270.0, 'Acc_90_Pre_83_Rec_100']
[269.0, 'Acc_85_Pre_85_Rec_85']
[269.0, 'Acc_77_Pre_100_Rec_55']
[246.0, 'Acc_75_Pre_100_Rec_50']
[205.0, 'Acc_87_Pre_80_Rec_100']
[200.0, 'Acc_70_Pre_100_Rec_40']
[194.0, 'Acc_80_Pre_80_Rec_80']
[193.0, 'Acc_85_Pre_76_Rec_100']
[163.0, 'Acc_83_Pre_75_Rec_100']
[146.0, 'Acc_65_Pre_100_Rec_30']
[127.0, 'Acc_80_Pre_71_Rec_100']
[120.0, 'Acc_78_Pre_70_Rec_100']
[107.0, 'Acc_75_Pre_75_Rec_75']
[101.0, 'Acc_60_Pre_100_Rec_20']
[86.0, 'Acc_75_Pre_66_Rec_100']
[

Index([], dtype='object')

In [18]:
# Create and fit selector
selector = SelectKBest(mutual_info_classif, k='all')
selector.fit(data, data['y'])
scores = selector.scores_
Scores = []
for i, feature in enumerate(data):
    Scores.append([round(scores[i],0), feature])
Scores = sorted(Scores, key=lambda x:x[0], reverse=True)
for row in Scores:
    print (row)
# Get columns to keep and create new dataframe with those only
cols = selector.get_support(indices=True)
data_SKB = data.iloc[:,cols]
data_SKB.columns.symmetric_difference(data.columns)

[1.0, 'y']
[1.0, 'Acc_100_Pre_100_Rec_100']
[1.0, 'Acc_95_Pre_95_Rec_95']
[1.0, 'Acc_97_Pre_95_Rec_100']
[1.0, 'Acc_94_Pre_90_Rec_100']
[1.0, 'Acc_97_Pre_100_Rec_95']
[1.0, 'Acc_95_Pre_100_Rec_90']
[1.0, 'Acc_95_Pre_90_Rec_100']
[0.0, 'Acc_90_Pre_90_Rec_90']
[0.0, 'Acc_85_Pre_85_Rec_85']
[0.0, 'Acc_80_Pre_80_Rec_80']
[0.0, 'Acc_75_Pre_75_Rec_75']
[0.0, 'Acc_70_Pre_70_Rec_70']
[0.0, 'Acc_65_Pre_65_Rec_65']
[0.0, 'Acc_60_Pre_60_Rec_60']
[0.0, 'Acc_55_Pre_55_Rec_55']
[0.0, 'Acc_50_Pre_50_Rec_50']
[0.0, 'Acc_91_Pre_85_Rec_100']
[0.0, 'Acc_87_Pre_80_Rec_100']
[0.0, 'Acc_83_Pre_75_Rec_100']
[0.0, 'Acc_78_Pre_70_Rec_100']
[0.0, 'Acc_73_Pre_65_Rec_100']
[0.0, 'Acc_66_Pre_60_Rec_100']
[0.0, 'Acc_59_Pre_55_Rec_100']
[0.0, 'Acc_50_Pre_50_Rec_100']
[0.0, 'Acc_92_Pre_100_Rec_85']
[0.0, 'Acc_90_Pre_100_Rec_80']
[0.0, 'Acc_87_Pre_100_Rec_75']
[0.0, 'Acc_85_Pre_100_Rec_70']
[0.0, 'Acc_82_Pre_100_Rec_65']
[0.0, 'Acc_80_Pre_100_Rec_60']
[0.0, 'Acc_77_Pre_100_Rec_55']
[0.0, 'Acc_75_Pre_100_Rec_50']
[0.0,

Index([], dtype='object')