In [1]:
import os

import pandas as pd

# Directory that holds the CSV files read by the cells below. The default is the
# original author's location; set the DATASET_DIR environment variable to run
# the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get("DATASET_DIR", r"C:\Users\vish\Documents\Data\Dataset")
os.chdir(DATA_DIR)

In [2]:
#Pearson's correlation
# The coefficient ranges from -1 to 1; a value of 0 means no linear correlation. As a rule of thumb, a value below -0.5 or above 0.5
# indicates a notable correlation, while values between -0.5 and 0.5 suggest a less notable correlation.
# f_regression: Used only for numeric targets and based on linear regression performance
# Null hypothesis - the samples are uncorrelated/independent.
# Alternate hypothesis - the samples are correlated/dependent.
# If the p-value is less than the significance level, we reject the null hypothesis in favour of the alternate hypothesis.

In [2]:
# Read the insurance data from the current working directory.
df = pd.read_csv("Insurance.csv")

In [3]:
from scipy.stats import pearsonr

# Pearson correlation between the 'bmi' and 'charges' columns; pearsonr
# returns the coefficient together with its p-value.
corr, p = pearsonr(df['bmi'], df['charges'])
print('Pearsons correlation: %.3f' % corr)
print('P value : %.3f' % p)

# Compare the p-value with the chosen significance level.
alpha = 0.05
if p > alpha:
    # A large p-value is only a lack of evidence against H0; it does not prove
    # H0, so the outcome is reported as "fail to reject" rather than "accept".
    print('We fail to reject null hypotheses(H0)')
else:
    print('We reject null hypotheses(H0) and accept alternate hypotheses(H1)')

Pearsons correlation: 0.198
P value : 0.000
We reject null hypotheses(H0) and accept alternate hypotheses(H1)


In [4]:
# Cross-check with pandas: pairwise Pearson correlation matrix of the two columns.
df2 = df.loc[:, ['bmi', 'charges']]
df2.corr(method='pearson')

Unnamed: 0,bmi,charges
bmi,1.0,0.198341
charges,0.198341,1.0


In [5]:
#Spearman's coefficient
# The function takes two real-valued samples as arguments and
# returns both the correlation coefficient, in the range between -1 and 1,
# and the p-value for interpreting the significance of the coefficient.
# Null hypothesis - the samples are uncorrelated/independent.
# Alternate hypothesis - the samples are correlated/dependent.
# If the p-value is less than the significance level, we reject the null hypothesis in favour of the alternate hypothesis.

In [6]:
# Reload the insurance data so this section starts from the raw file.
df = pd.read_csv("Insurance.csv")

In [7]:
from scipy.stats import spearmanr

# Spearman rank correlation between the 'bmi' and 'charges' columns;
# spearmanr returns the coefficient together with its p-value.
corr, p = spearmanr(df['bmi'], df['charges'])
print('Spearman correlation: %.3f' % corr)
print('P value: %.3f' % p)

# interpret the significance
alpha = 0.05
if p > alpha:
    # A large p-value is only a lack of evidence against H0; it does not prove
    # H0, so the outcome is reported as "fail to reject" rather than "accept".
    print('We fail to reject null hypotheses(H0)')
else:
    print('We reject null hypotheses(H0) and accept alternate hypotheses(H1)')

Spearman correlation: 0.119
P value: 0.000
We reject null hypotheses(H0) and accept alternate hypotheses(H1)


In [8]:
# Cross-check with pandas: pairwise Spearman correlation matrix of the two columns.
df2 = df.loc[:, ['bmi', 'charges']]
df2.corr(method='spearman')

Unnamed: 0,bmi,charges
bmi,1.0,0.119396
charges,0.119396,1.0


In [9]:
#Pearson’s Chi-Square 
# Null hypothesis - the variables are uncorrelated/independent.
# Alternate hypothesis - the variables are correlated/dependent.
# If the chi-square statistic is less than the critical value, fail to reject the null hypothesis.
# If the chi-square statistic is greater than or equal to the critical value, reject the null hypothesis.
# If the p-value is greater than the significance level, fail to reject the null hypothesis.
# If the p-value is less than or equal to the significance level, reject the null hypothesis.

In [10]:
# Read the Titanic data from the current working directory.
df = pd.read_csv("Titanic data.csv")

In [18]:
from scipy.stats import chi2, chi2_contingency

# Observed counts: the 'SibSp' column (rows) against 'Survived' (columns).
crosstab = pd.crosstab(df["SibSp"], df["Survived"])
stat, p, dof, expected = chi2_contingency(crosstab)

alpha = 0.05
# Despite its name this holds 1 - alpha, i.e. the probability at which the
# chi-square quantile (the critical value) is evaluated.
level_of_significance = 1 - alpha
critical = chi2.ppf(level_of_significance, dof)

print('Dataset')
print(crosstab)
print(
    'alpha=%.2f, level_of_significance=%.2f, critical=%.3f, stat=%.3f'
    % (alpha, level_of_significance, critical, stat)
)

# Decision 1: compare the test statistic with the critical value.
print('Dependent (reject H0)' if abs(stat) >= critical else 'Independent (fail to reject H0)')

# Decision 2: compare the p-value with alpha.
print('Dependent (reject H0)' if p <= alpha else 'Independent (fail to reject H0)')

Dataset
Survived    0    1
SibSp             
0         398  210
1          97  112
2          15   13
3          12    4
4          15    3
5           5    0
8           7    0
alpha=0.05, level_of_significance=0.95, critical=12.592, stat=37.272
Dependent (reject H0)
Dependent (reject H0)


In [16]:
# f_regression
# Used only for numeric targets and based on linear regression performance

In [17]:
#Numeric input Numeric output

In [18]:
# header=None: the first row of the file is read as data, so pandas labels
# the columns with integers starting at 0.
df = pd.read_csv("Sonar.csv", header=None)
# Keep only the columns labelled 0 through 10.
df = df[list(range(11))]

In [19]:
from sklearn import datasets
from sklearn.feature_selection import SelectKBest, f_regression

# The first ten columns are the candidate features; the eleventh is the target.
X = df.iloc[:, :10].values
y = df.iloc[:, 10].values

# Keep the two features that score highest under f_regression.
f_regression_selector = SelectKBest(score_func=f_regression, k=2)
X_kbest = f_regression_selector.fit_transform(X, y)

print('Original number of features:', X.shape)
print('Reduced number of features:', X_kbest.shape)

Original number of features: (208, 10)
Reduced number of features: (208, 2)


In [20]:
#categorical input numeric output

In [21]:
# Reload the insurance data and preview its first five rows.
df = pd.read_csv('Insurance.csv')
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [22]:
from sklearn import preprocessing

# Replace the string labels in column 'sex' with integer codes.
label_encoder = preprocessing.LabelEncoder()
df['sex'] = label_encoder.fit_transform(df['sex'])

In [23]:
from sklearn import preprocessing

# Replace the string labels in column 'smoker' with integer codes, using a
# separate encoder so that the one fitted on 'sex' is left untouched.
label_encoder2 = preprocessing.LabelEncoder()
df['smoker'] = label_encoder2.fit_transform(df['smoker'])

In [24]:
# Mark 'sex' and 'children' as categorical, then keep the three inputs
# followed by the numeric target 'charges'.
df = df.assign(
    sex=pd.Categorical(df['sex']),
    children=pd.Categorical(df['children']),
)[['sex', 'smoker', 'children', 'charges']]

In [25]:
from sklearn import datasets
from sklearn.feature_selection import SelectKBest, f_regression

# The first three columns are the inputs; the fourth column is the target.
X = df.iloc[:, :3].values
y = df.iloc[:, 3].values

# Keep the two inputs that score highest under f_regression.
f_regression_selector = SelectKBest(score_func=f_regression, k=2)
X_kbest = f_regression_selector.fit_transform(X, y)

print('Original number of features:', X.shape)
print('Reduced number of features:', X_kbest.shape)

Original number of features: (1338, 3)
Reduced number of features: (1338, 2)


In [26]:
#chi2
#Computes the chi-square statistic for categorical targets, which is less sensitive to nonlinear relationships between the predictive variable and its target.

In [27]:
#categorical input categorical output

In [28]:
# Reload the Titanic data so this section starts from the raw file.
df = pd.read_csv("Titanic data.csv")

In [29]:
from sklearn import preprocessing

# Replace the string labels in column 'Sex' with integer codes.
label_encoder = preprocessing.LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex'])

In [30]:
# Convert each of these columns to a pandas Categorical, in the same order
# as the original one-assignment-per-column version.
for column in ('SibSp', 'Survived', 'Pclass', 'Sex'):
    df[column] = pd.Categorical(df[column])

In [31]:
# Keep the three inputs followed by the target column 'Survived'.
df = df.loc[:, ['SibSp', 'Pclass', 'Sex', 'Survived']]

In [32]:
# Note: this binds the name `chi2` to scikit-learn's scoring function.
from sklearn.feature_selection import SelectKBest, chi2

# The first three columns are the inputs; the fourth column is the target.
X = df.iloc[:, :3].values
y = df.iloc[:, 3].values

# Keep the two inputs with the highest chi-squared scores.
fvalue_selector = SelectKBest(score_func=chi2, k=2)
X_kbest = fvalue_selector.fit_transform(X, y)

print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_kbest.shape[1])

Original number of features: 3
Reduced number of features: 2


In [33]:
#numeric input categorical output

In [34]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2

# Features and target of the bundled iris data set.
iris = load_iris()
y = iris.target

# Casting to int truncates each measurement to a whole number, which is how
# this example turns the numeric features into category-like values.
X = iris.data.astype(int)

# Keep the two features with the highest chi-squared scores.
chi2_selector = SelectKBest(score_func=chi2, k=2)
X_kbest = chi2_selector.fit_transform(X, y)

# Report the feature count before and after selection.
print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [35]:
#f_classif
#Used only for categorical targets and based on the Analysis of Variance (ANOVA) statistical test.

In [36]:
#categorical input categorical output

In [37]:
# Reload the Titanic data so this section starts from the raw file.
df = pd.read_csv("Titanic data.csv")

In [38]:
from sklearn import preprocessing

# Replace the string labels in column 'Sex' with integer codes.
label_encoder = preprocessing.LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex'])

In [39]:
# Convert each of these columns to a pandas Categorical, in the same order
# as the original one-assignment-per-column version.
for column in ('SibSp', 'Survived', 'Pclass', 'Sex'):
    df[column] = pd.Categorical(df[column])

In [40]:
# Keep the three inputs followed by the target column 'Survived'.
df = df.loc[:, ['SibSp', 'Pclass', 'Sex', 'Survived']]

In [41]:
from sklearn.feature_selection import SelectKBest, f_classif

# The first three columns are the inputs; the fourth column is the target.
X = df.iloc[:, :3].values
y = df.iloc[:, 3].values

# Keep the two inputs with the highest ANOVA F-values.
fvalue_selector = SelectKBest(score_func=f_classif, k=2)
X_kbest = fvalue_selector.fit_transform(X, y)

print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_kbest.shape[1])

Original number of features: 3
Reduced number of features: 2


In [42]:
#numeric input categorical output

In [43]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

# Features and target of the bundled iris data set.
iris = load_iris()
X, y = iris.data, iris.target

# Selector that keeps the two features with the best ANOVA F-values.
fvalue_selector = SelectKBest(score_func=f_classif, k=2)

# Fit the selector on the data and reduce X to the selected columns.
X_kbest = fvalue_selector.fit_transform(X, y)

print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [4]:
#Mutual Information
# Mutual information from the field of information theory is the application of information gain to feature selection.
# Used when target variable is categorical
# Mutual information is calculated between two variables and measures the reduction in uncertainty for one variable given a known value of the other variable
# Mutual information is straightforward when considering the distribution of two discrete (categorical or ordinal) variables such as categorical input and categorical output data.
# Nevertheless, it can be adapted for use with numerical input and categorical output

In [5]:
#categorical input categorical output

In [6]:
# Reload the Titanic data so this section starts from the raw file.
df = pd.read_csv("Titanic data.csv")

In [7]:
from sklearn import preprocessing

# Replace the string labels in column 'Sex' with integer codes.
label_encoder = preprocessing.LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex'])

In [8]:
# Convert each of these columns to a pandas Categorical, in the same order
# as the original one-assignment-per-column version.
for column in ('SibSp', 'Survived', 'Pclass', 'Sex'):
    df[column] = pd.Categorical(df[column])

In [9]:
# Keep the three inputs followed by the target column 'Survived'.
df = df.loc[:, ['SibSp', 'Pclass', 'Sex', 'Survived']]

In [11]:
from functools import partial

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# The first three columns are the inputs; the fourth column is the target.
X = df.iloc[:,0:3].values
y = df.iloc[:,3].values

# All three inputs are categorical codes. By default mutual_info_classif
# treats a dense X as continuous and perturbs it with random noise, so the
# scores would be computed with the wrong estimator and change between runs.
# discrete_features=True makes it use the discrete estimator instead.
fvalue_selector = SelectKBest(
    partial(mutual_info_classif, discrete_features=True), k=2
)

X_kbest = fvalue_selector.fit_transform(X, y)

print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_kbest.shape[1])

Original number of features: 3
Reduced number of features: 2


In [12]:
#numeric input categorical output

In [14]:
from functools import partial

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

iris = load_iris()
X = iris.data
y = iris.target

# Create a SelectKBest object that keeps the two features with the highest
# mutual information with the target. The estimator adds random noise to
# continuous features, so random_state is fixed to make the scores (and hence
# the selection) reproducible from run to run.
fvalue_selector = SelectKBest(partial(mutual_info_classif, random_state=0), k=2)

# Apply the SelectKBest object to the features and target
X_kbest = fvalue_selector.fit_transform(X, y)

print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [11]:
#Recursive Feature Elimination

In [1]:
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

dataset = datasets.load_iris()
model = LogisticRegression()

# Recursively drop the weakest feature until three remain. The count is passed
# by keyword: newer scikit-learn releases make n_features_to_select
# keyword-only and reject the positional form RFE(model, 3).
rfe = RFE(model, n_features_to_select=3)
rfe = rfe.fit(dataset.data, dataset.target)

# support_: boolean mask of the kept features; ranking_: 1 marks a kept feature.
print(rfe.support_)
print(rfe.ranking_)

[False  True  True  True]
[2 1 1 1]


In [2]:
from sklearn import datasets
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
import pandas as pd

dataset = datasets.load_iris()
model = LogisticRegression()

# Recursive feature elimination scored by 10-fold cross-validated accuracy;
# fit() returns the fitted selector itself.
rfecv = RFECV(estimator=model, cv=10, scoring="accuracy").fit(dataset.data, dataset.target)

# Boolean mask of the kept features, then their ranking, one per line.
print(rfecv.support_, rfecv.ranking_, sep="\n")

[ True  True  True  True]
[1 1 1 1]


In [3]:
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import pandas as pd

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2; this cell needs an older release or a replacement regression data set.
dataset = datasets.load_boston()
model = LinearRegression()

# Recursively drop the weakest feature until five remain. The count is passed
# by keyword: newer scikit-learn releases make n_features_to_select
# keyword-only and reject the positional form RFE(model, 5).
rfe = RFE(model, n_features_to_select=5)
rfe = rfe.fit(dataset.data, dataset.target)

# support_: boolean mask of the kept features; ranking_: 1 marks a kept feature.
print(rfe.support_)
print(rfe.ranking_)

[False False False  True  True  True False  True False False  True False
 False]
[4 6 5 1 1 1 9 1 3 7 1 8 2]


In [5]:
from sklearn import datasets
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
import pandas as pd

dataset = datasets.load_boston()
model = LinearRegression()

# Recursive feature elimination scored by 10-fold cross-validated negative
# mean squared error; fit() returns the fitted selector itself.
selector = RFECV(
    estimator=model, cv=10, scoring="neg_mean_squared_error"
).fit(dataset.data, dataset.target)

# Boolean mask of the kept features, then their ranking, one per line.
print(selector.support_, selector.ranking_, sep="\n")

[False False False  True  True  True False  True False False  True False
  True]
[3 5 4 1 1 1 8 1 2 6 1 7 1]


In [16]:
#Boruta

In [31]:
from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy as bp
from sklearn.datasets import load_boston

dataset = load_boston()
# The DataFrame is built only to display the feature names.
dataset_df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
print(dataset_df.columns)
X = dataset.data
y = dataset.target

# Boruta compares real features against shuffled "shadow" copies using a
# random forest, so the result depends on the random seed. Both the forest
# and Boruta are seeded so that re-running the cell selects the same features.
rf_model = RandomForestRegressor(n_jobs=4, oob_score=True, random_state=42)
feat_selector = bp(rf_model, n_estimators='auto', verbose=0, max_iter=100, random_state=42)
feat_selector.fit(X, y)

# Names of the features whose entry in the support_ mask is true.
selected_features = [
    name for name, keep in zip(dataset.feature_names, feat_selector.support_) if keep
]
print(feat_selector.n_features_)
print(selected_features)

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
      dtype='object')
9
['CRIM', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [32]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy as bp
from sklearn.datasets import load_iris

dataset = load_iris()
# The DataFrame is built only to display the feature names.
dataset_df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
print(dataset_df.columns)
X = dataset.data
y = dataset.target

# Boruta compares real features against shuffled "shadow" copies using a
# random forest, so the result depends on the random seed. Both the forest
# and Boruta are seeded so that re-running the cell selects the same features.
rf_model = RandomForestClassifier(n_jobs=4, oob_score=True, random_state=42)
feat_selector = bp(rf_model, n_estimators='auto', verbose=0, max_iter=100, random_state=42)
feat_selector.fit(X, y)

# Names of the features whose entry in the support_ mask is true.
selected_features = [
    name for name, keep in zip(dataset.feature_names, feat_selector.support_) if keep
]
print(feat_selector.n_features_)
print(selected_features)

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')
4
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [70]:
#Intrinsic

In [71]:
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier

dataset = datasets.load_iris()

# Tree ensembles are randomised; without a fixed seed the importances printed
# below change on every run, so random_state is pinned for reproducibility.
model = ExtraTreesClassifier(random_state=42)
model.fit(dataset.data, dataset.target)

# One importance value per input feature, in column order.
print(model.feature_importances_)

  from numpy.core.umath_tests import inner1d


[0.1668905  0.04699898 0.43051088 0.35559964]


In [45]:
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

dataset = datasets.load_iris()

# Random forests are randomised; without a fixed seed the importances printed
# below change on every run, so random_state is pinned for reproducibility.
model = RandomForestClassifier(random_state=42)
model.fit(dataset.data, dataset.target)

# One importance value per input feature, in column order.
print(model.feature_importances_)

[0.0818003  0.03399452 0.50673294 0.37747225]


  from numpy.core.umath_tests import inner1d
