## Feature Selection

In [None]:
import pandas as pd
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score,f1_score
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression
from math import sqrt

In [None]:
# Load the raw transactions; the path is relative to the notebook's working directory.
data1 = pd.read_csv("training.csv")

## Feature Engineering

#### Convert Time to Datetime format

In [None]:
# Parse the timestamp column; errors='coerce' turns unparseable values into NaT
# instead of raising.
start_times = pd.to_datetime(data1['TransactionStartTime'], errors='coerce')
data1['TransactionStartTime'] = start_times

#### Extracting features from Datetime

In [None]:
# Derive Month, Weekday, Day, Hour, Minute and Seconds columns from the parsed
# transaction timestamp. Each pair is (new column name, `.dt` accessor field);
# the order matches the order in which the columns are appended to data1.
start_time_accessor = data1['TransactionStartTime'].dt
datetime_fields = [
    ('Month', 'month'),
    ('Weekday', 'weekday'),
    ('Day', 'day'),
    ('Hour', 'hour'),
    ('Minute', 'minute'),
    ('Seconds', 'second'),
]
for column_name, dt_field in datetime_fields:
    data1[column_name] = getattr(start_time_accessor, dt_field)

In [None]:
# NOTE(review): data1 already comes from pd.read_csv, so re-wrapping it in
# pd.DataFrame looks redundant — confirm before removing.
data1=pd.DataFrame(data1)

In [None]:
# Show the first row of data1 as a quick sanity check.
data1.head(1)

#### Drop Columns

In [None]:
# Columns removed before modelling: Value, the identifier columns, the
# currency/country codes, and the raw timestamp (its parts were extracted above).
columns_to_drop = [
    'Value',
    'TransactionId',
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'CurrencyCode',
    'CountryCode',
    'TransactionStartTime',
]
# drop() returns a new frame, so data1 itself is left unchanged.
data = data1.drop(columns=columns_to_drop)

In [None]:
# Show the first row of data after the columns above were dropped.
data.head(1)

#### Get dummies for categorical

In [None]:
# One-hot encode the categorical columns; each listed column is replaced by one
# indicator column per category.
columns = [
    "ProviderId",
    "ProductCategory",
    "ProductId",
    "ChannelId",
]
data = pd.get_dummies(data=data, columns=columns)

#### Encode the Amount column as credit (0) or debit (1)

In [None]:
# Encode Amount as a binary flag: 0 for negative amounts (credit), 1 otherwise (debit).
# The flag is derived from the untouched data1['Amount'] rather than from
# data['Amount'] itself, so re-running this cell gives the same result instead of
# turning every value into 1. data and data1 share the same row index.
# `~(x < 0)` (rather than `x >= 0`) keeps the original rule that anything not
# strictly negative, including missing values, maps to 1.
data['Amount'] = (~(data1['Amount'] < 0)).astype('int64')

#### Label Encoder

LabelEncoder converts each class under specified feature to a numerical value.

In [None]:
# Disabled experiment: label-encode object columns as integers.
# The code below is wrapped in a string literal, so nothing is encoded when this
# cell runs. Several of the referenced columns (TransactionId, BatchId, AccountId,
# SubscriptionId, CustomerId, CurrencyCode) are dropped from `data` earlier, and
# the remaining ones are one-hot encoded by get_dummies instead.
'''le = preprocessing.LabelEncoder()
data['TransactionId'] = le.fit_transform(data['TransactionId'])
data['BatchId'] = le.fit_transform(data['BatchId'])
data['AccountId']  = le.fit_transform(data['AccountId'])
data['SubscriptionId']= le.fit_transform( data['SubscriptionId']) 
data['CustomerId']= le.fit_transform( data['CustomerId'])
data['CurrencyCode']= le.fit_transform( data['CurrencyCode'])
data['ProviderId']= le.fit_transform( data['ProviderId'])
data['ProductCategory']= le.fit_transform( data['ProductCategory'])
data['ProductId']= le.fit_transform( data['ProductId'])
data['ChannelId']= le.fit_transform( data['ChannelId'])'''

In [None]:
# Show the first row of data after encoding.
data.head(1)

### Balancing the Dataset ### 

Here we'll use **Synthetic Oversampling**. This method helps to avoid overfitting: a subset of the minority class is chosen, and synthetic examples based on that subset are created to balance the overall dataset. This adds new information to the dataset and increases the overall number of observations.
We use ***SMOTE*** (synthetic minority oversampling technique) to balance the dataset.

In [None]:
# Separate the predictors from the target. y1 is kept as a one-column DataFrame.
X1 = data.drop(columns=['FraudResult'])
y1 = data.loc[:, ['FraudResult']]

In [None]:
# Hold out 20% of the original (imbalanced) data before any resampling.
from sklearn.model_selection import train_test_split

# stratify=y1 keeps the fraud / non-fraud ratio the same in both partitions.
# Without it, a random split of a heavily imbalanced target can leave one side
# with very few (or no) fraud rows.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=30,
    stratify=y1,
)

In [None]:
# Oversample the minority class of the training partition with SMOTE.
smote_algo = SMOTE(random_state=0)
# fit_resample is the current imbalanced-learn API; the older fit_sample alias
# has been removed from the library.
smote_data_X, smote_data_Y = smote_algo.fit_resample(X_train, y_train)
# Re-wrap so both results are DataFrames with known column names, whether the
# sampler returned arrays or DataFrames.
smote_data_X = pd.DataFrame(data=smote_data_X, columns=X_train.columns)
smote_data_Y = pd.DataFrame(data=smote_data_Y, columns=['FraudResult'])

# Join X and Y smote data into one frame. A copy is used so that adding the
# target column does not also add it to smote_data_X.
smote_data = smote_data_X.copy()
smote_data['FraudResult'] = smote_data_Y['FraudResult']

# Count of fraud and non-fraud cases after resampling
print("Fraud: {}".format((smote_data["FraudResult"]==1).sum()))
print("Non-Fraud: {}".format((smote_data["FraudResult"]==0).sum()))

In [None]:
# Visualise the number of fraud and non-fraud rows after resampling.
fig, ax = plt.subplots()
# Pass the column by keyword: current seaborn versions treat the first positional
# argument as `data`, so a positional Series no longer means `x`.
# ax=ax draws on the axes created above rather than on the implicit current axes.
g = sns.countplot(x='FraudResult', data=smote_data, palette='viridis', ax=ax)
g.set_xticklabels(['Not Fraud', 'Fraud'])
# Hide the count labels on the y axis; only the relative bar heights are shown.
g.set_yticklabels([])

### Splitting Dataset

In [None]:
# Predictors and target taken from the SMOTE-resampled frame.
# y stays a one-column DataFrame.
X = smote_data.drop(columns=['FraudResult'])
y = smote_data.loc[:, ['FraudResult']]

### Model Development

In [None]:
# 80/20 split of X and y into training and testing sets.
# NOTE(review): X and y are built from smote_data, so the test partition here
# contains resampled rows and the X_test/y_test from the earlier split are
# overwritten — confirm this is the intended evaluation set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Logistic Regression

In [None]:
# Fit a regularised logistic regression (C=0.01) and score it on the test split.
# y_train is a one-column DataFrame; scikit-learn expects a 1-D target and
# otherwise emits a column-vector warning, so flatten it explicitly.
LR = LogisticRegression(C=0.01, solver='lbfgs').fit(X_train, y_train.values.ravel())
LR_predict = LR.predict(X_test)

print('The Logistic Regression F1-score score is {}'.format(f1_score(y_test,LR_predict,average='weighted')))
print('The Logistic Regression Accuracy score is {}'.format(accuracy_score(y_test,LR_predict)))

#### Random Forest Classifier

In [None]:
# Fit a 10-tree random forest with a fixed seed and score it on the test split.
forest_clas = RandomForestClassifier(n_estimators=10, random_state=42)
# y_train is a one-column DataFrame; pass a 1-D array, which is the shape
# scikit-learn expects for a single-output target.
forest_clas.fit(X_train, y_train.values.ravel())

RF_predict = forest_clas.predict(X_test)

print('The  RFC F1-score score is {}'.format(f1_score(y_test,RF_predict,average='weighted')))
print('The  RFC Accuracy score is {}'.format(accuracy_score(y_test,RF_predict)))

#### SVR

In [None]:
# Disabled experiment: the whole cell is a string literal, so no SVR model is
# trained when it runs.
# NOTE(review): SVR is a regressor, so its predictions would presumably need to be
# converted to class labels before f1_score / accuracy_score can be applied —
# confirm before re-enabling.
'''from sklearn.svm import SVR

svm_reg = SVR(kernel="linear")
svm_reg.fit(X_train, y_train)
svm_predictions = svm_reg.predict(X_test)

print('The  SVR F1-score score is {}'.format(f1_score(y_test,svm_predictions,average='weighted')))
print('The  SVR Accuracy score is {}'.format(accuracy_score(y_test,svm_predictions)))'''

### KNN

In [None]:
# Evaluate k-nearest-neighbours classifiers for k = 1 .. Ks-1.
Ks = 8
# mean_acc[k-1] holds the test accuracy obtained with k neighbours.
mean_acc = np.zeros((Ks-1))
# y_train is a one-column DataFrame; scikit-learn expects a 1-D target.
y_train_1d = np.ravel(y_train)
for n in range(1,Ks):
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train, y_train_1d)
    KNN_predict=neigh.predict(X_test)

    knn_accuracy = accuracy_score(y_test,KNN_predict)
    # Record the accuracy so the sweep can be compared afterwards; the array was
    # previously allocated but never filled.
    mean_acc[n-1] = knn_accuracy

    # Identify which k the two scores below belong to.
    print('k = {}'.format(n))
    print('The  KNN F1-score score is {}'.format(f1_score(y_test,KNN_predict,average='weighted')))
    print('The  KNN Accuracy score is {}'.format(knn_accuracy))

# Summarise the sweep: argmax is 0-based, k starts at 1.
print('Best KNN accuracy {} obtained with k = {}'.format(mean_acc.max(), mean_acc.argmax() + 1))

### Normalize Data

In [None]:
# Disabled: the statement is a string literal, so X is not standardised here.
'''X= preprocessing.StandardScaler().fit(X).transform(X)'''

In [None]:
# Disabled: fit a MinMaxScaler (range 0..1) on X.
# The code is a string literal, so no scaler is created when this cell runs.
'''from sklearn.preprocessing import StandardScaler,MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(X)'''

### Check accuracy before Feature selection

In [None]:
# Disabled baseline: 10-fold cross-validated predictions from a
# StandardScaler + KNeighborsClassifier(n_neighbors=10) pipeline.
# The code is a string literal, so nothing is evaluated when this cell runs.
# NOTE(review): it reports regression metrics (RMSE, R squared) for a classifier;
# the accuracy / F1 scores used elsewhere in this notebook may be the better fit —
# confirm before re-enabling.
'''cv = KFold(n_splits=10, random_state=0, shuffle=False)
classifier_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=10))
y_pred = cross_val_predict(classifier_pipeline, X, y, cv=cv)
print("RMSE: " + str(round(sqrt(mean_squared_error(y,y_pred)),2)))
print("R_squared: " + str(round(r2_score(y,y_pred),2)))'''

##  Feature selection techniques.

1. Univariate Selection

2. Feature Importance

3. Correlation Matrix with Heatmap

4. Variance

## Univariate Selection

Statistical tests can be used to select those features that have the strongest relationship with the output variable.

The scikit-learn library provides the SelectKBest class that can be used with a suite of different statistical tests to select a specific number of features.

The example below uses the chi-squared (chi²) statistical test for non-negative features to select the 10 best features from the SMOTE-balanced transaction dataset.



In [None]:
# Score every feature against the target with the chi-squared test
# (the selector is configured to keep the 10 best).
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)

# One frame for the scores and one for the feature names, in matching row order.
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# Put names and scores side by side and label the two columns.
featureScores = pd.concat(
    [dfcolumns, dfscores],
    axis=1,
)
featureScores.columns = ['Columns', 'Score']

In [None]:
print(featureScores.nlargest(20,'Score'))  # print the 20 highest-scoring features

## Feature Importance
You can get the feature importance of each feature of your dataset by using the feature importance property of the model.

Feature importance gives you a score for each feature of your data, the higher the score more important or relevant is the feature towards your output variable.

Feature importance is a built-in attribute (`feature_importances_`) of tree-based classifiers; we will use an Extra Trees classifier to rank the features of the dataset and plot the 20 most important ones.

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
# Fix the seed: Extra Trees draws random split thresholds, so without
# random_state the importances (and their ranking) change on every run.
model = ExtraTreesClassifier(random_state=42)
# y is a one-column DataFrame; pass the 1-D target shape scikit-learn expects.
model.fit(X, y.values.ravel())

In [None]:
print(model.feature_importances_) # feature_importances_ is an attribute of the fitted tree-based classifier

In [None]:
# Horizontal bar chart of the 20 largest feature importances.
# The Series is indexed by feature name so the bars are labelled.
feat_importances = pd.Series(data=model.feature_importances_, index=X.columns)
top_importances = feat_importances.nlargest(20)
top_importances.plot(kind='barh')
plt.show()

## Correlation Matrix with Heatmap

Correlation states how the features are related to each other or the target variable.

Correlation can be positive (increase in one value of feature increases the value of the target variable) or negative (increase in one value of feature decreases the value of the target variable)

Heatmap makes it easy to identify which features are most related to the target variable, we will plot heatmap of correlated features using the seaborn library.

In [None]:
import seaborn as sns
# Pairwise correlations between all columns of the encoded dataset
corrmat = data.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
# Plot the heat map from the matrix computed above. Selecting
# data[top_corr_features] and calling .corr() again would yield the same matrix
# at the cost of a second full computation.
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

In [None]:
# Absolute correlation of every column with the FraudResult target.
data.corr()["FraudResult"].abs()

<h3>Filter Features by Variance</h3>

In [None]:
# Per-column variance of the encoded dataset.
data.var()

In [None]:
# Feature subset used for the "after selection" model: the 10 features with the
# highest chi-squared scores computed in the univariate selection step.
# Replace this with a hand-picked list of column names to try a different subset.
best_features = featureScores.nlargest(10, 'Score')['Columns'].tolist()
# Select the columns by indexing. A DataFrame is not callable, so the previous
# smote_data(columns=...) raised a TypeError.
X = smote_data[best_features]
y = smote_data[['FraudResult']]

In [None]:
# 80/20 split of the reduced feature set into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Model accuracy after feature selection: same logistic regression settings as
# before, now trained on the selected features only.
# y_train is a one-column DataFrame; pass the 1-D target shape scikit-learn expects.
LR = LogisticRegression(C=0.01, solver='lbfgs').fit(X_train, y_train.values.ravel())
LR_predict = LR.predict(X_test)

print('The Logistic Regression F1-score score is {}'.format(f1_score(y_test,LR_predict,average='weighted')))
print('The Logistic Regression Accuracy score is {}'.format(accuracy_score(y_test,LR_predict)))