### Label Encoder

In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [2]:
data =pd.read_csv("iris.csv",index_col=0)

In [3]:
data.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [4]:
#Changes does not effect the "data" dataframe
data1=data.copy()

In [5]:
labelencoder = LabelEncoder()
data1.iloc[:, -1] = labelencoder.fit_transform(data1.iloc[:,-1])

In [6]:
data1

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,0
2,4.9,3.0,1.4,0.2,0
3,4.7,3.2,1.3,0.2,0
4,4.6,3.1,1.5,0.2,0
5,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,2
147,6.3,2.5,5.0,1.9,2
148,6.5,3.0,5.2,2.0,2
149,6.2,3.4,5.4,2.3,2


### One Hot Encoder

#### Using sklearn

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
data2=pd.read_csv("iris.csv",index_col=0)

In [9]:
data2

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,virginica
147,6.3,2.5,5.0,1.9,virginica
148,6.5,3.0,5.2,2.0,virginica
149,6.2,3.4,5.4,2.3,virginica


In [10]:
# creating instance of one-hot-encoder
enc = OneHotEncoder(handle_unknown='ignore')

In [11]:
# passing bridge-types-cat column (label encoded values of bridge_types)
enc_df = pd.DataFrame(enc.fit_transform(data2[['Species']]).toarray())


In [12]:
enc_df

Unnamed: 0,0,1,2
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
...,...,...,...
145,0.0,0.0,1.0
146,0.0,0.0,1.0
147,0.0,0.0,1.0
148,0.0,0.0,1.0


In [13]:
# merge with main df 
data_final = data2.iloc[:,0:4].join(enc_df)
data_final

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,0,1,2
1,5.1,3.5,1.4,0.2,1.0,0.0,0.0
2,4.9,3.0,1.4,0.2,1.0,0.0,0.0
3,4.7,3.2,1.3,0.2,1.0,0.0,0.0
4,4.6,3.1,1.5,0.2,1.0,0.0,0.0
5,5.0,3.6,1.4,0.2,1.0,0.0,0.0
...,...,...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,0.0,0.0,1.0
147,6.3,2.5,5.0,1.9,0.0,0.0,1.0
148,6.5,3.0,5.2,2.0,0.0,0.0,1.0
149,6.2,3.4,5.4,2.3,0.0,0.0,1.0


#### Using Pandas

In [14]:
import pandas as pd

In [15]:
data3 =pd.read_csv("iris.csv",index_col=0)

In [16]:
data_encoded=pd.get_dummies(data3)

In [17]:
data_encoded

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
1,5.1,3.5,1.4,0.2,1,0,0
2,4.9,3.0,1.4,0.2,1,0,0
3,4.7,3.2,1.3,0.2,1,0,0
4,4.6,3.1,1.5,0.2,1,0,0
5,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,0,0,1
147,6.3,2.5,5.0,1.9,0,0,1
148,6.5,3.0,5.2,2.0,0,0,1
149,6.2,3.4,5.4,2.3,0,0,1


#### IsolationForest

In [18]:
from sklearn.ensemble import IsolationForest

In [19]:
data =pd.read_csv("iris.csv",index_col=0)
data_encoded=pd.get_dummies(data)

In [20]:
# training the model
clf = IsolationForest(random_state=10,contamination=.01)
clf.fit(data_encoded)



IsolationForest(contamination=0.01, random_state=10)

In [21]:
# predictions
y_pred_outliers = clf.predict(data_encoded)

In [22]:
#-1 for outliers and 1 for inliers.
y_pred_outliers

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [23]:
data_encoded

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
1,5.1,3.5,1.4,0.2,1,0,0
2,4.9,3.0,1.4,0.2,1,0,0
3,4.7,3.2,1.3,0.2,1,0,0
4,4.6,3.1,1.5,0.2,1,0,0
5,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,0,0,1
147,6.3,2.5,5.0,1.9,0,0,1
148,6.5,3.0,5.2,2.0,0,0,1
149,6.2,3.4,5.4,2.3,0,0,1


In [24]:
## Let us add a new data point which is outlier
data_encoded.loc[150]=[20,40,30,50,1,0,0]

In [25]:
data_encoded

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
1,5.1,3.5,1.4,0.2,1,0,0
2,4.9,3.0,1.4,0.2,1,0,0
3,4.7,3.2,1.3,0.2,1,0,0
4,4.6,3.1,1.5,0.2,1,0,0
5,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,0,0,1
147,6.3,2.5,5.0,1.9,0,0,1
148,6.5,3.0,5.2,2.0,0,0,1
149,6.2,3.4,5.4,2.3,0,0,1


In [26]:
# training the model
clf = IsolationForest(random_state=10,contamination=.01)
clf.fit(data_encoded)
# predictions
y_pred_outliers = clf.predict(data_encoded)
y_pred_outliers



array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1])

In [27]:
data_encoded['scores']=clf.decision_function(data_encoded)

In [28]:
data_encoded['anomaly']=clf.predict(data_encoded.iloc[:,0:7])

In [29]:
data_encoded

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica,scores,anomaly
1,5.1,3.5,1.4,0.2,1,0,0,0.202073,1
2,4.9,3.0,1.4,0.2,1,0,0,0.185411,1
3,4.7,3.2,1.3,0.2,1,0,0,0.174012,1
4,4.6,3.1,1.5,0.2,1,0,0,0.182873,1
5,5.0,3.6,1.4,0.2,1,0,0,0.199432,1
...,...,...,...,...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,0,0,1,0.164313,1
147,6.3,2.5,5.0,1.9,0,0,1,0.133436,1
148,6.5,3.0,5.2,2.0,0,0,1,0.179224,1
149,6.2,3.4,5.4,2.3,0,0,1,0.125229,1


In [30]:
#Print the outlier data points
data_encoded[data_encoded['anomaly']==-1]

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica,scores,anomaly
107,4.9,2.5,4.5,1.7,0,0,1,-0.009111,-1
150,20.0,40.0,30.0,50.0,1,0,0,-0.287374,-1


### PPS score

In [31]:
#install the package
!pip install ppscore

Collecting ppscore
  Downloading ppscore-1.3.0.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: ppscore
  Building wheel for ppscore (setup.py): started
  Building wheel for ppscore (setup.py): finished with status 'done'
  Created wheel for ppscore: filename=ppscore-1.3.0-py2.py3-none-any.whl size=13167 sha256=a1ea9a649b5c089866968d9822726e3cf13e8e28bbfe9edb89014c065a75f8a5
  Stored in directory: c:\users\dell\appdata\local\pip\cache\wheels\d8\2d\fc\c1699298a1241684a460b125835f7871ee8e3ab3afea9b5d6f
Successfully built ppscore
Installing collected packages: ppscore
Successfully installed ppscore-1.3.0


In [32]:
import ppscore as pps

In [35]:
#pps.score(df, "feature_column", "target_column")  syntax
pps.score(data, "Sepal.Length", "Species")

{'x': 'Sepal.Length',
 'y': 'Species',
 'ppscore': 0.47164941668027915,
 'case': 'classification',
 'is_valid_score': True,
 'metric': 'weighted F1',
 'baseline_score': 0.35333333333333333,
 'model_score': 0.6583332894532472,
 'model': DecisionTreeClassifier()}

In [36]:
#calculate the whole PPS matrix
pps.matrix(data)

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,Sepal.Length,Sepal.Length,1.0,predict_itself,True,,0.0,1.0,
1,Sepal.Length,Sepal.Width,0.0,regression,True,mean absolute error,0.330667,0.364704,DecisionTreeRegressor()
2,Sepal.Length,Petal.Length,0.550423,regression,True,mean absolute error,1.488667,0.669271,DecisionTreeRegressor()
3,Sepal.Length,Petal.Width,0.431739,regression,True,mean absolute error,0.644667,0.366339,DecisionTreeRegressor()
4,Sepal.Length,Species,0.471649,classification,True,weighted F1,0.353333,0.658333,DecisionTreeClassifier()
5,Sepal.Width,Sepal.Length,0.006966,regression,True,mean absolute error,0.684667,0.679897,DecisionTreeRegressor()
6,Sepal.Width,Sepal.Width,1.0,predict_itself,True,,0.0,1.0,
7,Sepal.Width,Petal.Length,0.172375,regression,True,mean absolute error,1.488667,1.232058,DecisionTreeRegressor()
8,Sepal.Width,Petal.Width,0.132858,regression,True,mean absolute error,0.644667,0.559017,DecisionTreeRegressor()
9,Sepal.Width,Species,0.156915,classification,True,weighted F1,0.353333,0.454805,DecisionTreeClassifier()
