In [1]:
import numpy as np
import pandas as pd
import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.preprocessing import MinMaxScaler
import warnings
# NOTE(review): this silences every warning for the rest of the notebook, including any
# deprecation warnings raised by the libraries used below — consider a narrower filter.
warnings.filterwarnings("ignore")
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Switch plotly to offline notebook rendering.
# NOTE(review): no plotly figure is drawn in the visible cells.
init_notebook_mode(connected=True)

In [2]:
# Read the task CSV and keep every column except the first one
# (presumably a saved row index — verify against the file).
data = pd.read_csv('task_b.csv').iloc[:, 1:]

In [3]:
# Preview the first rows after the first CSV column has been dropped
data.head()

Unnamed: 0,f1,f2,f3,y
0,-195.871045,-14843.084171,5.53214,1.0
1,-1217.183964,-4068.124621,4.416082,1.0
2,9.138451,4413.412028,0.425317,0.0
3,363.824242,15474.760647,1.094119,0.0
4,-768.812047,-7963.932192,1.870536,0.0


In [4]:
# Correlation of every column with the class label y
data.corr()['y']

f1    0.067172
f2   -0.017944
f3    0.839060
y     1.000000
Name: y, dtype: float64

In [5]:
# Per-column standard deviation, to compare the scales of the features
data.std()

f1      488.195035
f2    10403.417325
f3        2.926662
y         0.501255
dtype: float64

In [6]:
# Separate the three feature columns (X) from the class label (Y) as NumPy arrays
X, Y = data[['f1', 'f2', 'f3']].values, data['y'].values
# One shape per line: features first, labels second
print(X.shape, Y.shape, sep='\n')

(200, 3)
(200,)


# What if our features have different variances?

<pre>
* <b>As part of this task you will observe how linear models behave when the features have very different variances</b>
* <b>From the output of the above cells you can observe that var(f2) >> var(f1) >> var(f3)</b>

> <b>Task1</b>:
    1. Apply Logistic regression(SGDClassifier with logloss) on 'data' and check the feature importance
    2. Apply SVM(SGDClassifier with hinge) on 'data' and check the feature importance

> <b>Task2</b>:
    1. Apply Logistic regression(SGDClassifier with logloss) on 'data' after standardization 
       i.e standardization(data, column wise): (column-mean(column))/std(column) and check the feature importance
    2. Apply SVM(SGDClassifier with hinge) on 'data' after standardization 
       i.e standardization(data, column wise): (column-mean(column))/std(column) and check the feature importance

</pre>

## Task 1

In [7]:
# Rebuild X and Y from `data` so Task 1 works on the raw, unscaled features
Y = data['y'].values
X = data[['f1', 'f2', 'f3']].values
# Print the feature-matrix shape, then the label-vector shape
print(X.shape, Y.shape, sep='\n')

(200, 3)
(200,)


### LR

In [8]:
# Logistic regression trained with SGD (log loss); the seed is fixed for reproducibility.
# NOTE(review): loss='log' was renamed to 'log_loss' in scikit-learn 1.1 and removed in 1.3 —
# switch the string when running on a newer scikit-learn.
clf = SGDClassifier(loss = 'log',random_state = 13)

In [9]:
# Fit on the raw (unscaled) feature matrix X and labels Y
clf.fit(X,Y)

SGDClassifier(loss='log', random_state=13)

In [10]:
# Learned feature weights of the fitted model (2-D; row 0 is read below)
weights = clf.coef_

In [11]:
# Pair each feature name with its learned weight, taken from row 0 of the coefficient matrix
coef_dict = dict(zip(['f1', 'f2', 'f3'], weights[0, :]))

In [12]:
# Display the feature -> weight mapping for logistic regression on the raw data
coef_dict

{'f1': 9800.352249355115, 'f2': 34130.59079875135, 'f3': 9490.039168321906}

### SVM

In [13]:
# Linear SVM trained with SGD (hinge loss); hinge is also the estimator's default,
# which is why the repr printed after fitting omits it.
clf = SGDClassifier(loss = 'hinge',random_state = 13)

In [14]:
# Fit the hinge-loss model on the raw (unscaled) feature matrix X and labels Y
clf.fit(X,Y)

SGDClassifier(random_state=13)

In [15]:
# Learned feature weights of the fitted SVM (2-D; row 0 is read below)
weights = clf.coef_

In [16]:
# Map every feature name to the weight the SVM learned for it (row 0 of the coefficient matrix)
coef_dict = {name: weight for name, weight in zip(['f1', 'f2', 'f3'], weights[0, :])}

In [17]:
# Display the feature -> weight mapping for the SVM on the raw data
coef_dict

{'f1': 12482.987035789718, 'f2': 34021.3775278003, 'f3': 9595.719869314591}

### Observations -

1. From the feature weights above, we can observe that features with a higher standard deviation tend to receive larger weights.
2. The feature most strongly correlated with the class label (f3) receives the smallest weight, which is undesirable because f3 has a strong linear relationship with y.
3. Both LR and SVM show the same ordering of weights: f2 > f1 > f3.

## Task 2

In [18]:
# Standardize every feature column: (column - mean(column)) / std(column).
# This overwrites the raw X used in Task 1; Y is reused unchanged.
scaler = StandardScaler()
X = scaler.fit_transform(data[['f1', 'f2', 'f3']])
X.shape

(200, 3)

In [30]:
# Wrap the standardized features in a frame and attach the original labels, for inspection only.
# NOTE(review): the execution count shows this cell was run after the Task 2 models below;
# those models read X and Y directly, so they do not depend on df.
df = pd.DataFrame(X, columns=['f1', 'f2', 'f3']).assign(y=data['y'])

In [32]:
# Preview the standardized features alongside the unchanged labels
df.head()

Unnamed: 0,f1,f2,f3,y
0,-0.423126,-1.555602,0.181651,1.0
1,-2.520394,-0.51729,-0.200648,1.0
2,-0.002139,0.30002,-1.567659,0.0
3,0.726209,1.36593,-1.338565,0.0
4,-1.599662,-0.892703,-1.072608,0.0


In [31]:
# Correlation with y is unaffected by standardization (same values as for the raw data)
df.corr()['y']

f1    0.067172
f2   -0.017944
f3    0.839060
y     1.000000
Name: y, dtype: float64

In [33]:
# pandas reports the sample standard deviation (ddof=1) while StandardScaler divides by the
# population standard deviation, so with 200 rows the features show sqrt(200/199) ~= 1.0025, not 1.
df.std()

f1    1.002509
f2    1.002509
f3    1.002509
y     0.501255
dtype: float64

### LR

In [19]:
# Logistic regression trained with SGD (log loss); same seed as in Task 1.
# NOTE(review): loss='log' was renamed to 'log_loss' in scikit-learn 1.1 and removed in 1.3 —
# switch the string when running on a newer scikit-learn.
clf = SGDClassifier(loss = 'log',random_state = 13)

In [20]:
# Fit on the standardized X; Y still holds the labels extracted in Task 1
clf.fit(X,Y)

SGDClassifier(loss='log', random_state=13)

In [21]:
# Learned feature weights of the model fitted on standardized data (2-D; row 0 is read below)
weights = clf.coef_

In [22]:
# Pair each feature name with its learned weight, taken from row 0 of the coefficient matrix
coef_dict = dict(zip(['f1', 'f2', 'f3'], weights[0, :]))

In [23]:
# Display the feature -> weight mapping for logistic regression on standardized data
coef_dict

{'f1': 1.0436349636193234, 'f2': 0.3949878051175316, 'f3': 11.887971294111992}

### SVM

In [24]:
# Linear SVM trained with SGD (hinge loss); hinge is also the estimator's default,
# which is why the repr printed after fitting omits it.
clf = SGDClassifier(loss = 'hinge',random_state = 13)

In [25]:
# Fit the hinge-loss model on the standardized X; Y still holds the labels extracted in Task 1
clf.fit(X,Y)

SGDClassifier(random_state=13)

In [26]:
# Learned feature weights of the SVM fitted on standardized data (2-D; row 0 is read below)
weights = clf.coef_

In [27]:
# Map every feature name to the weight the SVM learned for it (row 0 of the coefficient matrix)
coef_dict = {name: weight for name, weight in zip(['f1', 'f2', 'f3'], weights[0, :])}

In [28]:
# Display the feature -> weight mapping for the SVM on standardized data
coef_dict

{'f1': 2.7334374733067524, 'f2': 2.4596051013617726, 'f3': 15.269641130615158}

### Observations -

1. If one feature has much larger values than the others, it dominates the model's weighted sum (and, for distance-based methods, the distance computation), and therefore the SGD updates. Standardization puts all features on the same scale so that each has comparable influence.
2. By applying standardization, we have eliminated the effect of the standard deviation by making it equal for all features.
3. As a result, feature f3, which has the highest correlation with y, now receives the largest weight. This was not the case in Task 1.
4. Both LR and SVM show the same ordering of weights: f3 > f1 > f2.