In [1]:
# f_regression: Used only for numeric targets; scores features by linear regression performance.

# f_classif: Used only for categorical targets; based on the Analysis of Variance (ANOVA) statistical test.

# chi2: Computes the chi-square statistic for categorical targets; it is less sensitive to nonlinear relationships between the predictive variable and its target.


In [2]:
import statsmodels.api as sm
from sklearn import datasets
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the Boston housing bunch and wrap it in pandas objects:
# X holds one named column per feature, y is a single-column frame "MEDV".
# NOTE(review): load_boston was removed in scikit-learn 1.2 -- this cell
# needs an older release to run.
data = datasets.load_boston()
feature_matrix, target_vector = data.data, data.target
X = pd.DataFrame(data=feature_matrix, columns=data.feature_names)
y = pd.DataFrame({"MEDV": target_vector})

In [4]:
import matplotlib.pyplot as plt
from sklearn import datasets, svm
from sklearn.feature_selection import SelectPercentile, f_classif,f_regression
# #############################################################################
# Univariate feature selection with the F-test for feature scoring.
# Keep the top 25% of features ranked by f_regression
# (for a classification target use SelectPercentile(f_classif, percentile=25)).
X_indices = np.arange(X.shape[-1])  # one bar position per feature column
selector = SelectPercentile(f_regression, percentile=25)
# The scorer expects a 1-D target, so flatten the single-column frame.
selector.fit(X, y.values.ravel())

# Rescale -log10(p) so the most significant feature scores exactly 1.0.
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
plt.bar(X_indices - .45, scores, width=.2,
        label=r'Univariate score ($-Log(p_{value})$)', color='darkorange',
        edgecolor='black')
# Name each bar and show the legend so the figure can be read on its own.
plt.xticks(X_indices - .45, X.columns, rotation=90)
plt.legend()
plt.show()

# p-values indexed by feature name (the original call passed the p-values
# as the index and the names as the data).
print(pd.Series(selector.pvalues_, index=X.columns, name='p_value'))
print(scores)

# Raw F statistics per feature; '\t' separates the score from the label.
for n, s in zip(X.columns, selector.scores_):
    print('F-score: %3.2f\t for feature %s ' % (s, n))

In [5]:
#greedy search RFECV

#Feature ranking with recursive feature elimination and cross-validated selection of the best number of features.


In [6]:
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
# Recursive feature elimination around a linear SVR, choosing the number of
# features by 10-fold cross-validation.
estimator = SVR(kernel="linear")
# 'neg_mean_squared_error' is the valid scorer name: scorers follow a
# "greater is better" convention, so the MSE is reported negated.
selector = RFECV(estimator, cv=10, scoring='neg_mean_squared_error')
# Pass a 1-D target; the single-column frame is flattened first.
selector.fit(X, y.values.ravel())
print('Optimal number of features: %d' % selector.n_features_)
print(X.columns[selector.support_])

In [7]:
# Cross-validation score obtained for every candidate feature-subset size.
# NOTE(review): grid_scores_ was removed in scikit-learn 1.2; newer releases
# expose the same values via cv_results_['mean_test_score'].
cv_scores = selector.grid_scores_
subset_sizes = range(1, len(cv_scores) + 1)

plt.figure()
plt.plot(subset_sizes, cv_scores)
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score MSE")
plt.show()

In [8]:
#Feature selection using SelectFromModel and LassoCV

In [9]:
from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

In [10]:
# Display the dimensions of the feature frame as (rows, columns).
X.shape

In [11]:
# We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
clf = LassoCV()

# Start from a minimum importance threshold of 0.25.
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y.values.ravel())
# Transform once up front so X_transform is defined even when the initial
# threshold already leaves four or fewer features and the loop never runs.
X_transform = sfm.transform(X)
n_features = X_transform.shape[1]
print(n_features)
print(sfm.threshold)

# Raise the threshold in 0.1 steps until at most four features remain.
# The attribute can be set directly instead of repeatedly refitting the
# meta-transformer.
while n_features > 4:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

# Plot the selected features from X as (x, y) pairs.
plt.title(
    "Features selected from Boston using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)
# A threshold step can drop several features at once, so only draw a pair
# when both of its columns survived the selection.
for (col_x, col_y), marker in (((0, 1), 'r.'), ((2, 3), 'y*')):
    if col_y < n_features:
        plt.plot(X_transform[:, col_x], X_transform[:, col_y], marker)
plt.xlabel("Feature number 1,3")
plt.ylabel("Feature number 2,4")
plt.show()

In [12]:
# Display the dimensions of X again; no cell above reassigns X.
X.shape

In [13]:
'''Assessing feature importance via random decision forests

    Let's train a forest of 10,000 trees on the Boston housing dataset.'''

In [14]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier # both option are here

In [15]:
# Display the full list of feature (column) names in X.
X.columns[:]

In [16]:
# Feature names, kept for labelling the importance ranking.
lables = X.columns
# 10,000 trees are expensive, so build them on all available cores; the fixed
# random_state keeps the fitted forest reproducible.
forest = RandomForestRegressor(n_estimators=10000, random_state=0, n_jobs=-1)
# Pass a 1-D target; the single-column frame is flattened first.
forest.fit(X, y.values.ravel())

In [17]:
# Importance of every feature, and the feature positions sorted from most to
# least important.
imp_feat = forest.feature_importances_
indicate = np.argsort(imp_feat)[::-1]

# Look up BOTH the label and the importance through the sorted position, so
# each printed name belongs to the value beside it.
for rank, idx in enumerate(indicate, start=1):
    print('%2d) %-*s %f' % (rank, 30, lables[idx], imp_feat[idx]))

In [18]:
#PCA

In [19]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,LogisticRegression

In [20]:
# Standardize the features first: PCA is driven by variance, so unscaled
# columns with large numeric ranges would dominate the components.
sc = StandardScaler()
X_STD = sc.fit_transform(X)

# Project the standardized data onto its first two principal components.
# fit_transform fits and projects in one call, so no separate fit is needed.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X_STD)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'])

# Share of the total variance captured by each of the two components.
print(pca.explained_variance_ratio_)
# pca.singular_values_ is also available on scikit-learn >= 0.19.

In [21]:
# Preview the first rows of the two-component projection.
principalDf.head()

In [22]:
# Place the target column next to the principal components (column-wise concat).
finalDf = pd.concat([principalDf, y], axis=1)

In [23]:
# Preview the first rows of the combined components-plus-target frame.
finalDf.head()

In [24]:
# The cell above fixed the number of components at two. To choose that number
# from the data instead, fit PCA with all components and plot the cumulative
# explained variance against the number of components kept.
pca = PCA()
# Standardize so no single wide-ranged feature dominates the variance.
pca.fit(StandardScaler().fit_transform(X))
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Component counts start at 1, so the x-axis must not start at index 0.
component_counts = np.arange(1, len(cumulative_variance) + 1)
plt.plot(component_counts, cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

In [25]:
from sklearn.model_selection import train_test_split

# Hold out one seventh of the rows as the test set; the fixed seed keeps the
# split reproducible between runs.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=1 / 7.0,
    random_state=1
)

In [26]:
# Reduce the training features to two principal components ...
pca = PCA(n_components=2)
X_pca_train = pca.fit_transform(train_x)

# ... and fit an ordinary least-squares regression on that projection.
lr = LinearRegression()
model = lr.fit(X_pca_train, train_y)

In [27]:
# Project the test rows with the PCA already fitted on the training set.
# Refitting on the test data would produce different component axes than the
# ones the regression was trained on, and would leak test information.
X_pca_test = pca.transform(test_x)
predicated_test = model.predict(X_pca_test)

In [28]:
# Draw the test-set predictions as a thin blue line, in test-row order.
plt.plot(predicated_test, c='blue', lw=1)
plt.show()