In [None]:
#### SVM Regression

#### References:
http://cs.adelaide.edu.au/~chhshen/teaching/ML_SVR.pdf
kernelsvm.tripod.com

In [5]:
import numpy as np
from sklearn.svm import SVR
import matplotlib.pyplot as plt

In [127]:
# Fit regression model

'''Create an object that holds SVR model object and the name of the model used.'''
class Model_NAME():
    
    def __init__(self, svr_obj, name):
        self.svr_model = svr_obj
        self.name = name

'''Evaluates three forms of SVR models on X and Y passed as param and returns a list of R^2 defined as:
The coefficient R^2 is defined as (1 - u/v), where u is the regression sum of
squares ((y_true - y_pred) ** 2).sum() and v is the residual
sum of squares ((y_true - y_true.mean()) ** 2).sum(). 
Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). 
A constant model that always predicts the expected value of y, 
disregarding the input features, would get a R^2 score of 0.0.'''

def benchmark_SVR(list_models, x, y, n_samples=100):
    X = x[0:n_samples]
    Y = y[0:n_samples]
    print("X.shape, Y.shape "), X.shape, Y.shape
    scores_list = []
    for model_name in list_models:
        svr_model = model_name.svr_model.fit(X,Y)        
        row = [model_name.name, svr_model.score(X,Y)]
        scores_list.append(row)
    print("Completed benchmark_svr \n")
    return scores_list

In [63]:
#Read CSV
import pandas as pd
import numpy as np
import os
from pandas import DataFrame
import numpy
import sklearn
import scipy as sp

In [122]:
app_file = '../data/big-data-csv.csv'
appdf = pd.read_csv(app_file,sep=',')
appdf.head(5)

Unnamed: 0.1,Unnamed: 0,Category,Score,Description,Price,PublicationDate,AppSize,Name,ContentRating,LastUpdateDate,Instalations,IsTopDeveloper,HaveInAppPurchases,IsFree,Developer
0,0,NEWS_AND_MAGAZINES,5.0,Read the most popular newspapers from Sweden ...,0,2015-07-08T03:00:00.000Z,2.9,Sweden News,Everyone 10+,2015-07-08T03:00:00.000Z,50 - 100,False,False,True,News Now
1,1,MEDIA_AND_VIDEO,2.882353,Sweden Tv channels guide. Tv Sweden include lo...,0,2015-07-25T03:00:00.000Z,2.8,Tv Sweden,Everyone,2015-07-25T03:00:00.000Z,"5,000 - 10,000",False,False,True,QSC
2,2,ENTERTAINMENT,4.133333,Enjoy the best radios of Sweden.,0,2015-07-18T03:00:00.000Z,12.0,Radio Sweden,Everyone,2015-07-18T03:00:00.000Z,"1,000 - 5,000",False,False,True,User One Studio
3,3,FINANCE,3.633564,"In the Mobile bank app, you as a private Norde...",0,2015-07-06T03:00:00.000Z,3.1,Nordea Mobile Bank – Sweden,Everyone,2015-07-06T03:00:00.000Z,"500,000 - 1,000,000",False,False,True,Nordea Bank AB
4,4,MUSIC_AND_AUDIO,3.714286,Listen Sweden radio stations on your mobile.Fo...,0,2014-05-20T03:00:00.000Z,0.72168,RADIO SWEDEN,Unrated,2014-05-20T03:00:00.000Z,"1,000 - 5,000",False,False,True,MoolApps


In [165]:

#Initialize the parameters for SVR models. We iteratively go on finding optimal epsilon and gamma values.
C = 1e3
gamma = 0.1
degree = [2, 3]
epsilon = 0.2
max_iter = 2
#create model objects  of different SVR models.
svr_rbf = SVR(kernel = 'rbf', C=C, epsilon = epsilon, gamma=gamma, max_iter = max_iter)
svr_lin = SVR(kernel = 'linear', C=C, epsilon = epsilon)
svr_poly2 = SVR(kernel = 'poly', C=C, epsilon = epsilon, degree=degree[0])
svr_poly3 = SVR(kernel = 'poly', C=C, epsilon = epsilon, degree=degree[1])
svr_sigmoid = SVR(kernel = 'sigmoid', C=C, epsilon = epsilon)

##Create objects of model_name class using above models.
mnsvr_rbf = Model_NAME(svr_rbf, "rbf")
mnsvr_linear = Model_NAME(svr_lin, "linear")
mnsvr_poly2 = Model_NAME(svr_poly2, "poly-deg-2")
mnsvr_poly3 = Model_NAME(svr_poly3, "poly-deg-3")
mnsvr_sigmoid = Model_NAME(svr_sigmoid, "sigmoid-3")

list_models = [mnsvr_rbf, mnsvr_linear, mnsvr_poly2, mnsvr_poly3, mnsvr_sigmoid]

In [167]:
n_samples, n_features = 1000, 1

##Initialize your regressors and targets here. Your Y must be an a ndarray with shape (n_samples,)

#1. Generate X with all the features available. Generate a target as Y.

#2. Specifically, divide training features into different divisions consisting of 
# different features. As below:

    #a) X contain ALL the features
    #b) X contain HIGHLY UNCORRELATED FEATURES FOR target variable  Y.
    #c) X contain some features without Category.
    #d) X contain some features without Description.
    #e) X contain Category only
    #f) X contain Description only. 

#3. For all of the above X, generate Training Set and Testing Sets of X and Y. Modify benchmark_SVR to return proper record. 
#4. Plot the Graph: Type_of_Data_Set_IN_X vs. R^2_of Different Models

X = appdf.iloc[0:,[2]].values #for example
Y = appdf.iloc[0:,6].values#for example
'''#Mind this is a test data.
    #For Example:
    np.random.seed(0)
    Y = np.random.randn(n_samples)
    X = np.random.randn(n_samples, n_features)

'''

#Pass training set and testing set of X and Y
results_list = benchmark_SVR(list_models, X, Y, n_samples)


X.shape, Y.shape  (1000, 1) (1000,)
Completed benchmark_svr 

	rbf 	-17.6522378941
	linear 	-0.0408400411665
	poly-deg-2 	-0.0400492604531
	poly-deg-3 	-0.0955169928975
	sigmoid-3 	-0.043909441743
['rbf', 'linear', 'poly-deg-2', 'poly-deg-3', 'sigmoid-3']
[-17.652237894109881, -0.040840041166532171, -0.040049260453120628, -0.095516992897484387, -0.043909441742989941]


In [178]:
model_names = []
score_values = []
for row in results_list:
    print("Score using %s  = %.2f" % (row[0],row[1] ))
    model_names.extend([row[0]])
    score_values.extend([row[1]])
print 
print model_names
print
print score_values

#plot model_names vs score_values

Score using rbf  = -17.65
Score using linear  = -0.04
Score using poly-deg-2  = -0.04
Score using poly-deg-3  = -0.10
Score using sigmoid-3  = -0.04

['rbf', 'linear', 'poly-deg-2', 'poly-deg-3', 'sigmoid-3']

[-17.652237894109881, -0.040840041166532171, -0.040049260453120628, -0.095516992897484387, -0.043909441742989941]


In [179]:
def bench_markRidgeLassorRLasso(estimator_list, x_train, y_train, x_test, y_test):
    X_trn, Y_trn = x_train, y_train 
    X_tst, Y_tst = x_test, y_test
    score_list = []
    for estimator in estimator_list:
        estimator.fit(x, y)
        score_trm = estimator.score_
        y_predicted = estimator.predict(X_tst)
        
    
    
lr = LinearRegression(normalize=True)
lr.fit(X, Y)
ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)
 
ridge = Ridge(alpha=7)
ridge.fit(X, Y)
ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)
 
 
lasso = Lasso(alpha=.05)
lasso.fit(X, Y)
ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)
 
 
rlasso = RandomizedLasso(alpha=0.04)
rlasso.fit(X, Y)
ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

Features sorted by their score:
[(1.0, 'RM'), (1.0, 'PTRATIO'), (1.0, 'LSTAT'), (0.63, 'CHAS'), (0.63, 'B'), (0.37, 'CRIM'), (0.345, 'TAX'), (0.215, 'NOX'), (0.215, 'DIS'), (0.13, 'INDUS'), (0.045, 'ZN'), (0.02, 'RAD'), (0.01, 'AGE')]
