In [19]:
from sklearn.model_selection import train_test_split,GridSearchCV
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import pickle
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline

In [20]:
#Loads the processed dataset
data=pd.read_csv("Final.csv")

In [21]:
#Separates the independent variables and dependent variable into x and y
columns = [x for x in data.columns if x not in ['Facebook_scaled','LinkedIn_scaled','GooglePlus_scaled']]
x = data[columns]
y = data[['Facebook_scaled']]

#Splits the dataset into 70 percent training set and 30 percent testing set
# keeping the random_state value same makes the function to split same rows into training and testing every time
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=30)

In [22]:
#Creates a pipeline lasso model for variable selection
pipeline = Pipeline([('model',Lasso())])

In [23]:
from datetime import timedelta
import time
#Time returns the elapsed time in seconds. This function converts that into hours, minutes and seconds
def convert_to_preferred_format(sec):
   sec = sec % (24 * 3600)
   hour = sec // 3600
   sec %= 3600
   min = sec // 60
   sec %= 60
   return "%02d:%02d:%02d" % (hour, min, sec) 
start = time.time()
#Run a grid search on the lasso model to find the best attributes
sea = GridSearchCV(pipeline,
                      {'model__alpha':np.arange(0.1,10,0.1)},
                      cv = 5, scoring="neg_mean_squared_error",verbose=3
                      )
sea.fit(x_train,y_train)
sea.best_params_
coefficients = sea.best_estimator_.named_steps['model'].coef_
imp = np.abs(coefficients)
stop = (time.time() - start)
print('Elapsed Time:', convert_to_preferred_format(stop))
print('======'*5)

Fitting 5 folds for each of 99 candidates, totalling 495 fits
[CV 1/5] END .................model__alpha=0.1;, score=-0.727 total time=   1.1s
[CV 2/5] END .................model__alpha=0.1;, score=-0.679 total time=   1.0s
[CV 3/5] END .................model__alpha=0.1;, score=-0.742 total time=   1.3s
[CV 4/5] END .................model__alpha=0.1;, score=-0.827 total time=   1.1s
[CV 5/5] END .................model__alpha=0.1;, score=-0.708 total time=   1.0s
[CV 1/5] END .................model__alpha=0.2;, score=-0.775 total time=   1.0s
[CV 2/5] END .................model__alpha=0.2;, score=-0.719 total time=   1.1s
[CV 3/5] END .................model__alpha=0.2;, score=-0.793 total time=   1.0s
[CV 4/5] END .................model__alpha=0.2;, score=-0.908 total time=   1.0s
[CV 5/5] END .................model__alpha=0.2;, score=-0.752 total time=   1.0s
[CV 1/5] END .model__alpha=0.30000000000000004;, score=-0.775 total time=   1.0s
[CV 2/5] END .model__alpha=0.30000000000000004;

In [24]:
#Attributes with 0 importance does not influence the output prediction
features=x.loc[:,importance>0]

In [25]:
#Save the lasso variable selection attributes
features.to_csv('variable_selection.csv',index=False)
y.to_csv('variable_labels.csv',index=False)