# Feature Selection Notebook

This notebook contains information on feature selection apart from the EDA. We want to use Lasso regression to find these best features to input into our models. 

In [21]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso

In [9]:
df=pd.read_csv('clean_salary_data.csv')
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Easy Apply,simplified_title,seniority,salary_range,min_salary,max_salary,avg_salary,company_age,company_name,state,city,headquarters_state,same_location,size_range,min_size,max_size,avg_size,python,sql,excel,R,deep_learning,PhD,bachelor,masters,power_bi,tableau,prob_solver,critical_thinker
0,"Data Analyst, Center on Immigration and Justic...",$37K-$66K (Glassdoor est.),Are you eager to roll up your sleeves and harn...,3.2,Vera Institute of Justice\n3.2,"New York, NY","New York, NY",201 to 500 employees,1961,Nonprofit Organization,Social Assistance,Non-Profit,$100 to $500 million (USD),-1,True,data analyst,na,$37-$66,37,66,51.5,60,Vera Institute of Justice,NY,New York,NY,1,201 to 500,201,500,350.5,1,1,0,1,0,0,1,0,0,0,0,0
1,Quality Data Analyst,$37K-$66K (Glassdoor est.),Overview\n\nProvides analytical and technical ...,3.8,Visiting Nurse Service of New York\n3.8,"New York, NY","New York, NY",10000+ employees,1893,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1,-1,data analyst,na,$37-$66,37,66,51.5,128,Visiting Nurse Service of New York,NY,New York,NY,1,10000+,10000,30000,20000.0,0,1,1,1,0,0,1,1,0,0,0,0
2,"Senior Data Analyst, Insights & Analytics Team...",$37K-$66K (Glassdoor est.),We’re looking for a Senior Data Analyst who ha...,3.4,Squarespace\n3.4,"New York, NY","New York, NY",1001 to 5000 employees,2003,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,GoDaddy,-1,data analyst,senior,$37-$66,37,66,51.5,18,Squarespace,NY,New York,NY,1,1001 to 5000,1001,5000,3000.5,1,1,1,1,0,1,1,0,0,1,0,0
3,Data Analyst,$37K-$66K (Glassdoor est.),Requisition NumberRR-0001939\nRemote:Yes\nWe c...,4.1,Celerity\n4.1,"New York, NY","McLean, VA",201 to 500 employees,2002,Subsidiary or Business Segment,IT Services,Information Technology,$50 to $100 million (USD),-1,-1,data analyst,na,$37-$66,37,66,51.5,19,Celerity,NY,New York,VA,0,201 to 500,201,500,350.5,0,1,0,1,0,0,1,0,0,1,0,0
4,Reporting Data Analyst,$37K-$66K (Glassdoor est.),ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,3.9,FanDuel\n3.9,"New York, NY","New York, NY",501 to 1000 employees,2009,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",$100 to $500 million (USD),DraftKings,True,data analyst,na,$37-$66,37,66,51.5,12,FanDuel,NY,New York,NY,1,501 to 1000,501,1000,750.5,1,1,1,1,0,0,1,0,0,0,0,0


In [15]:
#potential features to analyze based on information from the EDA Notebook 
features=['Rating', 'Type of ownership', 'Sector', 'Revenue', 'simplified_title', 'seniority', 'company_age',
       'state', 'same_location', 'python', 'sql', 'excel',
       'deep_learning', 'PhD', 'bachelor', 'masters', 'power_bi', 'tableau', 'avg_salary']
#create the data frame needed for the modeling 
df_inter=df[features]
df_inter.head()

Unnamed: 0,Rating,Type of ownership,Sector,Revenue,simplified_title,seniority,company_age,state,same_location,python,sql,excel,deep_learning,PhD,bachelor,masters,power_bi,tableau,avg_salary
0,3.2,Nonprofit Organization,Non-Profit,$100 to $500 million (USD),data analyst,na,60,NY,1,1,1,0,0,0,1,0,0,0,51.5
1,3.8,Nonprofit Organization,Health Care,$2 to $5 billion (USD),data analyst,na,128,NY,1,0,1,1,0,0,1,1,0,0,51.5
2,3.4,Company - Private,Information Technology,Unknown / Non-Applicable,data analyst,senior,18,NY,1,1,1,1,0,1,1,0,0,1,51.5
3,4.1,Subsidiary or Business Segment,Information Technology,$50 to $100 million (USD),data analyst,na,19,NY,0,0,1,0,0,0,1,0,0,1,51.5
4,3.9,Company - Private,"Arts, Entertainment & Recreation",$100 to $500 million (USD),data analyst,na,12,NY,1,1,1,1,0,0,1,0,0,0,51.5


In [16]:
#one hot endode the variables 
df_dummies=pd.get_dummies(df_inter)
df_dummies.head()
#very sparse matrix to get what we need, maybe an ensemble method would work here. 

Unnamed: 0,Rating,company_age,same_location,python,sql,excel,deep_learning,PhD,bachelor,masters,power_bi,tableau,avg_salary,Type of ownership_-1,Type of ownership_College / University,Type of ownership_Company - Private,Type of ownership_Company - Public,Type of ownership_Contract,Type of ownership_Franchise,Type of ownership_Government,Type of ownership_Hospital,Type of ownership_Nonprofit Organization,Type of ownership_Other Organization,Type of ownership_Private Practice / Firm,Type of ownership_School / School District,Type of ownership_Self-employed,Type of ownership_Subsidiary or Business Segment,Type of ownership_Unknown,Sector_-1,Sector_Accounting & Legal,Sector_Aerospace & Defense,"Sector_Arts, Entertainment & Recreation",Sector_Biotech & Pharmaceuticals,Sector_Business Services,"Sector_Construction, Repair & Maintenance",Sector_Consumer Services,Sector_Education,Sector_Finance,Sector_Government,Sector_Health Care,Sector_Information Technology,Sector_Insurance,Sector_Manufacturing,Sector_Media,Sector_Mining & Metals,Sector_Non-Profit,"Sector_Oil, Gas, Energy & Utilities",Sector_Real Estate,"Sector_Restaurants, Bars & Food Services",Sector_Retail,Sector_Telecommunications,Sector_Transportation & Logistics,Sector_Travel & Tourism,Revenue_$1 to $2 billion (USD),Revenue_$1 to $5 million (USD),Revenue_$10 to $25 million (USD),Revenue_$10+ billion (USD),Revenue_$100 to $500 million (USD),Revenue_$2 to $5 billion (USD),Revenue_$25 to $50 million (USD),Revenue_$5 to $10 billion (USD),Revenue_$5 to $10 million (USD),Revenue_$50 to $100 million (USD),Revenue_$500 million to $1 billion (USD),Revenue_-1,Revenue_Less than $1 million (USD),Revenue_Unknown / Non-Applicable,simplified_title_business analyst,simplified_title_data analyst,simplified_title_data engineer,simplified_title_data management,simplified_title_data scientist,simplified_title_data security analyst,simplified_title_data warehouse enginner,simplified_title_other,simplified_title_risk analyst,seniority_jr,seniority_na,seniority_senior,state_ AZ,state_ CA,state_ CO,state_ DE,state_ FL,state_ GA,state_ IL,state_ IN,state_ KS,state_ NC,state_ NJ,state_ NY,state_ OH,state_ PA,state_ SC,state_ TX,state_ UT,state_ VA,state_ WA
0,3.2,60,1,1,1,0,0,0,1,0,0,0,51.5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,3.8,128,1,0,1,1,0,0,1,1,0,0,51.5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,3.4,18,1,1,1,1,0,1,1,0,0,1,51.5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,4.1,19,0,0,1,0,0,0,1,0,0,1,51.5,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,3.9,12,1,1,1,1,0,0,1,0,0,0,51.5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [19]:
#Create the feature matrix, target variable, and train test splits 
X=df_dummies.drop('avg_salary', axis=1)
y=df_dummies.avg_salary

X_train, X_val, y_train, y_val=train_test_split(X, y, test_size=0.30, random_state=42)

### Use a Lasso Regression to find feature importance

In [22]:
#create a pipeline to scale the data beforehand
pipeline = Pipeline([
                     ('scaler',StandardScaler()),
                     ('model',Lasso())
])

In [27]:
#lets find the optimal alpha for the lasso model. We will test values of alpha from .1 to 10 in .1 increments. 
#Moreover, we will use 5 fold cross validation and select the value of alpha that minimizes the MSE 
grid_search = GridSearchCV(pipeline,
                      {'model__alpha':np.arange(0.1,10,0.1)},
                      cv = 5, scoring="neg_mean_squared_error"
                      )

In [28]:
#train the grid search 
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', Lasso())]),
             param_grid={'model__alpha': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,
       2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9,
       4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1, 5.2,
       5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4, 6.5,
       6.6, 6.7, 6.8, 6.9, 7. , 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8,
       7.9, 8. , 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9. , 9.1,
       9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9])},
             scoring='neg_mean_squared_error')

In [30]:
#Find the best alpha value 
print("The best alpha value is ", grid_search.best_params_)

The best alpha value is  {'model__alpha': 0.6}


In [36]:
#We can now gather the coefficients
coefficients = grid_search.best_estimator_.named_steps['model'].coef_
np.array(coefficients)

array([ 0.00000000e+00, -0.00000000e+00, -0.00000000e+00,  3.21712668e-01,
       -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -0.00000000e+00, -1.09775876e-01,  0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00,  2.11778077e-01, -2.62807368e-01,
       -0.00000000e+00, -1.21056931e-01,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  1.55098389e-01,  3.79817488e-01,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00, -2.20682360e-01,
       -5.78213155e-01, -0.00000000e+00,  2.90871417e-02,  0.00000000e+00,
       -2.94258665e-01, -0.00000000e+00, -6.04364497e-01,  0.00000000e+00,
       -3.18804284e-01, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
       -1.83246603e-01,  1.81312890e-01, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  

In [40]:
#We can see the features that were the most important to determining the 
#average salary. Let's take this into consideration. 
list(np.array(X_train.columns)[np.array(coefficients) > 0])

['python',
 'Type of ownership_Private Practice / Firm',
 'Sector_Arts, Entertainment & Recreation',
 'Sector_Biotech & Pharmaceuticals',
 'Sector_Health Care',
 'Sector_Telecommunications',
 'Revenue_$1 to $5 million (USD)',
 'Revenue_$25 to $50 million (USD)',
 'Revenue_$5 to $10 billion (USD)',
 'simplified_title_data warehouse enginner',
 'seniority_senior',
 'state_ CA',
 'state_ CO',
 'state_ IL']