## Feature selection and modeling for FiveThirtyEight College Majors Dataset

https://github.com/fivethirtyeight/data/blob/master/college-majors/recent-grads.csv

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

data = pd.read_csv('/content/recent-grads.csv')

In [None]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Rank                  173 non-null    int64  
 1   Major_code            173 non-null    int64  
 2   Major                 173 non-null    object 
 3   Total                 172 non-null    float64
 4   Men                   172 non-null    float64
 5   Women                 172 non-null    float64
 6   Major_category        173 non-null    object 
 7   ShareWomen            172 non-null    float64
 8   Sample_size           173 non-null    int64  
 9   Employed              173 non-null    int64  
 10  Full_time             173 non-null    int64  
 11  Part_time             173 non-null    int64  
 12  Full_time_year_round  173 non-null    int64  
 13  Unemployed            173 non-null    int64  
 14  Unemployment_rate     173 non-null    float64
 15  Median                1

In [None]:
# Look at how many missing values are in this dataset:
print("Missing Values:")
print(data.isnull().sum())

# The number of missing values is very small in each feature, so we do not need
# to worrying about removing features containing to many NAs. Instead, we directly
# dropped those few NA values which won't affect the general result
data_cleaned = data.dropna()

print("Cleaned Dataset:")
print(data_cleaned.head())

print("Summary Statistics of Cleaned Data:")
print(data_cleaned.describe())

Missing Values:
Rank                    0
Major_code              0
Major                   0
Total                   1
Men                     1
Women                   1
Major_category          0
ShareWomen              1
Sample_size             0
Employed                0
Full_time               0
Part_time               0
Full_time_year_round    0
Unemployed              0
Unemployment_rate       0
Median                  0
P25th                   0
P75th                   0
College_jobs            0
Non_college_jobs        0
Low_wage_jobs           0
dtype: int64
Cleaned Dataset:
   Rank  Major_code                                      Major    Total  \
0     1        2419                      PETROLEUM ENGINEERING   2339.0   
1     2        2416             MINING AND MINERAL ENGINEERING    756.0   
2     3        2415                  METALLURGICAL ENGINEERING    856.0   
3     4        2417  NAVAL ARCHITECTURE AND MARINE ENGINEERING   1258.0   
4     5        2405              

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import numpy as np

In [None]:
data_median = data_cleaned.drop(['Rank', 'Major_code', 'Major', 'Major_category', 'P25th','P75th'], axis=1)
data_25th = data_cleaned.drop(['Rank', 'Major_code', 'Major', 'Major_category', 'Median','P75th'], axis=1)
data_75th = data_cleaned.drop(['Rank', 'Major_code', 'Major', 'Major_category', 'P25th','Median'], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
y_med = data_median.Median
y_25th = data_25th.P25th
y_75th = data_75th.P75th
X_med_train, X_med_test, y_med_train, y_med_test = train_test_split(data_median, y_med, test_size=0.25, random_state=42)
X_25_train, X_25_test, y_25_train, y_25_test = train_test_split(data_25th, y_25th, test_size=0.25, random_state=42)
X_75_train, X_75_test, y_75_train, y_75_test = train_test_split(data_75th, y_75th, test_size=0.25, random_state=42)

Using multipvariable linear regression as the estimator to pick the top 5 features for median:

In [None]:
model_selected_features = LinearRegression()
model_selected_features.fit(X_med_train, y_med_train)

In [None]:
sfs = SFS(model_selected_features, k_features = 5, scoring = 'neg_mean_absolute_error', cv = 3, n_jobs = -1)
sfs = sfs.fit(X_med_train, y_med_train)

In [None]:
print("The top 5 features to predict median salary using linear regression are:")
print(X_med_train.columns[list(sfs.k_feature_idx_)])

The top 5 features to predict median salary using linear regression are:
Index(['ShareWomen', 'Sample_size', 'Unemployed', 'Unemployment_rate',
       'Median'],
      dtype='object')


Using multipvariable linear regression as the estimator to pick the top 5 features for the 25th percentile:

In [None]:
model_selected_features = LinearRegression()
model_selected_features.fit(X_25_train, y_25_train)
sfs = SFS(model_selected_features, k_features = 5, scoring = 'neg_mean_absolute_error', cv = 3, n_jobs = -1)
sfs = sfs.fit(X_25_train, y_25_train)
print("The top 5 features to predict the 25th percentile salary using linear regression are:")
print(X_med_train.columns[list(sfs.k_feature_idx_)])

The top 5 features to predict the 25th percentile salary using linear regression are:
Index(['Full_time_year_round', 'Median', 'College_jobs', 'Non_college_jobs',
       'Low_wage_jobs'],
      dtype='object')


Using multipvariable linear regression as the estimator to pick the top 5 features for the 75th percentile:




In [None]:
model_selected_features = LinearRegression()
model_selected_features.fit(X_75_train, y_75_train)
sfs = SFS(model_selected_features, k_features = 5, scoring = 'neg_mean_absolute_error', cv = 3, n_jobs = -1)
sfs = sfs.fit(X_75_train, y_75_train)
print("The top 5 features to predict the 75th percentile salary using linear regression are:")
print(X_med_train.columns[list(sfs.k_feature_idx_)])

The top 5 features to predict the 75th percentile salary using linear regression are:
Index(['Employed', 'Full_time', 'Full_time_year_round', 'Median',
       'Low_wage_jobs'],
      dtype='object')


Using the knn algorithm as the estimator to pick the top 5 features for median:

In [None]:
# If using KNN as the estimator (because we cannot guarantee linear relationship)
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
sfs = SFS(knn, k_features = 5, scoring = 'neg_mean_absolute_error', cv = 3, n_jobs = -1)
sfs = sfs.fit(X_med_train, y_med_train)
print("The top 5 features to predict median salary using knn are:")
print(X_med_train.columns[list(sfs.k_feature_idx_)])

The top 5 features to predict median salary using knn are:
Index(['ShareWomen', 'Sample_size', 'Unemployed', 'Unemployment_rate',
       'Median'],
      dtype='object')


In [None]:
sfs = SFS(knn, k_features = 5, scoring = 'neg_mean_absolute_error', cv = 3, n_jobs = -1)
sfs = sfs.fit(X_25_train, y_25_train)
print("The top 5 features to predict the 25th percentile salary using knn are:")
print(X_25_train.columns[list(sfs.k_feature_idx_)])

The top 5 features to predict the 25th percentile salary using knn are:
Index(['ShareWomen', 'Sample_size', 'Unemployed', 'Unemployment_rate',
       'P25th'],
      dtype='object')


In [None]:
sfs = SFS(knn, k_features = 5, scoring = 'neg_mean_absolute_error', cv = 3, n_jobs = -1)
sfs = sfs.fit(X_75_train, y_75_train)
print("The top 5 features to predict the 75th percentile salary using knn are:")
print(X_75_train.columns[list(sfs.k_feature_idx_)])

The top 5 features to predict the 75th percentile salary using knn are:
Index(['ShareWomen', 'Sample_size', 'Unemployed', 'Unemployment_rate',
       'P75th'],
      dtype='object')
