In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

In [2]:
import sklearn
print(sklearn.__version__)

1.3.2


In [3]:
full_data = pd.read_csv('merged_dataset')
full_data = full_data.drop(columns=['Unnamed: 0', 'Batch', 'Date'])
full_data.head()

Unnamed: 0,Weight,NV,Visc-Ford,BPA_level,Amine,Max_Temp_Degrees,Rain_Inches
0,8.46,20.97,16.2,36.49,0.0089,56.0,0.18
1,8.43,20.91,15.5,36.49,0.0089,43.0,0.12
2,8.46,20.91,15.7,36.49,0.0089,43.0,0.12
3,8.44,20.81,17.0,36.49,0.0089,41.0,0.01
4,8.44,20.67,15.9,36.49,0.0089,41.0,0.01


# Pipeline

## Recursive Feature Estimator

The dataset does not have many potential features, but it would be helpful to know if any of the features in the set are unlikely to influence the predictive performance of selected models. RFE will rank the dataset's features to help identify which features are most likely to influence model performance. 

## Standard Scaler

Because the data is replete with unique units (weights, temperatures, and inches), a scaler will be utilized to assist ML algorithms in interpreting the magnitudes of the data.

## Polynomial Features

From the visualizations in the previous notebook, the relationships between the potential features and the target do not appear to be linear, we will apply polynomial features to attempt to more accurately capture the relationship between variables.

In [4]:
#Define features and target
X = full_data[['Weight', 'NV', 'Amine', 'BPA_level', 'Max_Temp_Degrees', 'Rain_Inches']]
y = full_data['Visc-Ford']

#Define estimator
estimator = LinearRegression()

#Create pipeline
pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('selector', RFE(estimator, n_features_to_select=7))
])

#Fit pipeline
pipeline.fit(X,y)

#Extract selector from pipeline
selector = pipeline.named_steps['selector']

#Get ranking
ranking = selector.ranking_

#Generate polynomial feature names
poly = pipeline.named_steps['poly']
poly_feature_names = poly.get_feature_names_out(X.columns)

#Create a DataFrame to see features
feature_ranking = pd.DataFrame({'Feature': poly_feature_names, 'Ranking': ranking})

#Sort by ranking to see most important features first
feature_ranking_sorted = feature_ranking.sort_values(by='Ranking')
print(feature_ranking_sorted)

                         Feature  Ranking
13                      NV Amine        1
23         BPA_level Rain_Inches        1
22    BPA_level Max_Temp_Degrees        1
20             Amine Rain_Inches        1
8                   Weight Amine        1
19        Amine Max_Temp_Degrees        1
14                  NV BPA_level        1
9               Weight BPA_level        2
18               Amine BPA_level        3
17                       Amine^2        4
2                          Amine        5
3                      BPA_level        6
21                   BPA_level^2        7
1                             NV        8
0                         Weight        9
6                       Weight^2       10
7                      Weight NV       11
4               Max_Temp_Degrees       12
10       Weight Max_Temp_Degrees       13
15           NV Max_Temp_Degrees       14
11            Weight Rain_Inches       15
16                NV Rain_Inches       16
12                          NV^2  

Anything with a ranking over 3 is not likely to influence model performance. Below, I will filter the results to create 2 datasets for the model:

•The first will only keep features whose rankings are 3 or lower.
•The second will keep features whose rankings are 10 or lower.

In [5]:
#Select features with ranking <=3:
features_3 = feature_ranking[feature_ranking['Ranking'] <=3]['Feature']
features_3_list = features_3.tolist()

#Select features with ranking <=10:
features_10 = feature_ranking[feature_ranking['Ranking'] <=10]['Feature']
features_10_list = features_10.tolist()

#Create new dataframes with selected features
#Since PolynomialFeatures were applied, the original data needs to be transformed
poly_features = poly.transform(X)
poly_feature_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=X.index)

#DataFrame with feature rankings <=3
feature_3 = poly_feature_df[features_3_list]
feature_3['Visc-Ford'] = y

#DataFrame with feature rankings <=10
feature_10 = poly_feature_df[features_10_list]
feature_10['Visc-Ford'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_3['Visc-Ford'] = y
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_10['Visc-Ford'] = y


In [6]:
feature_3.head()

Unnamed: 0,Weight Amine,Weight BPA_level,NV Amine,NV BPA_level,Amine BPA_level,Amine Max_Temp_Degrees,Amine Rain_Inches,BPA_level Max_Temp_Degrees,BPA_level Rain_Inches,Visc-Ford
0,0.075294,308.7054,0.186633,765.1953,0.324761,0.4984,0.001602,2043.44,6.5682,16.2
1,0.075027,307.6107,0.186099,763.0059,0.324761,0.3827,0.001068,1569.07,4.3788,15.5
2,0.075294,308.7054,0.186099,763.0059,0.324761,0.3827,0.001068,1569.07,4.3788,15.7
3,0.075116,307.9756,0.185209,759.3569,0.324761,0.3649,8.9e-05,1496.09,0.3649,17.0
4,0.075116,307.9756,0.183963,754.2483,0.324761,0.3649,8.9e-05,1496.09,0.3649,15.9


In [7]:
feature_10.head()

Unnamed: 0,Weight,NV,Amine,BPA_level,Weight^2,Weight Amine,Weight BPA_level,NV Amine,NV BPA_level,Amine^2,Amine BPA_level,Amine Max_Temp_Degrees,Amine Rain_Inches,BPA_level^2,BPA_level Max_Temp_Degrees,BPA_level Rain_Inches,Visc-Ford
0,8.46,20.97,0.0089,36.49,71.5716,0.075294,308.7054,0.186633,765.1953,7.9e-05,0.324761,0.4984,0.001602,1331.5201,2043.44,6.5682,16.2
1,8.43,20.91,0.0089,36.49,71.0649,0.075027,307.6107,0.186099,763.0059,7.9e-05,0.324761,0.3827,0.001068,1331.5201,1569.07,4.3788,15.5
2,8.46,20.91,0.0089,36.49,71.5716,0.075294,308.7054,0.186099,763.0059,7.9e-05,0.324761,0.3827,0.001068,1331.5201,1569.07,4.3788,15.7
3,8.44,20.81,0.0089,36.49,71.2336,0.075116,307.9756,0.185209,759.3569,7.9e-05,0.324761,0.3649,8.9e-05,1331.5201,1496.09,0.3649,17.0
4,8.44,20.67,0.0089,36.49,71.2336,0.075116,307.9756,0.183963,754.2483,7.9e-05,0.324761,0.3649,8.9e-05,1331.5201,1496.09,0.3649,15.9


In [8]:
#Save as .csvs for next step
feature_3.to_csv('feature_3.csv')
feature_10.to_csv('feature_10.csv')