In [1]:
# Importing Packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# NOTE(review): called with only the 'ignore' action, this suppresses every
# warning category for the rest of the session, including deprecation
# warnings emitted by the libraries used in later cells. Consider narrowing
# it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
# Load the airfoil self-noise data set from 'airfoil_self_noise.dat'.
# The file is tab-separated and has no header row, so the column names are
# supplied explicitly and `header=None` states that the first line is data.
columns = [
    'Frequency',
    'AOA',
    'ChordLength',
    'FreeStreamVelocity',
    'SuctionThickness',
    'SoundPressureLevel',
]
df = pd.read_csv('airfoil_self_noise.dat', sep='\t', header=None, names=columns)
df.shape

(1503, 6)

In [3]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,Frequency,AOA,ChordLength,FreeStreamVelocity,SuctionThickness,SoundPressureLevel
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [4]:
# Count missing values per column.
missing_mask = df.isnull()
missing_mask.sum()

Frequency             0
AOA                   0
ChordLength           0
FreeStreamVelocity    0
SuctionThickness      0
SoundPressureLevel    0
dtype: int64

In [5]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503 entries, 0 to 1502
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Frequency           1503 non-null   int64  
 1   AOA                 1503 non-null   float64
 2   ChordLength         1503 non-null   float64
 3   FreeStreamVelocity  1503 non-null   float64
 4   SuctionThickness    1503 non-null   float64
 5   SoundPressureLevel  1503 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 70.6 KB


In [6]:
# Split the data 70/30 into train and test frames; the fixed seed keeps the
# partition reproducible across runs.
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, random_state=100, train_size=0.7)
for subset in (df_train, df_test):
    print(subset.shape)

(1052, 6)
(451, 6)


In [7]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler; feature_range=(0, 1) is the library default, spelled out here.
scaler = MinMaxScaler(feature_range=(0, 1))

In [8]:
# Fit the scaler on the training split only, then overwrite every column
# (features and target alike) with its min-max scaled values.
all_columns = df.columns.tolist()
scaled_values = scaler.fit_transform(df_train[all_columns])
df_train[all_columns] = scaled_values
df_train.head()

Unnamed: 0,Frequency,AOA,ChordLength,FreeStreamVelocity,SuctionThickness,SoundPressureLevel
355,0.116162,0.18018,0.727273,0.0,0.080847,0.460589
625,0.242424,0.324324,0.454545,0.199495,0.149819,0.27117
1478,0.021717,0.702703,0.272727,1.0,0.74685,0.517489
637,0.05303,0.445946,0.454545,1.0,0.325792,0.52819
487,0.040404,0.0,0.454545,0.199495,0.026412,0.72946


In [9]:
# Separate the scaled training frame into target `y` and feature matrix `X`.
# `pop` removes the target column from `df_train` in place, and `X` is the
# same object as `df_train` (not a copy).
# The membership check keeps the cell re-runnable: once the column has been
# popped, a second execution would otherwise raise KeyError. On a first run
# the resulting state is exactly what an unconditional pop produces.
if 'SoundPressureLevel' in df_train.columns:
    y = df_train.pop('SoundPressureLevel')
X = df_train

In [10]:
# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [11]:
# Baseline linear model, fitted on the scaled training features.
lm = LinearRegression()
lm.fit(X, y)

# Recursive feature elimination keeping 5 features.
# `n_features_to_select` is passed by keyword: the positional form
# `RFE(lm, 5)` is deprecated and newer scikit-learn releases reject it with a
# TypeError because the parameter is keyword-only.
rfe = RFE(lm, n_features_to_select=5)
rfe = rfe.fit(X, y)

In [12]:
# Pair each feature name with its RFE keep-flag and elimination rank.
[(feature, keep, rank) for feature, keep, rank in zip(X.columns, rfe.support_, rfe.ranking_)]

[('Frequency', True, 1),
 ('AOA', True, 1),
 ('ChordLength', True, 1),
 ('FreeStreamVelocity', True, 1),
 ('SuctionThickness', True, 1)]

In [13]:
# Column labels of the features RFE retained.
support_mask = rfe.support_
col = X.columns[support_mask]
col

Index(['Frequency', 'AOA', 'ChordLength', 'FreeStreamVelocity',
       'SuctionThickness'],
      dtype='object')

In [14]:
# Column labels RFE eliminated (the complement of the support mask).
rejected_mask = ~rfe.support_
X.columns[rejected_mask]

Index([], dtype='object')

In [15]:
# Restrict the training features to the RFE-selected columns.
X_train_rfe = X.loc[:, col]

In [16]:
import statsmodels.api as sm

# statsmodels' OLS does not add an intercept on its own, so add a constant
# column; prepend=True (the default) places it first.
X_train_rfe = sm.add_constant(X_train_rfe, prepend=True)

In [17]:
# Fit ordinary least squares on the selected features plus intercept.
# Note that `lm` is rebound here from the scikit-learn estimator to the
# statsmodels results object.
ols_model = sm.OLS(endog=y, exog=X_train_rfe)
lm = ols_model.fit()

In [18]:
# Print the summary table of the fitted linear model.
ols_summary = lm.summary()
print(ols_summary)

                            OLS Regression Results                            
Dep. Variable:     SoundPressureLevel   R-squared:                       0.513
Model:                            OLS   Adj. R-squared:                  0.511
Method:                 Least Squares   F-statistic:                     220.4
Date:                Fri, 22 Oct 2021   Prob (F-statistic):          1.28e-160
Time:                        22:11:57   Log-Likelihood:                 652.24
No. Observations:                1052   AIC:                            -1292.
Df Residuals:                    1046   BIC:                            -1263.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                  0.8542      0

In [19]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Build a table of variance inflation factors for the selected features,
# sorted from most to least collinear.
#
# The intercept column is removed before the VIFs are computed.
# `errors='ignore'` keeps the cell re-runnable: after the first execution
# 'const' is already gone and an unguarded drop would raise KeyError.
# NOTE(review): the VIFs are therefore computed on a design matrix without an
# intercept -- confirm that this is the intended definition.
X_train_rfe = X_train_rfe.drop(columns='const', errors='ignore')
X = X_train_rfe

# One VIF per column index of the feature matrix, built in a single
# constructor call instead of growing an empty frame column by column.
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

Unnamed: 0,Features,VIF
1,AOA,4.79
4,SuctionThickness,4.1
3,FreeStreamVelocity,2.4
2,ChordLength,1.71
0,Frequency,1.58
