In [9]:
import numpy as np
import pandas as pd

# 1. Prepare the input dataset

In [5]:
df = pd.read_csv("data/airfoil_self_noise.dat",sep='\t',header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503 entries, 0 to 1502
Data columns (total 6 columns):
0    1503 non-null int64
1    1503 non-null float64
2    1503 non-null float64
3    1503 non-null float64
4    1503 non-null float64
5    1503 non-null float64
dtypes: float64(5), int64(1)
memory usage: 70.5 KB


## Separate data into Input (columns: 0,1,2,3,4) and target (columns: 5)

In [7]:
data= df[[0,1,2,3,4]].values

In [8]:
target= df[5].values

## Split data using sklearn package

In [10]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test= train_test_split(data,target, test_size= 0.33)

# 2. Feed the model:
## First try: Linear Regression

### First, fit the model

In [13]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train,y_train)

### Then we know how good the model is

In [21]:
train_model_score = model.score(X_train,y_train)
print(f'accuracy for training model: {round(train_model_score*100,2)}%')

test_model_score = model.score(X_test,y_test)
print(f'accuracy for training model: {round(test_model_score*100,2)}%')

accuracy for training model: 53.13%
accuracy for training model: 47.42%


### To know how the prediction results:

In [24]:
predictions= model.predict(X_test)
predictions

array([120.30368002, 118.43334301, 124.13417394, 128.32653663,
       122.06953706, 127.05281772, 130.07667215, 126.29144629,
       132.20109171, 125.77815956, 121.68467717, 123.24372754,
       129.44139252, 114.20316292, 123.02206125, 122.78222363,
       125.89367606, 127.88394497, 125.5268709 , 128.39961378,
       116.07398391, 129.44192106, 121.2037382 , 123.10507393,
       120.42270701, 122.35547941, 121.85299992, 117.01618135,
       130.09074731, 115.90960479, 130.35868328, 119.10263541,
       119.25921182, 123.65134349, 120.53397201, 120.02009628,
       129.76968665, 129.56084911, 127.28394216, 124.9222671 ,
       125.10409788, 123.49809732, 123.99562461, 123.08216534,
       129.93391169, 127.28072359, 123.32397342, 126.6562558 ,
       130.56226377, 123.7586674 , 124.15806963, 122.62874255,
       126.53473874, 117.06044761, 122.62929569, 117.55483075,
       126.47012755, 129.25053569, 133.45215833, 121.50468136,
       116.07715084, 120.82857176, 121.12768311, 124.57

 ## Second try: Random Forest Regressor

### Steps, pretty much the same. Start from fit the model

In [27]:
from sklearn.ensemble import RandomForestRegressor
model2= RandomForestRegressor()
model2.fit(X_train,y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

### Know the model performance

In [29]:
train_model_score2 = model2.score(X_train,y_train)
print(f'accuracy for training model: {round(train_model_score2*100,2)}%')

test_model_score2 = model2.score(X_test,y_test)
print(f'accuracy for training model: {round(test_model_score2*100,2)}%')

accuracy for training model: 98.36%
accuracy for training model: 89.61%


### The prediction results

In [30]:
predictions2= model2.predict(X_test)
predictions2

array([116.2157, 113.3109, 123.0815, 130.4349, 122.0877, 130.2677,
       133.2583, 120.7033, 132.0876, 134.3473, 117.149 , 125.2804,
       133.9787, 113.5565, 127.6878, 120.3707, 131.3545, 129.1539,
       134.0156, 130.1712, 111.8757, 132.4347, 118.629 , 126.7589,
       125.9506, 116.0251, 127.8844, 115.242 , 131.3877, 128.0896,
       132.809 , 120.7074, 113.7831, 125.3405, 122.2952, 114.9075,
       133.7742, 131.7636, 126.6756, 120.4466, 125.0928, 131.1825,
       127.3204, 118.5449, 131.0831, 129.5919, 131.7519, 134.7132,
       133.0468, 124.6885, 127.0249, 125.0808, 127.174 , 115.6612,
       118.4732, 113.0569, 131.4709, 124.2526, 130.3448, 118.3405,
       112.9612, 117.5271, 122.5399, 121.7172, 128.221 , 123.8962,
       126.2154, 134.2357, 120.3285, 135.7454, 132.8556, 128.0533,
       133.8212, 121.2744, 119.4676, 117.0009, 130.2972, 119.0597,
       122.8977, 129.5645, 124.5265, 128.0345, 126.3717, 126.6322,
       124.9319, 117.3646, 134.7693, 127.4837, 120.8544, 126.3

# Conclusions, 
## There are multiple method to do the regression. Linear Regression is the simplest one. The result also doesn't that remarkable
## On the other hand, the more complex method, the Random Forests Regressor can work much better.