In [5]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [6]:
# Load the flight data, then drop rows containing nulls and exact duplicate rows.
airports_df = (
    pd.read_csv("Data/Lots_of_flight_data.csv")
    .dropna()
    .drop_duplicates()
)
# info() prints the per-column summary; the trailing .shape shows
# (rows, columns) of the cleaned frame.
airports_df.info()
airports_df.shape

<class 'pandas.core.frame.DataFrame'>
Int64Index: 295832 entries, 0 to 299999
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   FL_DATE              295832 non-null  object 
 1   OP_UNIQUE_CARRIER    295832 non-null  object 
 2   TAIL_NUM             295832 non-null  object 
 3   OP_CARRIER_FL_NUM    295832 non-null  int64  
 4   ORIGIN               295832 non-null  object 
 5   DEST                 295832 non-null  object 
 6   CRS_DEP_TIME         295832 non-null  int64  
 7   DEP_TIME             295832 non-null  float64
 8   DEP_DELAY            295832 non-null  float64
 9   CRS_ARR_TIME         295832 non-null  int64  
 10  ARR_TIME             295832 non-null  float64
 11  ARR_DELAY            295832 non-null  float64
 12  CRS_ELAPSED_TIME     295832 non-null  int64  
 13  ACTUAL_ELAPSED_TIME  295832 non-null  float64
 14  AIR_TIME             295832 non-null  float64
 15  DISTANCE         

(295832, 16)

# Split data into features and labels
- X is for features
- y is the label, i.e. the value you want to predict

In [7]:
# Feature matrix: the two columns used as predictors.
X = airports_df.loc[:, ["DISTANCE", "CRS_ELAPSED_TIME"]]
# Label: selected with a list of column names, so y stays a one-column DataFrame.
y = airports_df.loc[:, ["ARR_DELAY"]]

# Only the last expression of a cell is rendered automatically, so the
# feature preview has to be displayed explicitly or it is silently dropped.
display(X.head())
y.head()

Unnamed: 0,ARR_DELAY
0,-17.0
1,-25.0
2,-13.0
3,-12.0
4,-7.0


# Split data into train and test
- we'll use 70% of our df for training and 30% for testing
- by specifying random_state, the same rows are placed in the test DataFrame every time the code is re-run

In [8]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Report the (rows, columns) of each of the four pieces.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label}.shape : ", part.shape)

X_train.shape :  (207082, 2)
X_test.shape :  (88750, 2)
y_train.shape :  (207082, 1)
y_test.shape :  (88750, 1)


In [9]:
# Preview the first five training feature rows.
X_train.head(n=5)

Unnamed: 0,DISTANCE,CRS_ELAPSED_TIME
172560,1620,234
216512,1979,261
226502,1616,215
223830,944,151
179113,740,180


In [10]:
# Preview the first five training labels.
y_train.head(n=5)

Unnamed: 0,ARR_DELAY
172560,-11.0
216512,7.0
226502,-7.0
223830,-19.0
179113,-36.0


# Train our model

In [11]:
# fit() trains the model and returns the fitted estimator, so construction
# and training can be chained into a single assignment.
regressor = LinearRegression().fit(X_train, y_train)
regressor  # display the estimator repr, as the bare fit() call did

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Test our model

In [12]:
# Predict ARR_DELAY for the held-out feature rows and show the resulting array.
y_predict = regressor.predict(X=X_test)
y_predict

array([[3.47739078],
       [5.89055919],
       [4.33288464],
       ...,
       [5.84678979],
       [6.05195889],
       [5.66255414]])

In [13]:
# Actual ARR_DELAY values of the held-out test rows, shown for comparison with y_predict
y_test

Unnamed: 0,ARR_DELAY
291483,-5.0
98997,-12.0
23454,-9.0
110802,-14.0
49449,-20.0
...,...
209898,-20.0
22210,-9.0
165727,-6.0
260838,-33.0


# Comparing
We can compare the values in y_predict with the values in y_test to get a sense of how accurately our model predicts ARR_DELAY.
Example: for the first test row our model predicts 3.47 mins while the actual value in y_test is -5.

# MSE
Mean Squared Error
- is the average of the squared differences between the actual and the predicted values
- lower MSE is better
- MSE = mean((actuals - predict)^2)
-  or we can calculate MSE using mean_squared_error from scikit-learn

In [17]:
# MSE = mean((actual - predicted)^2), computed over the held-out test rows.
MSE = metrics.mean_squared_error(y_test, y_predict)
print("MSE : ", MSE)

MSE :  2250.4445141530855


# RMSE
Root Mean Square Error

In [18]:
# RMSE is the square root of MSE, which brings the error back to the
# same units as the predicted value.
RMSE = np.sqrt(MSE)
print("RMSE : ", RMSE)

RMSE :  47.4388502617115


# MAE
Mean Absolute Error
- Measures the prediction error
- it is the average absolute difference between observed and predicted outcome
- MAE = mean(abs(actual-predict))
- MAE is less sensitive to outliers compared to RMSE

In [19]:
# Mean absolute difference between the held-out labels and the model's predictions.
MAE = metrics.mean_absolute_error(y_true=y_test, y_pred=y_predict)
print("MAE : ", MAE)

MAE :  23.089633633818547


# Find Out the DataType
- type(variable)

In [26]:
# Inspect the container types: the first line reports on y_predict (the
# model output), so its label must say y_predict rather than y_train.
print("y_predict_dataType : ", type(y_predict))
print("y_test_dataType : ", type(y_test))
print("y_train_dataType: ", type(y_train))


y_train_dataType :  <class 'numpy.ndarray'>
y_test_dataType :  <class 'pandas.core.frame.DataFrame'>
y_train_dataType:  <class 'pandas.core.frame.DataFrame'>


NameError: name 'head' is not defined