#### importing all the necessary libraries !

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
train_data=pd.read_excel(r"Data_Train.xlsx")

In [15]:
### Why is there a need to prefix the path with 'r' (a raw string)?

In [16]:
print('Hello\tPython')

### Now, here 'Hello\tPython' is a normal string literal, the sequences “\t” will be treated as escape characters.

Hello	Python


In [None]:
train_data.head(7)

In [None]:
train_data.info()

#### Importing dataset
    1.Since data is in form of excel file we have to use pandas read_excel to load the data
    2.After loading it is important to check null values in a column or a row
    3.If it is present then following can be done,
        a.Filling NaN values with mean, median and mode using fillna() method
        b.If Less missing values, we can drop it as well

In [None]:
train_data.isnull().sum()

## train_data.isnull().sum(axis=0)
## by-default axis is 0 , ie it computes total missing values column-wise !

## train_data.isnull().sum(axis=1) -->> if axis=1 , ie it computes total missing values row-wise !

In [None]:
train_data.shape

In [None]:
### getting all the rows where we have missing value
train_data[train_data['Total_Stops'].isnull()]

#### as we have 1 missing value , I can directly drop these

In [None]:
train_data.dropna(inplace=True)

In [None]:
train_data.isnull().sum()

## Pre-process & Perform Featurization of "Date_of_Journey"
    ie pre-process it & extract day,month,year from "Date_of_Journey" feature..

In [None]:
data=train_data.copy()

In [None]:
data.head(2)

In [None]:
data.dtypes

In [None]:
def change_into_datetime(col, df=None):
    """Convert one column of a DataFrame to pandas datetime, in place.

    Parameters
    ----------
    col : label
        Name of the column to convert.
    df : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``data`` frame is
        used, which is what the original one-argument call did.

    Returns
    -------
    None
        The converted column overwrites the original one on the frame.
    """
    # Only fall back to the notebook global when no frame was passed, so the
    # function can be reused (and tested) on any DataFrame.
    target = data if df is None else df
    # NOTE(review): no explicit format/dayfirst is given, so pandas infers the
    # layout; confirm the day/month order against the raw date strings.
    target[col] = pd.to_datetime(target[col])

In [None]:
data.columns

In [None]:
for feature in ['Date_of_Journey','Dep_Time', 'Arrival_Time']:
    change_into_datetime(feature)

In [None]:
data.dtypes

In [None]:
data['Date_of_Journey'].min()

In [None]:
data['Date_of_Journey'].max()

In [None]:
### it means our data belongs to 2019 year only, hence extracting year feature & consider this as a input to my machine learning model makes no sense !
### but if we have more than 1 year  , then of-course it may impact !

In [None]:
data['journey_day']=data['Date_of_Journey'].dt.day

In [None]:
data['journey_month']=data['Date_of_Journey'].dt.month

In [None]:
data['journey_year']=data['Date_of_Journey'].dt.year

In [None]:
data.head(2)

In [None]:
data.drop('Date_of_Journey',axis=1,inplace=True)

In [None]:
data.head(2)

## Let's clean Dep_Time & Arrival_Time and featurize them

In [None]:
def extract_hour_min(df,col):
    """Split a datetime column into ``<col>_hour`` and ``<col>_minute``.

    The two new columns are added to ``df`` in place, the source column is
    then removed from ``df``, and the first two rows of the modified frame
    are returned as a quick preview.
    """
    timestamps=df[col].dt
    hour_name,minute_name=col+'_hour',col+'_minute'
    df[hour_name]=timestamps.hour
    df[minute_name]=timestamps.minute
    # The raw datetime column is no longer needed once it has been split.
    df.drop(columns=[col],inplace=True)
    return df.head(2)

In [None]:
# Departure time is when a plane leaves the gate

extract_hour_min(data,'Dep_Time')

In [None]:
### lets Featurize 'Arrival_Time' !

In [None]:
extract_hour_min(data,'Arrival_Time')

## Let's analyse when most of the flights take off

In [None]:
### Convert the flight departure hour into a part of the day: early morning, morning, noon, evening, night or late night.

def flight_dep_time(x):
    '''
    Map a departure hour to a named part of the day.

    Each band is open on the left and closed on the right, e.g. hours
    greater than 4 and up to 8 are 'Early mrng'. Any value that matches
    no band falls through to 'Late night'.
    '''
    bands=(
        (4,8,'Early mrng'),
        (8,12,'Morning'),
        (12,16,'Noon'),
        (16,20,'Evening'),
        (20,24,'Night'),
    )
    for lower,upper,label in bands:
        if x>lower and x<=upper:
            return label
    return 'Late night'

In [None]:
data['Dep_Time_hour'].apply(flight_dep_time).value_counts().plot(kind='bar')

## lets use Cufflinks & plotly to make your visuals more interactive !

In [None]:
## !pip install plotly

In [None]:
## Lets use Plotly interactive plots directly with Pandas dataframes, but First u need below set-up !

import plotly
import cufflinks as cf
from cufflinks.offline import go_offline
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot

In [None]:
cf.go_offline()

In [None]:
data['Dep_Time_hour'].apply(flight_dep_time).value_counts().iplot(kind='bar')

In [None]:
data.head(10)

## Pre-process Duration Feature & extract meaningful features 

In [None]:
def preprocess_duration(x):
    """Pad a duration string so it carries both an hour and a minute part.

    A value without 'h' gets '0h ' prepended; otherwise a value without 'm'
    gets ' 0m' appended. Anything else is returned unchanged.
    """
    if 'h' not in x:
        return '0h '+x
    if 'm' not in x:
        return x+' 0m'
    return x
    

In [None]:
data['Duration']=data['Duration'].apply(preprocess_duration)

In [None]:
data['Duration']

In [None]:
data['Duration'][0].split(' ')[0]

In [None]:
int(data['Duration'][0].split(' ')[0][0:-1])

In [None]:
int(data['Duration'][0].split(' ')[1][0:-1])

In [None]:
data['Duration_hours']=data['Duration'].apply(lambda x:int(x.split(' ')[0][0:-1]))

In [None]:
data['Duration_mins']=data['Duration'].apply(lambda x:int(x.split(' ')[1][0:-1]))

In [None]:
data.head(3)

In [None]:
##### Lets Analyse whether Duration impacts on Price or not ?

In [None]:
'2*60+50*1'

In [None]:
### eval is a built-in Python function that evaluates a string as a Python expression and returns its value (an integer for the arithmetic below).
eval('2*60+50*1')

In [None]:
# Total duration in minutes, built from the Duration_hours / Duration_mins
# columns extracted earlier instead of calling eval() on strings that came
# from the input file (eval executes arbitrary code and also fails on tokens
# with a leading zero such as '05m').
data['Duration_total_mins']=data['Duration_hours']*60+data['Duration_mins']

In [None]:
data.head(2)

In [None]:
#### sns.lmplot plots the data and a fitted regression line on a FacetGrid (it combines `regplot` with `FacetGrid`).
#### It is an extended form of a scatter plot.

sns.lmplot(x='Duration_total_mins',y='Price',data=data)


## Conclusion -->> it is pretty clear that as the flight duration (in minutes) increases, the flight price also increases.

## which city has maximum final destination of flights ?

In [None]:
data['Destination'].unique()

In [None]:
data['Destination'].value_counts().plot(kind='pie')

In [None]:
'''
Inference->> 
Final destination of majority of flights is Cochin. There are two values for Delhi destination which needs to be corrected,

'''

## Lets Perform Exploratory Data Analysis(Bivariate Analysis) to come up with some business insights
    Problem Statement -->> on which routes is Jet Airways used the most?

In [None]:
data['Route']

In [None]:
data[data['Airline']=='Jet Airways'].groupby('Route').size().sort_values(ascending=False)

## Airline vs Price Analysis
    ie finding price distribution & 5-point summary of each Airline..

In [None]:
plt.figure(figsize=(15,5))
sns.boxplot(y='Price',x='Airline',data=data)
plt.xticks(rotation='vertical')

In [None]:
### when we need boxplot + distribution both , its good to consider violinplot..

In [None]:
plt.figure(figsize=(15,5))
sns.violinplot(y='Price',x='Airline',data=data)
plt.xticks(rotation='vertical')

## Lets Perform Feature-Encoding on Data !
    Applying one-hot on data !

In [None]:
## but lets remove some of the un-necessary features !

In [None]:
data.head(4)

In [None]:
np.round(data['Additional_Info'].value_counts()/len(data)*100,2)

In [None]:
# Additional_Info contains almost 80% no_info, so we can drop this column
# we can drop Route as well, as we have already pre-processed that column
## let's drop Duration_total_mins as we have already extracted "Duration_hours" & "Duration_mins"
## journey_year is dropped too, since all of the data belongs to 2019

data.drop(columns=['Additional_Info','Route','Duration_total_mins','journey_year'],axis=1,inplace=True)

In [None]:
data.columns

In [None]:
data.head(4)

## Lets separate categorical data & numerical data !
    categorical data are those whose data-type is 'object'
    Numerical data are those whose data-type is either int or float

In [None]:
cat_col=[col for col in data.columns if data[col].dtype=='object']

In [None]:
num_col=[col for col in data.columns if data[col].dtype!='object']

In [None]:
cat_col

## Handling Categorical Data
    We are using 2 basic Encoding Techniques to convert Categorical data into some numerical format
    if data belongs to Nominal data (ie data is not in any order) -->> OneHotEncoder is used in this case
    if data belongs to Ordinal data (ie data is in order ) -->>       LabelEncoder is used in this case

    But in real-world , it is not necessary that u have to always One-hot or label , hence we will discuss more interesting approaches to do this !

### Lets apply one-hot encoding on 'Source' feature !

In [None]:
### One-hot from scratch :

In [None]:
data['Source'].unique()

In [None]:
data['Source']

In [None]:
data['Source'].apply(lambda x: 1 if x=='Banglore' else 0)

In [None]:
for category in data['Source'].unique():
    data['Source_'+category]=data['Source'].apply(lambda x: 1 if x==category else 0)

In [None]:
data.head(3)

## Performing Target Guided Mean Encoding !
    ofcourse we can use One-hot , but if we have more sub-categories , it creates curse of dimensionality in ML..
    lets use Target Guided Mean Encoding in order to get rid of this..

In [None]:
airlines=data.groupby(['Airline'])['Price'].mean().sort_values().index

In [None]:
airlines

In [None]:
dict1={key:index for index,key in enumerate(airlines,0)}

In [None]:
dict1

In [None]:
data['Airline']=data['Airline'].map(dict1)

In [None]:
data['Airline']

In [None]:
data.head(2)

In [None]:
data['Destination'].unique()

In [None]:
# Assign the result back to the column: an in-place replace on the object
# returned by data['Destination'] can act on a temporary and leave `data`
# unchanged (chained assignment under pandas Copy-on-Write).
data['Destination']=data['Destination'].replace('New Delhi','Delhi')

In [None]:
data['Destination'].unique()

In [None]:
dest=data.groupby(['Destination'])['Price'].mean().sort_values().index

In [None]:
dest

In [None]:
dict2={key:index for index,key in enumerate(dest,0)}

In [None]:
dict2

In [None]:
data['Destination']=data['Destination'].map(dict2)

In [None]:
data['Destination']

In [None]:
data.head(2)

### Perform Manual Encoding on Total_stops feature

In [None]:
data['Total_Stops'].unique()

In [None]:
stops={'non-stop':0, '2 stops':2, '1 stop':1, '3 stops':3, '4 stops':4}

In [None]:
data['Total_Stops']=data['Total_Stops'].map(stops)

In [None]:
data['Total_Stops']

## Performing Outlier Detection !
    Here is the list of data visualization plots used to spot outliers.
1. Box and whisker plot (box plot).
2. Scatter plot.
3. Histogram.
4. Distribution Plot.
5. QQ plot

In [None]:
def plot(df,col):
    """Draw three stacked views of one column to help spot outliers.

    Top: distribution plot with seaborn's default settings; middle: box
    plot; bottom: distribution plot with the KDE curve switched off.

    NOTE(review): this name shadows the ``plot`` imported from
    ``plotly.offline`` earlier in the notebook; it is kept because later
    cells call ``plot(data, 'Price')``.
    """
    fig,(ax1,ax2,ax3)=plt.subplots(3,1)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # sns.histplot is the documented replacement once the environment allows.
    sns.distplot(df[col],ax=ax1)
    # Pass the values as x= explicitly: depending on the seaborn version a
    # positional vector is either deprecated or interpreted as `data`, which
    # can change the orientation of the box.
    sns.boxplot(x=df[col],ax=ax2)
    sns.distplot(df[col],ax=ax3,kde=False)
    

In [None]:
plot(data,'Price')

### getting a high level over-view of various ways to deal with outliers:


In [None]:
data['Price']=np.where(data['Price']>=35000,data['Price'].median(),data['Price'])

In [None]:
plot(data,'Price')

In [None]:
data.head(2)

In [None]:
data.drop(columns=['Source','Duration'],axis=1,inplace=True)

In [None]:
data.head(2)

In [None]:
data.dtypes

## Performing Feature Selection !

In [None]:
from sklearn.feature_selection import mutual_info_regression

In [None]:
X=data.drop(['Price'],axis=1)

In [None]:
y=data['Price']

In [None]:
X.dtypes

In [None]:
mutual_info_regression(X,y)

In [None]:
imp=pd.DataFrame(mutual_info_regression(X,y),index=X.columns)
imp.columns=['importance']

In [None]:
imp.sort_values(by='importance',ascending=False)

## Lets build ML Model  , then later on we can think of saving it..

#### split dataset into train & test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

#### what we often do in modelling:
    a..Initially ,lets build basic random forest model.
    b..then later-on , we will try to improve this model using some parameters..
    c..Then we will hyper-tune the model to find the optimal values of its parameters..

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
ml_model=RandomForestRegressor()

In [None]:
model=ml_model.fit(X_train,y_train)

In [None]:
y_pred=model.predict(X_test)

In [None]:
y_pred

In [None]:
y_pred.shape

In [None]:
len(X_test)

### How to save ML model into disk

In [None]:
### pickle ships with the Python standard library, so no `pip install` is needed

In [None]:
import pickle

In [None]:
# NOTE(review): absolute, machine-specific path kept from the original cell;
# consider a path relative to the notebook so it runs on other machines.
model_path=r'E:\End-2-end Projects\Flight_Price\Datasets/rf_random.pkl'

# Write inside a context manager so the file is flushed and closed before it
# is read back; previously the write handle was left open.
with open(model_path,'wb') as file:
    pickle.dump(model,file)

# A separate name for the read handle keeps `model` pointing at the fitted
# estimator. Only unpickle files you created yourself: pickle.load can
# execute arbitrary code.
with open(model_path,'rb') as model_file:
    forest=pickle.load(model_file)

In [None]:
forest.predict(X_test)

## Defining your own evaluation metric :

In [None]:
def mape(y_true,y_pred):
    """Mean absolute percentage error between actual and predicted values.

    Both inputs are converted to NumPy arrays; the result is the mean of
    ``|(y_true - y_pred) / y_true|`` multiplied by 100.
    """
    actual=np.array(y_true)
    predicted=np.array(y_pred)
    relative_error=(actual-predicted)/actual
    return np.mean(np.abs(relative_error))*100

In [None]:
mape(y_test,forest.predict(X_test))

## How to Automate ML Pipeline :

In [None]:
def predict(ml_model):
    """Fit an estimator on the training split and report its test metrics.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints the training score, the test predictions, R2, MSE,
    MAE, RMSE and MAPE, then draws the distribution of the residuals.
    """
    fitted=ml_model.fit(X_train,y_train)
    print(f'Training_score: {fitted.score(X_train,y_train)}')

    y_prediction=fitted.predict(X_test)
    print(f'Predictions are : {y_prediction}')
    print('\n')

    from sklearn import metrics
    print(f'r2_score: {metrics.r2_score(y_test,y_prediction)}')
    # MSE is needed twice (as-is and under the square root for RMSE).
    mse=metrics.mean_squared_error(y_test,y_prediction)
    print('MSE : ', mse)
    print('MAE : ', metrics.mean_absolute_error(y_test,y_prediction))
    print('RMSE : ', np.sqrt(mse))
    print('MAPE : ', mape(y_test,y_prediction))

    residuals=y_test-y_prediction
    sns.distplot(residuals)
    

In [None]:
predict(RandomForestRegressor())

## how to hypertune ml model
    Hyperparameter Tuning or Hyperparameter Optimization
    1.Choose following method for hyperparameter tuning
        a.RandomizedSearchCV --> Fast way to Hypertune model
        b.GridSearchCV--> Slow way to hypertune my model
    2.Choose ML algo that u have to hypertune
    3.Assign hyperparameters in form of dictionary or create hyper-parameter space
    4.define searching &  apply searching on Training data or  Fit the CV model 
    5.Check best parameters and best score

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
### initialise your estimator
reg_rf=RandomForestRegressor()

In [None]:
np.linspace(start=1000,stop=1200,num=6)

In [None]:
# Number of trees in random forest
n_estimators=[int(x) for x in np.linspace(start=1000,stop=1200,num=6)]

# Number of features to consider at every split.
# 1.0 means "use all features", which is what "auto" meant for a
# RandomForestRegressor; the string "auto" itself is rejected by newer
# scikit-learn releases.
max_features=[1.0, "sqrt"]

# Maximum number of levels in tree
max_depth=[int(x) for x in np.linspace(start=5,stop=30,num=4)]

# Minimum number of samples required to split a node
min_samples_split=[5,10,15,100]

In [None]:
# Create the grid or hyper-parameter space
random_grid={
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'min_samples_split':min_samples_split
    
}

In [None]:
random_grid

In [None]:
# Fix the seed so the sampled parameter combinations are the same on every
# run, matching the random_state already used for train_test_split.
rf_Random=RandomizedSearchCV(reg_rf,param_distributions=random_grid,cv=3,verbose=2,n_jobs=-1,random_state=42)

In [None]:
rf_Random.fit(X_train,y_train)

In [None]:
### to get your best model..
rf_Random.best_params_

In [None]:
pred2=rf_Random.predict(X_test)

In [None]:
from sklearn import metrics
metrics.r2_score(y_test,pred2)