In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pickle

In [2]:
data = {
    'Airline': np.random.choice(['Airline_A', 'Airline_B', 'Airline_C'], 1000),
    'Source': np.random.choice(['City_A', 'City_B', 'City_C', 'City_D'], 1000),
    'Destination': np.random.choice(['City_E', 'City_F', 'City_G', 'City_H'], 1000),
    'Duration': np.random.randint(1, 10, 1000),
    'Total_Stops': np.random.randint(0, 5, 1000),
    'Price': np.random.randint(3000, 15000, 1000)
}


df = pd.DataFrame(data)

In [3]:
df

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Price
0,Airline_A,City_A,City_G,9,3,3076
1,Airline_C,City_D,City_E,1,2,14691
2,Airline_A,City_C,City_G,3,1,10872
3,Airline_B,City_B,City_F,7,4,8525
4,Airline_A,City_D,City_F,8,0,8723
...,...,...,...,...,...,...
995,Airline_C,City_D,City_F,5,1,4387
996,Airline_C,City_D,City_F,1,4,6415
997,Airline_A,City_D,City_H,7,1,4019
998,Airline_B,City_C,City_H,2,3,11886


In [4]:
df.isna().sum()

Airline        0
Source         0
Destination    0
Duration       0
Total_Stops    0
Price          0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Airline      1000 non-null   object
 1   Source       1000 non-null   object
 2   Destination  1000 non-null   object
 3   Duration     1000 non-null   int32 
 4   Total_Stops  1000 non-null   int32 
 5   Price        1000 non-null   int32 
dtypes: int32(3), object(3)
memory usage: 35.3+ KB


In [6]:
df['Airline'].unique()

array(['Airline_A', 'Airline_C', 'Airline_B'], dtype=object)

In [7]:
df['Airline'].replace({'Airline_A': 0,
                       'Airline_C': 1,
                       'Airline_B': 2}, inplace = True)

In [9]:
df['Source'].unique()

array(['City_A', 'City_D', 'City_C', 'City_B'], dtype=object)

In [10]:
df['Source'].replace({'City_A': 0,
                    'City_C': 1,
                    'City_B': 2,
                    'City_D': 3}, inplace = True)

In [11]:
df['Destination'].unique()

array(['City_G', 'City_E', 'City_F', 'City_H'], dtype=object)

In [12]:
df['Destination'].replace({'City_G': 0,
                    'City_E': 1,
                    'City_F': 2,
                    'City_H': 3}, inplace = True)

In [13]:
df.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Price
0,0,0,0,9,3,3076
1,1,3,1,1,2,14691
2,0,1,0,3,1,10872
3,2,2,2,7,4,8525
4,0,3,2,8,0,8723


In [17]:
X = df.drop('Price', axis = 1)
y = df['Price']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
linear_model = LinearRegression()
linear_model.fit(X_train,y_train)

In [24]:
y_pred = linear_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test,y_pred)
print(f'R2 Sore: {r2}')

Mean Squared Error: 11877226.232592724
R2 Sore: -0.009951009297636926


In [27]:
with open('Linear_model.pkl', 'wb') as f:
    pickle.dump(linear_model, f)

In [28]:
X.columns

Index(['Airline', 'Source', 'Destination', 'Duration', 'Total_Stops'], dtype='object')