In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

# Build a reproducible synthetic flight-price table and save it as CSV.
# Seeding the global NumPy RNG makes every run generate the same 1000 rows.
np.random.seed(42)

# All columns are drawn from the one seeded random stream, so the draws
# must stay in this order; reordering them would change the generated values.
data = {}
data['Airline'] = np.random.choice(
    ['Airline_A', 'Airline_B', 'Airline_C'], size=1000
)
data['Source'] = np.random.choice(
    ['City_A', 'City_B', 'City_C', 'City_D'], size=1000
)
data['Destination'] = np.random.choice(
    ['City_E', 'City_F', 'City_G', 'City_H'], size=1000
)
# randint's upper bound is exclusive: Duration is 1..9, Total_Stops is 0..4,
# and Price is 3000..14999.
data['Duration'] = np.random.randint(1, 10, size=1000)
data['Total_Stops'] = np.random.randint(0, 5, size=1000)
data['Price'] = np.random.randint(3000, 15000, size=1000)

# Column order follows the insertion order above. The row index is left out
# of the file so the CSV contains only the six data columns.
df = pd.DataFrame(data)
df.to_csv('flight_prices.csv', index=False)

# Reload the dataset from disk so the rest of the script works from the CSV
# file rather than from the in-memory frame that produced it.
df = pd.read_csv('flight_prices.csv')

# Quick look at the raw data: the first rows, summary statistics for the
# numeric columns, and the number of missing values in each column.
print(df.head())
print(df.describe())
print(df.isnull().sum())

# One-hot encode the three text columns across the whole dataset.
# drop_first=True omits the first category of each column, so that category
# is represented by all of its sibling indicator columns being false/zero.
categorical_columns = ['Airline', 'Source', 'Destination']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Show the frame again so the new indicator columns are visible.
print(df.head())

# Separate the target ('Price') from the feature columns.
X = df.drop(columns='Price')
y = df['Price']

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary least-squares model on the training split only.
# fit() returns the estimator itself, so construction and fitting can chain.
model = LinearRegression().fit(X_train, y_train)

# Persist the fitted model so it can be loaded later without retraining.
with open('linear_regression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Predict prices for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions against the true test prices: mean absolute error,
# mean squared error, and the R^2 coefficient of determination.
mae, mse, r2 = (
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

# Report the three metrics, one per line.
report_lines = [
    f"MAE: {mae}",
    f"MSE: {mse}",
    f"R2 Score: {r2}",
]
print("\n".join(report_lines))

# List the feature columns the model was trained on (after one-hot encoding).
print(X.columns)


     Airline  Source Destination  Duration  Total_Stops  Price
0  Airline_C  City_C      City_E         6            4   4505
1  Airline_A  City_C      City_H         3            3   9460
2  Airline_C  City_C      City_G         7            0  14767
3  Airline_C  City_C      City_F         4            0   8167
4  Airline_A  City_D      City_H         9            3  13147
          Duration  Total_Stops         Price
count  1000.000000  1000.000000   1000.000000
mean      4.971000     1.953000   8925.219000
std       2.552778     1.429978   3527.365109
min       1.000000     0.000000   3016.000000
25%       3.000000     1.000000   5853.250000
50%       5.000000     2.000000   8891.000000
75%       7.000000     3.000000  12083.750000
max       9.000000     4.000000  14994.000000
Airline        0
Source         0
Destination    0
Duration       0
Total_Stops    0
Price          0
dtype: int64
   Duration  Total_Stops  Price  Airline_Airline_B  Airline_Airline_C  \
0         6         