In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from math import radians, sin, cos, sqrt, atan2

In [3]:
# Read the labelled data from Train.csv and preview its first rows.
train = pd.read_csv('Train.csv')
train.head()

Unnamed: 0,Origin Airport Code,Destination Airport Code,Origin Latitude,Origin Longitude,Destination Latitude,Destination Longitude,Great Circle Distance,Timezone Difference,Continent Origin,Continent Destination,Route Popularity,Flight_Distance
0,ORG754,DST883,-0.283823,-43.312707,38.508357,24.024226,8076.863942,1,South America,Europe,397,8120.606391
1,ORG301,DST627,25.954354,82.253018,40.630074,145.859627,6000.224245,11,Asia,Asia,650,5998.108994
2,ORG501,DST681,1.386828,-81.540082,1.265511,82.773779,18246.26383,9,South America,Asia,655,18207.57227
3,ORG310,DST812,23.367517,34.904649,-24.384679,121.948601,10777.98486,-2,Africa,Australia,740,10813.85026
4,ORG608,DST878,3.328282,-16.364917,40.396593,45.871911,7442.778451,-2,Africa,Asia,173,7426.288895


In [5]:
# (rows, columns) of the training frame.
train.shape

(19800, 12)

In [7]:
# Count of missing values in each training column.
train.isna().sum()

Origin Airport Code         0
Destination Airport Code    0
Origin Latitude             0
Origin Longitude            0
Destination Latitude        0
Destination Longitude       0
Great Circle Distance       0
Timezone Difference         0
Continent Origin            0
Continent Destination       0
Route Popularity            0
Flight_Distance             0
dtype: int64

In [9]:
# Number of training rows that exactly repeat an earlier row.
train.duplicated().sum()

0

In [11]:
# Summary statistics for the numeric training columns.
train.describe()

Unnamed: 0,Origin Latitude,Origin Longitude,Destination Latitude,Destination Longitude,Great Circle Distance,Timezone Difference,Route Popularity,Flight_Distance
count,19800.0,19800.0,19800.0,19800.0,19800.0,19800.0,19800.0,19800.0
mean,13.028959,15.510501,12.828559,16.015789,9381.925584,-0.019949,500.831364,9381.825044
std,34.568745,87.097157,34.759755,87.635427,4465.510407,7.244641,289.370306,4465.911625
min,-54.960126,-169.997474,-54.958503,-169.979246,27.852369,-12.0,1.0,12.374404
25%,-18.519178,-55.595058,-18.703066,-54.926946,5921.496173,-6.0,249.0,5918.139295
50%,15.935886,15.594596,15.594577,15.700158,9489.546474,0.0,502.0,9489.917386
75%,43.835839,96.294855,43.79886,99.67492,12789.827235,6.0,751.0,12795.005312
max,69.998685,159.985565,69.9974,159.984764,19916.00968,12.0,1000.0,19963.92153


In [13]:
# Read the rows to be predicted from Test.csv and preview its first rows.
test = pd.read_csv('Test.csv')
test.head()

Unnamed: 0,Origin Airport Code,Destination Airport Code,Origin Latitude,Origin Longitude,Destination Latitude,Destination Longitude,Great Circle Distance,Timezone Difference,Continent Origin,Continent Destination,Route Popularity,Flight_Distance
0,ORG505,DST811,23.961737,-9.132258,-11.939821,118.007422,14299.10883,4,Africa,Australia,707,
1,ORG962,DST897,-31.608154,-70.013348,58.773542,-96.174442,10338.29288,-2,South America,North America,62,
2,ORG250,DST735,68.182636,-57.411336,47.805394,-122.086745,4157.00616,-4,North America,North America,137,
3,ORG773,DST210,61.895925,-12.118972,-17.194037,152.557524,14901.17634,-9,Europe,Australia,714,
4,ORG682,DST368,35.380647,-23.815483,57.343073,136.035111,9532.806882,-4,Europe,Asia,769,


In [15]:
# (rows, columns) of the test frame.
test.shape

(200, 12)

In [17]:
# Count of missing values in each test column.
test.isna().sum()

Origin Airport Code           0
Destination Airport Code      0
Origin Latitude               0
Origin Longitude              0
Destination Latitude          0
Destination Longitude         0
Great Circle Distance         0
Timezone Difference           0
Continent Origin              0
Continent Destination         0
Route Popularity              0
Flight_Distance             200
dtype: int64

In [19]:
# Number of test rows that exactly repeat an earlier row.
test.duplicated().sum()

0

In [23]:
# Stack train on top of test so feature engineering is applied to both at once.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the test
# rows keep labels 0..199, which duplicate labels already used by train rows and
# make any label-based lookup on `combined` ambiguous.
combined = pd.concat([train, test], axis=0, ignore_index=True)
combined.head()

Unnamed: 0,Origin Airport Code,Destination Airport Code,Origin Latitude,Origin Longitude,Destination Latitude,Destination Longitude,Great Circle Distance,Timezone Difference,Continent Origin,Continent Destination,Route Popularity,Flight_Distance
0,ORG754,DST883,-0.283823,-43.312707,38.508357,24.024226,8076.863942,1,South America,Europe,397,8120.606391
1,ORG301,DST627,25.954354,82.253018,40.630074,145.859627,6000.224245,11,Asia,Asia,650,5998.108994
2,ORG501,DST681,1.386828,-81.540082,1.265511,82.773779,18246.26383,9,South America,Asia,655,18207.57227
3,ORG310,DST812,23.367517,34.904649,-24.384679,121.948601,10777.98486,-2,Africa,Australia,740,10813.85026
4,ORG608,DST878,3.328282,-16.364917,40.396593,45.871911,7442.778451,-2,Africa,Asia,173,7426.288895


In [25]:
# (rows, columns) of the stacked train + test frame.
combined.shape

(20000, 12)

In [27]:
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Applies the haversine formula on a sphere of radius 6371 km. Every
    argument is a latitude or longitude in degrees and may be a scalar or an
    array-like (list, NumPy array, pandas Series). Array-like arguments are
    combined element-wise following NumPy broadcasting rules.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.

    Returns
    -------
    float or array
        Distance(s) in kilometres; a NumPy float for scalar inputs, otherwise
        an array-like with the broadcast shape of the inputs.
    """
    R = 6371  # Earth radius in kilometers
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points; clamp it so sqrt(1 - a) never sees a negative.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

In [29]:
# Haversine distance (km) for every route. Walking the four coordinate columns
# in lockstep with zip avoids DataFrame.apply(axis=1), which materialises a
# Series object for each row before calling the function.
combined["Haversine_Distance"] = [
    haversine(lat1, lon1, lat2, lon2)
    for lat1, lon1, lat2, lon2 in zip(
        combined["Origin Latitude"],
        combined["Origin Longitude"],
        combined["Destination Latitude"],
        combined["Destination Longitude"],
    )
]

In [31]:
# 1 when origin and destination share a continent, otherwise 0.
continents_match = combined["Continent Origin"].eq(combined["Continent Destination"])
combined["Same_Continent"] = continents_match.astype(int)

In [49]:
# Split the engineered frame back into its train and test parts. The boundary
# comes from len(train) rather than a hard-coded row count, so the split stays
# correct if Train.csv changes size. .copy() makes both parts independent of
# `combined`, so later edits to them cannot raise SettingWithCopyWarning.
n_train = len(train)
newtrain = combined.iloc[:n_train].copy()
newtest = combined.iloc[n_train:].copy()

In [51]:
# (rows, columns) of the training part after feature engineering.
newtrain.shape

(19800, 14)

In [53]:
# Remove the target column from the rows to be predicted. errors='ignore' keeps
# this cell safe to re-run: once the column is gone, a second run is a no-op
# instead of a KeyError.
newtest = newtest.drop(columns='Flight_Distance', errors='ignore')
newtest.shape

(200, 13)

In [55]:
# Model inputs: raw coordinates, the supplied distance/timezone/popularity
# columns, both continent labels, and the two engineered features.
feature_cols = [
    "Origin Latitude",
    "Origin Longitude",
    "Destination Latitude",
    "Destination Longitude",
    "Great Circle Distance",
    "Timezone Difference",
    "Continent Origin",
    "Continent Destination",
    "Route Popularity",
    "Haversine_Distance",
    "Same_Continent",
]
X = newtrain[feature_cols]
# Regression target.
y = newtrain["Flight_Distance"]

In [57]:
# Columns to one-hot encode.
cat_cols = ["Continent Origin", "Continent Destination"]
# Everything else is numeric. Built by filtering X.columns so the order follows
# the frame; a set difference has hash-dependent ordering that can change
# between kernel sessions and alter the feature order the model sees.
num_cols = [col for col in X.columns if col not in cat_cols]

In [59]:
# Standardise the numeric columns and one-hot encode the continent columns
# (first level of each dropped).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(drop='first'), cat_cols),
    ]
)

In [61]:
# Preprocessing followed by a random-forest regressor with a fixed seed.
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", RandomForestRegressor(random_state=42)),
    ]
)

In [63]:
# Hold out 20% of the labelled rows for validation, using a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [65]:
# Fit the preprocessing steps and the model on the training split.
pipeline.fit(X_train, y_train)

In [67]:
# Score the held-out split: root of the mean squared error.
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [69]:
# Validation RMSE, in the same units as Flight_Distance.
print(rmse)

31.855602181356566


In [71]:
# Final fit uses every labelled row; X_test is now the unlabelled Test.csv part.
# Note this rebinds X_train / y_train / X_test from the validation split above.
final_features = [
    "Origin Latitude",
    "Origin Longitude",
    "Destination Latitude",
    "Destination Longitude",
    "Great Circle Distance",
    "Timezone Difference",
    "Continent Origin",
    "Continent Destination",
    "Route Popularity",
    "Haversine_Distance",
    "Same_Continent",
]
X_train = newtrain[final_features]
y_train = newtrain["Flight_Distance"]
X_test = newtest[final_features]

In [73]:
# Columns to one-hot encode.
cat_cols = ["Continent Origin", "Continent Destination"]
# Everything else is numeric. Built by filtering X_train.columns so the order
# follows the frame; a set difference has hash-dependent ordering that can
# change between kernel sessions and alter the feature order the model sees.
num_cols = [col for col in X_train.columns if col not in cat_cols]

In [75]:
# Fresh, unfitted preprocessing for the final model: scale the numeric columns,
# one-hot encode the continent columns (first level of each dropped).
numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = ("cat", OneHotEncoder(drop='first'), cat_cols)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

In [77]:
# Fresh, unfitted pipeline for the final model (same configuration as the
# validation pipeline).
final_model = RandomForestRegressor(random_state=42)
pipeline = Pipeline([("preprocess", preprocessor), ("model", final_model)])

In [79]:
# Train on all labelled rows, then predict the unlabelled test rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [81]:
# One-column frame of predictions, in the same row order as the test rows.
solution = pd.DataFrame({'Flight_Distance': y_pred})
solution.head()

Unnamed: 0,Flight_Distance
0,14300.622621
1,10320.165325
2,4183.835398
3,14908.100134
4,9538.532603


In [83]:
# Write the predictions to Solution.csv without the row index.
solution.to_csv('Solution.csv', index = False)