In [2]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split


In [3]:
# Columns in the .dat file are separated by runs of whitespace, so the
# separator is the regular expression \s+ instead of a comma.
# It is written as a raw string: in a normal string literal "\s" is an invalid
# escape sequence, which Python reports with a warning when the cell is compiled.
data = pd.read_csv("../data/delivery.2024.04.16.dat", sep=r"\s+")

  data = pd.read_csv("../data/delivery.2024.04.16.dat", delimiter='\s+')


In [4]:
# Number of rows per event tag, most frequent first.
tag_counts = data["Tag"].value_counts()
tag_counts

Tag
eq     5837
mis    2043
ex     1540
gis     390
Name: count, dtype: int64

In [5]:
# Remove gis examples.
# A boolean mask is used instead of feeding the positions returned by np.where
# into DataFrame.drop: drop() interprets its argument as index *labels*, which
# only coincide with row positions while the index is an untouched 0..n-1 range.
# .copy() yields an independent frame so later column assignments on `data`
# do not operate on a slice of the original frame.
data = data.loc[data["Tag"] != "gis"].copy()
data["Tag"].value_counts()

Tag
eq     5837
mis    2043
ex     1540
Name: count, dtype: int64

In [6]:
# Names of the 13 feature columns: F01, F02, ..., F13 (zero-padded to two digits).
feature_col_names = [f"F{number:02d}" for number in range(1, 14)]

In [7]:
# The placeholder string "X" in the feature columns is swapped for NaN so that
# every feature column can be stored as float.
data[feature_col_names] = (
    data[feature_col_names]
    .replace(to_replace='X', value=np.nan)
    .astype(float)
)

In [8]:
# Preview the first five rows after converting the feature columns to float.
data.head(n=5)

Unnamed: 0,Date,Time,Elat,Elon,Dep,Tag,F01,F02,F03,F04,F05,F06,F07,F08,F09,F10,F11,F12,F13
0,2012-10-01,16:51:31.20,39.141,-111.655,19.5,eq,0.22,,0.73,,-105.1,,14.36,,,,,,
1,2012-10-01,17:02:17.22,40.5,-112.155,-2.0,ex,0.1,,,,,,,,,,,,
2,2012-10-01,22:38:43.13,40.511,-112.175,-2.0,ex,-0.05,0.27,0.2,-51.4,-84.5,13.4,12.71,,,0.37,,,0.33
3,2012-10-02,00:21:13.94,39.709,-113.274,-2.0,ex,,0.14,1.1,-69.4,-110.1,13.02,11.08,,,,,,
4,2012-10-02,18:49:25.22,40.514,-112.161,-2.0,ex,0.0,0.27,0.37,-101.8,-114.7,14.35,12.31,0.14,0.19,0.26,0.4,-0.09,0.21


In [9]:
# Encode the string tags as integer class labels.
le = LabelEncoder()
le.fit(data['Tag'])
print("Original Classes", le.classes_)
print("Mapping", le.transform(le.classes_))
# Replace the fitted (sorted) class order so that ex -> 0, eq -> 1, mis -> 2.
le.classes_ = np.array(['ex', 'eq', 'mis'])
print("Updated Classes", le.classes_)
print("Mapping", le.transform(le.classes_))
y = le.transform(data['Tag'])
# Overriding classes_ with an unsorted array is not a documented LabelEncoder
# feature, so whether transform() honours the new order depends on the
# installed scikit-learn version. The round trip below fails loudly if the
# integer codes do not decode back to the original tags.
assert list(le.inverse_transform(y)) == list(data['Tag']), (
    "LabelEncoder did not honour the overridden class order"
)
# Number of examples per encoded class, ordered by class code.
y_cnts = np.unique(y, return_counts=True)[1]
print(y_cnts)

Original Classes ['eq' 'ex' 'mis']
Mapping [0 1 2]
Updated Classes ['ex' 'eq' 'mis']
Mapping [0 1 2]
[1540 5837 2043]


In [10]:
# Store the integer class label next to the original tag as a new column "y".
data["y"] = y

In [11]:
# Preview the first five rows, now including the "y" label column.
data.head(n=5)

Unnamed: 0,Date,Time,Elat,Elon,Dep,Tag,F01,F02,F03,F04,F05,F06,F07,F08,F09,F10,F11,F12,F13,y
0,2012-10-01,16:51:31.20,39.141,-111.655,19.5,eq,0.22,,0.73,,-105.1,,14.36,,,,,,,1
1,2012-10-01,17:02:17.22,40.5,-112.155,-2.0,ex,0.1,,,,,,,,,,,,,0
2,2012-10-01,22:38:43.13,40.511,-112.175,-2.0,ex,-0.05,0.27,0.2,-51.4,-84.5,13.4,12.71,,,0.37,,,0.33,0
3,2012-10-02,00:21:13.94,39.709,-113.274,-2.0,ex,,0.14,1.1,-69.4,-110.1,13.02,11.08,,,,,,,0
4,2012-10-02,18:49:25.22,40.514,-112.161,-2.0,ex,0.0,0.27,0.37,-101.8,-114.7,14.35,12.31,0.14,0.19,0.26,0.4,-0.09,0.21,0


In [12]:
# Shape of the subset of "mis" events located south of latitude 39.1.
data.loc[data["Tag"].eq("mis") & data["Elat"].lt(39.1)].shape

(15, 20)

In [13]:
# Split the rows into those with every feature present and those with at
# least one missing (NaN) feature value.
has_missing_feature = data[feature_col_names].isna().any(axis=1)
all_feat_data = data[~has_missing_feature]
other_data = data[has_missing_feature]
print(all_feat_data.shape, other_data.shape)

(964, 20) (8456, 20)


In [16]:
# Fraction of each class among the rows that have all features present.
all_feat_class_counts = np.unique(all_feat_data["y"], return_counts=True)[1]
all_feat_class_counts / len(all_feat_data)

array([0.15456432, 0.72925311, 0.11618257])

In [17]:
# Fraction of each class among the rows with at least one missing feature.
other_class_counts = np.unique(other_data["y"], return_counts=True)[1]
other_class_counts / len(other_data)

array([0.16449858, 0.60714286, 0.22835856])

In [11]:
# Stratified 80/20 split over row *positions* (not index labels), so the
# resulting arrays are meant to be used with positional indexing.
row_positions = np.arange(len(data))
train_inds, test_inds = train_test_split(
    row_positions,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [12]:
# Materialise the two splits by positional lookup of the sampled row positions.
train_df, test_df = data.iloc[train_inds], data.iloc[test_inds]
print(train_df.shape, test_df.shape)

(7536, 20) (1884, 20)


In [13]:
# Training rows tagged "mis" that lie south of latitude 39.1.
is_special_mis = train_df["Tag"].eq("mis") & train_df["Elat"].lt(39.1)
special_mis = train_df[is_special_mis]
special_mis.shape

(11, 20)

In [14]:
# Move the special "mis" rows out of the training split and append them to
# the test split. The rows are removed from train_df by index label.
train_df = train_df.drop(index=special_mis.index)
test_df = pd.concat([test_df, special_mis], axis=0)
print(train_df.shape, test_df.shape)

(7525, 20) (1895, 20)


In [15]:
# Summarise the final split sizes and compare the per-class fractions of each
# split with those of the full data set.
train_class_fractions = np.unique(train_df['y'], return_counts=True)[1] / len(train_df)
test_class_fractions = np.unique(test_df['y'], return_counts=True)[1] / len(test_df)
print('Training:', train_df.shape)
print('Testing:', test_df.shape)
print("Actual class percentage:", y_cnts / len(y))
print('Training class percentage:', train_class_fractions)
print('Testing class percentage:', test_class_fractions)

Training: (7525, 20)
Testing: (1895, 20)
Actual class percentage: [0.16348195 0.61963907 0.21687898]
Training class percentage: [0.16372093 0.62059801 0.21568106]
Testing class percentage: [0.16253298 0.61583113 0.22163588]


In [16]:
# Write both splits to CSV; the DataFrame index is written as the first column.
for split_frame, csv_path in (
    (train_df, "../data/train.2024.04.16.csv"),
    (test_df, "../data/test.2024.04.16.csv"),
):
    split_frame.to_csv(csv_path)