In [1]:
import pandas as pd
import numpy as np

# Render DataFrames as plain text rather than HTML tables in cell output.
pd.options.display.notebook_repr_html = False


In [2]:
#
# Load the CSV file from a GitHub repository
#
truck_events = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/jdvelasq/datalabs/master/datasets/drivers/truck_event_text_partition.csv",
    sep=",",
    thousands=None,
    decimal=".",
)

#
# Total number of records read
#
truck_events.shape[0]

17075

In [3]:
#
# Size of the table as a (rows, columns) tuple
#
truck_events.shape

(17075, 12)

In [4]:
#
# Column labels of the table
#
truck_events.columns

Index(['driverId', 'truckId', 'eventTime', 'eventType', 'longitude',
       'latitude', 'eventKey', 'CorrelationId', 'driverName', 'routeId',
       'routeName', 'eventDate'],
      dtype='object')

In [5]:
#
# Column names as a plain list, in sorted order
#
sorted(truck_events.columns.tolist())

['CorrelationId',
 'driverId',
 'driverName',
 'eventDate',
 'eventKey',
 'eventTime',
 'eventType',
 'latitude',
 'longitude',
 'routeId',
 'routeName',
 'truckId']

In [6]:
#
# Row labels (the index of the table)
#
truck_events.index

RangeIndex(start=0, stop=17075, step=1)

In [7]:
#
# Cell values as a NumPy array. `to_numpy()` is the accessor that the
# pandas documentation recommends over the older `.values` attribute.
#
truck_events.to_numpy()

array([[14, 25, '59:21.4', ..., 160405074,
        'Joplin to Kansas City Route 2', '2016-05-27-22'],
       [18, 16, '59:21.7', ..., 1565885487,
        'Springfield to KC Via Hanibal', '2016-05-27-22'],
       [27, 105, '59:21.7', ..., 1325562373,
        'Springfield to KC Via Columbia Route 2', '2016-05-27-22'],
       ...,
       [18, 49, '12:23.7', ..., 1565885487,
        'Springfield to KC Via Hanibal', '2016-06-02-20'],
       [10, 39, '12:23.8', ..., 1390372503, 'Saint Louis to Tulsa',
        '2016-06-02-20'],
       [19, 100, '12:24.0', ..., 1962261785,
        'Wichita to Little Rock.kml', '2016-06-02-20']], dtype=object)

In [8]:
#
# Summary statistics of the numeric columns.
# NOTE: identifier-like columns (driverId, truckId, routeId, CorrelationId)
# are stored as numbers, so they are summarized as well.
#
truck_events.describe()

           driverId       truckId     longitude      latitude  CorrelationId  \
count  17075.000000  17075.000000  17075.000000  17075.000000   1.707500e+04   
mean      21.043104     55.424480    -91.798316     38.846300   5.373716e+17   
std        6.612549     28.936049      2.236992      2.378946   1.295419e+18   
min       10.000000     10.000000    -97.370000     34.750000   1.000000e+03   
25%       15.000000     27.000000    -93.580000     37.030000   1.000000e+03   
50%       21.000000     55.000000    -91.380000     38.970000   1.000000e+03   
75%       27.000000     81.000000    -90.150000     41.540000   1.000000e+03   
max       32.000000    109.000000    -87.660000     42.250000   3.660000e+18   

            routeId  
count  1.707500e+04  
mean   1.034766e+09  
std    6.233880e+08  
min    2.492948e+07  
25%    3.711828e+08  
50%    1.198243e+09  
75%    1.565885e+09  
max    1.962262e+09  

In [9]:
#
# Head of the table: the first 10 rows
#
truck_events.head(10)

   driverId  truckId eventTime eventType  longitude  latitude  \
0        14       25   59:21.4    Normal     -94.58     37.03   
1        18       16   59:21.7    Normal     -89.66     39.78   
2        27      105   59:21.7    Normal     -90.21     38.65   
3        11       74   59:21.7    Normal     -90.20     38.65   
4        22       87   59:21.7    Normal     -90.04     35.19   
5        22       87   59:22.3    Normal     -90.37     35.21   
6        23       68   59:22.4    Normal     -89.91     40.86   
7        11       74   59:22.5    Normal     -89.74     39.10   
8        20       41   59:22.5    Normal     -93.36     41.69   
9        32       42   59:22.5    Normal     -90.37     35.21   

                     eventKey  CorrelationId       driverName     routeId  \
0   14|25|9223370572464814373   3.660000e+18       Adis Cesir   160405074   
1   18|16|9223370572464814089   3.660000e+18        Grant Liu  1565885487   
2  27|105|9223370572464814070   3.660000e+18  Mark Lo

In [10]:
#
# Tail of the table: the last 10 rows
#
truck_events.tail(10)

       driverId  truckId eventTime eventType  longitude  latitude  \
17065        21      105   12:22.0    Normal     -91.38     34.83   
17066        11       27   12:22.2    Normal     -89.74     39.10   
17067        30       95   12:23.8    Normal     -89.74     39.10   
17068        31       48   12:22.5    Normal     -89.60     41.76   
17069        22       24   12:22.9    Normal     -91.32     41.71   
17070        11       27   12:23.1    Normal     -90.20     38.65   
17071        16       46   12:24.0    Normal     -94.35     38.33   
17072        18       49   12:23.7    Normal     -90.52     39.71   
17073        10       39   12:23.8    Normal     -93.34     37.21   
17074        19      100   12:24.0    Normal     -97.37     36.79   

                         eventKey  CorrelationId         driverName  \
17065  21|105|9223370571956433811         1000.0       Jeff Markham   
17066   11|27|9223370571956433631         1000.0     Jamie Engesser   
17067   30|95|9223370571956

In [12]:
#
# A slice from the middle of the table: the last 10 of the first 200 rows
#
truck_events.iloc[:200].iloc[-10:]

     driverId  truckId eventTime eventType  longitude  latitude  \
190        10       85   59:50.1    Normal     -95.99     36.17   
191        26       57   59:50.2    Normal     -95.99     36.17   
192        25       96   59:50.7    Normal     -90.07     35.68   
193        24       97   59:50.7    Normal     -89.74     39.10   
194        21      109   59:50.8    Normal     -94.46     37.16   
195        27      105   59:50.9    Normal     -93.20     38.98   
196        26       57   59:50.9    Normal     -95.99     36.17   
197        32       42   59:51.4    Normal     -91.38     34.83   
198        24       97   59:51.5    Normal     -89.63     39.84   
199        12      104   59:51.6    Normal     -90.07     35.10   

                       eventKey  CorrelationId            driverName  \
190   10|85|9223370572464785725   3.660000e+18     George Vetticaden   
191   26|57|9223370572464785656   3.660000e+18          Michael Aube   
192   25|96|9223370572464785126   3.660000e+18

In [15]:
# Display a single column (its first 10 values)

truck_events["routeName"].head(10)

0             Joplin to Kansas City Route 2
1             Springfield to KC Via Hanibal
2    Springfield to KC Via Columbia Route 2
3             Saint Louis to Memphis Route2
4             Saint Louis to Chicago Route2
5             Saint Louis to Chicago Route2
6             Joplin to Kansas City Route 2
7             Saint Louis to Memphis Route2
8             Des Moines to Chicago Route 2
9            Peoria to Ceder Rapids Route 2
Name: routeName, dtype: object

In [16]:
# Take a subset of records: the first 10 rows, selected by position

truck_events_subset = truck_events.iloc[:10]
truck_events_subset

   driverId  truckId eventTime eventType  longitude  latitude  \
0        14       25   59:21.4    Normal     -94.58     37.03   
1        18       16   59:21.7    Normal     -89.66     39.78   
2        27      105   59:21.7    Normal     -90.21     38.65   
3        11       74   59:21.7    Normal     -90.20     38.65   
4        22       87   59:21.7    Normal     -90.04     35.19   
5        22       87   59:22.3    Normal     -90.37     35.21   
6        23       68   59:22.4    Normal     -89.91     40.86   
7        11       74   59:22.5    Normal     -89.74     39.10   
8        20       41   59:22.5    Normal     -93.36     41.69   
9        32       42   59:22.5    Normal     -90.37     35.21   

                     eventKey  CorrelationId       driverName     routeId  \
0   14|25|9223370572464814373   3.660000e+18       Adis Cesir   160405074   
1   18|16|9223370572464814089   3.660000e+18        Grant Liu  1565885487   
2  27|105|9223370572464814070   3.660000e+18  Mark Lo

In [17]:
# Take a subset of columns: all rows, three named columns

specific_columns = truck_events_subset.loc[:, ["driverId", "eventTime", "eventType"]]
specific_columns

   driverId eventTime eventType
0        14   59:21.4    Normal
1        18   59:21.7    Normal
2        27   59:21.7    Normal
3        11   59:21.7    Normal
4        22   59:21.7    Normal
5        22   59:22.3    Normal
6        23   59:22.4    Normal
7        11   59:22.5    Normal
8        20   59:22.5    Normal
9        32   59:22.5    Normal

In [18]:
#
# Column selection by exact name
#
truck_events_subset.filter(
    items=["driverId", "eventTime", "eventType"],
    axis="columns",
)

   driverId eventTime eventType
0        14   59:21.4    Normal
1        18   59:21.7    Normal
2        27   59:21.7    Normal
3        11   59:21.7    Normal
4        22   59:21.7    Normal
5        22   59:22.3    Normal
6        23   59:22.4    Normal
7        11   59:22.5    Normal
8        20   59:22.5    Normal
9        32   59:22.5    Normal

In [19]:
#
# Select the columns whose name ends in 'e'
#
truck_events_subset.filter(regex="e$", axis="columns")

  eventTime eventType  longitude  latitude       driverName  \
0   59:21.4    Normal     -94.58     37.03       Adis Cesir   
1   59:21.7    Normal     -89.66     39.78        Grant Liu   
2   59:21.7    Normal     -90.21     38.65  Mark Lochbihler   
3   59:21.7    Normal     -90.20     38.65   Jamie Engesser   
4   59:21.7    Normal     -90.04     35.19    Nadeem Asghar   
5   59:22.3    Normal     -90.37     35.21    Nadeem Asghar   
6   59:22.4    Normal     -89.91     40.86        Adam Diaz   
7   59:22.5    Normal     -89.74     39.10   Jamie Engesser   
8   59:22.5    Normal     -93.36     41.69     Chris Harris   
9   59:22.5    Normal     -90.37     35.21   Ryan Templeton   

                                routeName      eventDate  
0           Joplin to Kansas City Route 2  2016-05-27-22  
1           Springfield to KC Via Hanibal  2016-05-27-22  
2  Springfield to KC Via Columbia Route 2  2016-05-27-22  
3           Saint Louis to Memphis Route2  2016-05-27-22  
4          

In [20]:
#
# The isin operator: keep the rows whose driverName is in the given list
#
truck_events_subset.loc[
    truck_events_subset["driverName"].isin(["Jamie Engesser", "Chris Harris"])
]

   driverId  truckId eventTime eventType  longitude  latitude  \
3        11       74   59:21.7    Normal     -90.20     38.65   
7        11       74   59:22.5    Normal     -89.74     39.10   
8        20       41   59:22.5    Normal     -93.36     41.69   

                    eventKey  CorrelationId      driverName     routeId  \
3  11|74|9223370572464814123   3.660000e+18  Jamie Engesser  1567254452   
7  11|74|9223370572464813355   3.660000e+18  Jamie Engesser  1567254452   
8  20|41|9223370572464813344   3.660000e+18    Chris Harris   160779139   

                       routeName      eventDate  
3  Saint Louis to Memphis Route2  2016-05-27-22  
7  Saint Louis to Memphis Route2  2016-05-27-22  
8  Des Moines to Chicago Route 2  2016-05-27-22  

In [23]:
#
# Select the columns whose name contains 'ent'
#
truck_events_subset.filter(like="ent", axis="columns")

  eventTime eventType                    eventKey      eventDate
0   59:21.4    Normal   14|25|9223370572464814373  2016-05-27-22
1   59:21.7    Normal   18|16|9223370572464814089  2016-05-27-22
2   59:21.7    Normal  27|105|9223370572464814070  2016-05-27-22
3   59:21.7    Normal   11|74|9223370572464814123  2016-05-27-22
4   59:21.7    Normal   22|87|9223370572464814101  2016-05-27-22
5   59:22.3    Normal   22|87|9223370572464813486  2016-05-27-22
6   59:22.4    Normal   23|68|9223370572464813450  2016-05-27-22
7   59:22.5    Normal   11|74|9223370572464813355  2016-05-27-22
8   59:22.5    Normal   20|41|9223370572464813344  2016-05-27-22
9   59:22.5    Normal   32|42|9223370572464813296  2016-05-27-22

In [24]:
# Take a subset of rows and columns in a single step.
# NOTE: .loc slices by label and includes the end label, so 0:10 yields
# rows 0 through 10 (11 rows), as the displayed output shows.

new_sub_set = truck_events.loc[0:10, ["driverId", "eventTime", "eventType"]]
new_sub_set

    driverId eventTime eventType
0         14   59:21.4    Normal
1         18   59:21.7    Normal
2         27   59:21.7    Normal
3         11   59:21.7    Normal
4         22   59:21.7    Normal
5         22   59:22.3    Normal
6         23   59:22.4    Normal
7         11   59:22.5    Normal
8         20   59:22.5    Normal
9         32   59:22.5    Normal
10        27   59:22.6    Normal

In [29]:
# Retrieve one particular record: the row at position 1

truck_events.iloc[1]

driverId                                    18
truckId                                     16
eventTime                              59:21.7
eventType                               Normal
longitude                               -89.66
latitude                                 39.78
eventKey             18|16|9223370572464814089
CorrelationId            3660000000000000000.0
driverName                           Grant Liu
routeId                             1565885487
routeName        Springfield to KC Via Hanibal
eventDate                        2016-05-27-22
Name: 1, dtype: object

In [30]:
# Retrieve one field of a particular record, using attribute-style access

truck_events.iloc[1].eventKey

'18|16|9223370572464814089'

In [31]:
# Retrieve the eventKey field of the row at position 1, using bracket (key) access
truck_events.iloc[1]["eventKey"]

'18|16|9223370572464814089'

In [32]:
# Write the table to disk as a CSV file (header row included, index omitted)

specific_columns.to_csv(
    path_or_buf="/tmp/specific-columns.csv",
    sep=",",
    header=True,
    index=False,
)


In [33]:
#
# List the contents of the /tmp/ directory.
# NOTE(review): this lists every entry in /tmp/, so the output differs
# from machine to machine.
#
!ls /tmp/

config-err-RC0Nua
hsperfdata_alejandro
mintUpdate
net-export
pyright-19964-018FdiG1a5AY
pyright-19964-duAi2jOPs57s
python-languageserver-cancellation
specific-columns.csv
ssh-twTg5m9yRY8O
systemd-private-78da81af9b3f4fea9780c60166aaf46a-colord.service-nmxvpj
systemd-private-78da81af9b3f4fea9780c60166aaf46a-geoclue.service-PYQKbg
systemd-private-78da81af9b3f4fea9780c60166aaf46a-ModemManager.service-xk26ef
systemd-private-78da81af9b3f4fea9780c60166aaf46a-systemd-logind.service-v1dgWi
systemd-private-78da81af9b3f4fea9780c60166aaf46a-systemd-resolved.service-OGq7hh
systemd-private-78da81af9b3f4fea9780c60166aaf46a-systemd-timesyncd.service-mny6ki
systemd-private-78da81af9b3f4fea9780c60166aaf46a-upower.service-NURRwi


In [34]:
#
# Print the full contents of the file that was written
#
!cat /tmp/specific-columns.csv

driverId,eventTime,eventType
14,59:21.4,Normal
18,59:21.7,Normal
27,59:21.7,Normal
11,59:21.7,Normal
22,59:21.7,Normal
22,59:22.3,Normal
23,59:22.4,Normal
11,59:22.5,Normal
20,59:22.5,Normal
32,59:22.5,Normal
