# First Contact with Data

In [3]:
%matplotlib inline
import pandas as pd
import random
import matplotlib.pyplot as plt
import os
import practicum_utils as utils

In [76]:
# Collect the paths of the CSV files to inspect; the returned list is
# displayed as the cell output and `files` is reused by the cells below.
# NOTE(review): the helper lives in practicum_utils; presumably it only
# lists files under ./data/supply -- confirm against that module.
files = utils.get_loggi_files()
files

['./data/supply/availability_dist1_ano.csv',
 './data/supply/availability_dist2_ano.csv',
 './data/supply/itinerary_dist1_ano.csv',
 './data/supply/itinerary_dist2_ano.csv']

In [5]:
# Ordered from smallest to largest; the last entry is the cap for huge files.
_SIZE_UNITS = ('Bytes', 'KB', 'MB', 'GB', 'TB')


def _human_size(num_bytes):
    """Return `num_bytes` formatted as '<value with 2 decimals> <unit>'.

    The value is divided by 1024 until it drops below 1024 or the largest
    known unit (TB) is reached, so the unit list can never be exhausted.
    """
    size = float(num_bytes)
    unit_idx = 0
    while size >= 1024 and unit_idx < len(_SIZE_UNITS) - 1:
        size /= 1024
        unit_idx += 1
    return '{:.2f} {}'.format(size, _SIZE_UNITS[unit_idx])


for f in files:
    # Banner: file path centred in a 100-character line of asterisks.
    print('{:*^100}'.format(' [Filepath: {}] '.format(f)))
    print('Size: {}'.format(_human_size(os.stat(f).st_size)))

    # Only the header (and, if needed, the first data row) is read, so this
    # stays cheap even for multi-GB files. `with` guarantees the handle is
    # closed even if reading fails.
    with open(f, 'r') as h:
        line = h.readline().strip()
        cols = line.split(',')
        # An empty header field (e.g. a leading comma) means an unnamed column.
        n_empty = sum(1 for c in cols if len(c) == 0)
        if n_empty > 0:
            print('Warning: {} null column/s found'.format(n_empty))
            print('- 1st line:', line)
            print('- 2nd line:', h.readline().strip())

    # Report only the named columns.
    cols = [c.strip() for c in cols if len(c) > 0]
    print('Columns {}:\n{}'.format(len(cols), '\n'.join(cols)))
    print()

*********************** [Filepath: ./data/supply/availability_dist1_ano.csv] ***********************
Size: 830.32 MB
- 1st line: ,id,driver_id,itinerary_id,lat,lng,sent,transport_type
- 2nd line: 0,eaae5e04a259d09af85c108fe4d7dd0c,3d73b94a1b27fbac9fcf22ed243127e3,,-23.453137798633097,-46.52127405519188,19-10-22 13:35,1
Columns 7:
id
driver_id
itinerary_id
lat
lng
sent
transport_type

*********************** [Filepath: ./data/supply/availability_dist2_ano.csv] ***********************
Size: 1.86 GB
- 1st line: ,id,driver_id,itinerary_id,lat,lng,sent,transport_type
- 2nd line: 0,c133fb1bb634af68c5088f3438848bfd,e584bd1926df576b6c47ef4a319e432f,,-23.598108456029458,-46.599444750171024,19-10-22 13:35,1
Columns 7:
id
driver_id
itinerary_id
lat
lng
sent
transport_type

************************ [Filepath: ./data/supply/itinerary_dist1_ano.csv] *************************
Size: 5.26 MB
- 1st line: ,itinerary_id,driver_id,created,accepted,dropped,started,finished,status,total_distance,transport_ty

**First comments:**
- Availability files are big and will not be easy to work with using pandas  
  We should look into loading all the CSVs into a DB (maybe MySQL)
- All file headers start with an invalid character: '*,*' (comma)  
  We should check whether all rows have this character. It looks like some kind of 'auto-typo'
- It could be worthwhile to create a utils.py file to be used in all Jupyter notebooks and Python scripts

In [6]:
for f in files:
    # Banner: file path centred in a 100-character line of asterisks.
    print('{:*^100}'.format(' [Filepath: {}] '.format(f)))
    # Stream the file line by line so the large CSVs are never loaded into
    # memory. Every line is counted, including the header row. Summing over
    # the iterator yields 0 for an empty file instead of reusing a stale
    # (or undefined) loop index, and `with` closes the handle on error too.
    with open(f, 'r') as h:
        total_rows = sum(1 for _ in h)
    print('Total Rows:', total_rows)

*********************** [Filepath: ./data/supply/availability_dist1_ano.csv] ***********************
Total Rows: 6549240
*********************** [Filepath: ./data/supply/availability_dist2_ano.csv] ***********************
Total Rows: 14975949
************************ [Filepath: ./data/supply/itinerary_dist1_ano.csv] *************************
Total Rows: 20992
************************ [Filepath: ./data/supply/itinerary_dist2_ano.csv] *************************
Total Rows: 12425
