<a href="https://colab.research.google.com/github/misqualzarabi/DS-Unit-1-Sprint-1-Dealing-With-Data/blob/master/Ways_to_read_data_in_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Five ways to read your data into a colab notebook

[Keyboard shortcuts for colab](https://medium.com/@tuewithmorris/google-colab-notebooks-keyboard-shortcuts-aa6a008fb91b)

In [None]:
import pandas as pd
import numpy as np

### Method 1. Read data directly from a URL.

In [None]:
# Simplest approach: hand pandas a URL and let it fetch and parse the CSV.
# Data source: https://opendata.dc.gov/datasets/bike-trails/data
bike_trails_csv = 'https://opendata.arcgis.com/datasets/e8c2b7ef54fb43d9a2ed1b0b75d0a14d_4.csv'
bikes = pd.read_csv(bike_trails_csv)
bikes.head()

Unnamed: 0,OBJECTID,LENGTH,NAME,STATUS,MAINTENANC,Shape_Length,MILES,ROUTEID
0,1,827.476,National Mall Trails,Open,NPS,252.214952,0.0,
1,2,19225.404,Capital Crescent Trail,Open,,5859.91494,0.0,
2,3,25404.687,Rock Creek Trail,Open,NPS,7743.36424,0.0,
3,4,6201.712,Metropolitan Branch Trail,Open,DDOT,1890.28563,0.0,
4,5,12625.091,Watts Branch Trail,Open,DDOT,3848.135455,0.0,


In [None]:
# header=None tells pandas the file has no header row, so the column
# names are supplied explicitly through `names`.
flag_data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/flags/flag.data'
column_headers = (
    'name landmass zone area population language religion bars stripes colours '
    'red green blue gold white black orange mainhue circles crosses saltires '
    'quarters sunstars crescent triangle icon animate text topleft botright'
).split()
flags = pd.read_csv(flag_data_url, names=column_headers, header=None)
print(flags.shape)
flags.head()

(194, 30)


Unnamed: 0,name,landmass,zone,area,population,language,religion,bars,stripes,colours,red,green,blue,gold,white,black,orange,mainhue,circles,crosses,saltires,quarters,sunstars,crescent,triangle,icon,animate,text,topleft,botright
0,Afghanistan,5,1,648,16,10,2,0,3,5,1,1,0,1,1,1,0,green,0,0,0,0,1,0,0,1,0,0,black,green
1,Albania,3,1,29,3,6,6,0,0,3,1,0,0,1,0,1,0,red,0,0,0,0,1,0,0,0,1,0,red,red
2,Algeria,4,1,2388,20,8,2,2,0,3,1,1,0,0,1,0,0,green,0,0,0,0,1,1,0,0,0,0,green,white
3,American-Samoa,6,3,0,0,1,1,0,0,5,1,0,1,1,1,0,1,blue,0,0,0,0,0,0,1,1,1,0,blue,red
4,Andorra,3,1,0,0,6,0,3,0,3,1,0,1,1,0,0,0,gold,0,0,0,0,0,0,0,0,0,0,blue,red


In [None]:
# Let's try another example: the DC museums dataset.
# source: https://opendata.dc.gov/datasets/museums-in-dc/data
url='https://opendata.arcgis.com/datasets/2e65fc16edc3481989d2cc17e6f8c533_54.csv'
museums=pd.read_csv(url)
# Seed the sampler so the same two rows are shown on every run;
# an unseeded sample() makes the displayed output irreproducible.
museums.sample(2, random_state=42)

Unnamed: 0,OBJECTID,NAME,ALT_NAME,LABEL,MAR_MATCHADDRESS,MAR_XCOORD,MAR_YCOORD,MAR_LONGITUDE,MAR_LATITUDE,MARID
2,3,SPAGNUOLO ART GALLERY,GEORGETOWN UNIVERSITY,Spagnuolo Art Gallery,1221 36TH STREET NW,393918.52,137608.15,-77.070115,38.906311,300017
42,43,THE SAINT JOHN PAUL II NATIONAL SHRINE,,The Saint John Paul II National Shrine,3900 HAREWOOD ROAD NE,399595.57,141096.85,-77.004665,38.937759,288031


### Method 2. Read a file that is already in colab.

In [None]:
# Colab runtimes come with some built-in files under sample_data/.
housing_train_csv = 'sample_data/california_housing_train.csv'
cali = pd.read_csv(housing_train_csv)
cali.head(n=2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0


In [None]:
# You can also read a JSON file. Notice the difference in pandas method:
# JSON needs pd.read_json, not pd.read_csv.
anscombe = pd.read_json('sample_data/anscombe.json')
anscombe.head()

Unnamed: 0,Series,X,Y
0,I,10,8.04
1,I,8,6.95
2,I,13,7.58
3,I,9,8.81
4,I,11,8.33


In [None]:
# Find out where on the server this notebook is running.
from pathlib import Path

working_dir = Path.cwd()
working_dir


PosixPath('/content')

In [None]:
# Which directory sits above the working directory?
home = Path.cwd()
print(home.parent)
# Build the path to the sample_data sub-folder of the working directory.
data_dir = home / 'sample_data'
data_dir

/


PosixPath('/content/sample_data')

In [None]:
# Show what is inside the sample-data folder.
import os

folder_contents = os.listdir(data_dir)
folder_contents

['anscombe.json',
 'README.md',
 'mnist_test.csv',
 'mnist_train_small.csv',
 'california_housing_train.csv',
 'california_housing_test.csv']

In [None]:
# Keep the directory listing in a variable so entries can be picked out by index.
# os.listdir already returns a list, so no conversion is needed.
files = os.listdir(data_dir)
files[2]

'mnist_test.csv'

In [None]:
# Now you can read one of those files into colab using pandas.
# os.listdir() returns names in arbitrary order, so select the file by name
# rather than by its position in the listing.
file_path = data_dir / 'mnist_test.csv'
print(file_path)
# This file has no header row; header=None stops pandas from consuming the
# first data row as column names.
df = pd.read_csv(file_path, header=None)
df.head()

/content/sample_data/mnist_test.csv


Unnamed: 0,7,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.10,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.20,0.21,0.22,0.23,0.24,0.25,0.26,0.27,0.28,0.29,0.30,0.31,0.32,0.33,0.34,0.35,0.36,0.37,0.38,...,0.628,0.629,0.630,0.631,0.632,0.633,0.634,0.635,0.636,0.637,0.638,0.639,0.640,0.641,0.642,0.643,0.644,0.645,0.646,0.647,0.648,0.649,0.650,0.651,0.652,0.653,0.654,0.655,0.656,0.657,0.658,0.659,0.660,0.661,0.662,0.663,0.664,0.665,0.666,0.667
0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Method 3. Upload a file to colab.
Reminder: uploaded files are deleted when the runtime is recycled.

In [None]:
# You can also use the manual upload GUI over on the left of your screen.
# Import the module under an alias so it does not overwrite the `files`
# list of directory entries created earlier in the notebook.
from google.colab import files as colab_files
uploaded = colab_files.upload()

In [None]:
# Read the uploaded file. If it is missing, re-raise the same exception type
# with a hint about what to do instead of a bare "No such file" error.
try:
    df = pd.read_csv('abalone.csv')
except FileNotFoundError as err:
    raise FileNotFoundError(
        "abalone.csv was not found in the working directory - upload it first "
        "(run the upload cell above or use the Files pane on the left)."
    ) from err
df.head()

FileNotFoundError: ignored

### Method 4. Read a file that's saved on GitHub
https://github.com/austinlasseter/dash-virginia-counties

In [None]:
# Useful for reading raw data from GitHub: pass the raw.githubusercontent.com URL to pandas.
# Source: https://github.com/austinlasseter/dash-virginia-counties/blob/master/resources/acs2017_county_data.csv
url = ('https://raw.githubusercontent.com/austinlasseter/dash-virginia-counties/'
       'master/resources/acs2017_county_data.csv')
va = pd.read_csv(url)
print(va.shape)
va.describe()

### Method 5. Read a zip file.

In [None]:
# use the 'bang' for bash shell scripting.
# `pwd` prints the current working directory; `ls` lists its contents.
! pwd
! ls

In [None]:
# the 'wget' command reads content from the web.
!wget https://resources.lendingclub.com/LoanStats_2018Q4.csv.zip

In [None]:
# how do you unzip a .zip file in bash?
!unzip LoanStats_2018Q4.csv.zip
!ls

In [None]:
# Load the extracted CSV into pandas. skiprows=1 drops the first line of the
# file and skipfooter=2 drops the last two; skipfooter requires the python engine.
loans = pd.read_csv(
    'LoanStats_2018Q4.csv',
    engine='python',
    skiprows=1,
    skipfooter=2,
)
loans.shape

In [None]:
## Footnote: DataFrames can be written to disk and read back; pickle is one such format.
pickle_path = 'loans2.pkl'
loans.to_pickle(pickle_path)
new = pd.read_pickle(pickle_path)
new.head()

In [None]:
# CSV round trip: index=False keeps the row index out of the written file.
csv_path = 'cali2.csv'
cali.to_csv(csv_path, index=False)
new = pd.read_csv(csv_path)
new.shape