Pandas provides convenient functions to read and parse data from multiple file formats. Let's read sample data from CSV and JSON files.  

To install pandas:

!pip install --user pandas

In [5]:
import pandas as pd
import numpy as np

Pandas has two core data structures:

* Series (one-dimensional, labelled)
* DataFrame (two-dimensional, labelled rows and columns)

In [6]:
# Draw three floats uniformly from [0, 1). No seed is set, so the values change on every run.
arr = np.random.rand(3)

In [7]:
# A bare expression on the last line of a cell is rendered as its output.
arr

array([0.80471764, 0.90481852, 0.7781512 ])

In [8]:
# Wrap the 1-D array in a Series; no index is passed, so pandas labels the values 0, 1, 2.
arr_series = pd.Series(arr)

In [9]:
# The repr lists each label next to its value, followed by the element dtype.
arr_series

0    0.804718
1    0.904819
2    0.778151
dtype: float64

In [10]:
# The default index is a RangeIndex rather than a materialised list of labels.
arr_series.index

RangeIndex(start=0, stop=3, step=1)

In [11]:
# 3 rows x 4 columns of uniform random floats in [0, 1); unseeded, so values differ per run.
arr_1 = np.random.rand(3, 4)

In [12]:
# Show the raw 2-D array before it is wrapped in a DataFrame.
arr_1

array([[0.95264706, 0.85242037, 0.96192769, 0.13748336],
       [0.33618519, 0.85494335, 0.05883515, 0.04060172],
       [0.44984845, 0.61970134, 0.96812271, 0.55205722]])

In [13]:
# A 2-D array maps directly onto a DataFrame: array rows become frame rows, array columns become frame columns.
arr_1_dataframe = pd.DataFrame(arr_1)

In [14]:
# With no labels supplied, both rows and columns are numbered from 0.
arr_1_dataframe

Unnamed: 0,0,1,2,3
0,0.952647,0.85242,0.961928,0.137483
1,0.336185,0.854943,0.058835,0.040602
2,0.449848,0.619701,0.968123,0.552057


In [15]:
# A 1-D array becomes a single-column DataFrame (one row per element).
arr_dataframe = pd.DataFrame(arr)

In [16]:
# The lone column carries the default label 0.
arr_dataframe

Unnamed: 0,0
0,0.804718
1,0.904819
2,0.778151


In [17]:
# Column labels are stored as an index object too; here it is a default RangeIndex of length 1.
arr_dataframe.columns

RangeIndex(start=0, stop=1, step=1)

In [18]:
# Assigning to .columns relabels the existing frame in place; the list must have one entry per column.
arr_dataframe.columns = ["Value"]

In [19]:
# Same data as before, with the column now labelled "Value".
arr_dataframe

Unnamed: 0,Value
0,0.804718
1,0.904819
2,0.778151


In [20]:
# shape is a (rows, columns) tuple.
arr_dataframe.shape

(3, 1)

## Reading CSV Files

Data from https://www.kaggle.com/spscientist/students-performance-in-exams

In [21]:
# Load the whole CSV. The header row supplies the column names and pandas adds a default integer index.
# The path is relative to the directory the notebook kernel runs in.
df = pd.read_csv('data/StudentsPerformance.csv')

In [22]:
# Long frames are abbreviated when displayed: the middle rows are elided.
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


## Pickle

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html

In [23]:
# Serialize the frame to disk in pickle format.
# Caution: only unpickle files from sources you trust, because loading a pickle can execute arbitrary code.
df.to_pickle('data/StudentsPerformance.pickle')

In [24]:
# head() returns the first five rows by default; writing the pickle did not modify df.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [25]:
# nrows=10 stops parsing after the first 10 data rows, which is handy for peeking at large files.
# Note: this rebinds df, replacing the full frame loaded earlier.
df = pd.read_csv('data/StudentsPerformance.csv', nrows=10)

In [26]:
# Only rows 0-9 are present now.
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


In [27]:
# Combine both limits: read just the first 10 rows, and only the gender and score columns.
df = pd.read_csv(
    'data/StudentsPerformance.csv',
    nrows=10,
    usecols=['gender', 'math score', 'reading score', 'writing score'],
)

In [28]:
# Ten rows, restricted to the four columns named in usecols.
df

Unnamed: 0,gender,math score,reading score,writing score
0,female,72,72,74
1,female,69,90,88
2,female,90,95,93
3,male,47,57,44
4,male,76,78,75
5,female,71,83,78
6,female,88,95,92
7,male,40,43,39
8,male,64,64,67
9,female,38,60,50


In [29]:
# Data source: https://www.kaggle.com/jpbulman/usa-dunkin-donuts-stores?select=dunkin.py

## Reading JSON Files

In [None]:
import json

# Parse the JSON file inside a context manager so the file handle is closed
# deterministically as soon as parsing finishes.
with open('data/dunkinDonuts.json') as json_file:
    record = json.load(json_file)

# Inspect the structure before flattening it: the container type, the
# top-level keys, and how many entries sit under 'data'.
print(type(record))
print(record.keys())
print(len(record['data']))
    

In [None]:
# Fields to extract from each entry; the order here becomes the column order of the DataFrame.
dunkinKeys = [
    'address',
    'address2',
    'city',
    'phonenumber',
    'county',
    'country',
    'sat_hours',
    'sun_hours',
    'distance',
]

In [None]:

# Build one row per entry under record['data'], taking the fields named in
# dunkinKeys in that order so every row lines up with the column labels.
# Iterating over the entries directly avoids index bookkeeping. An entry
# missing one of the keys raises KeyError (assuming entries are dicts)
# rather than being silently padded.
dunkinRecords = [
    [entry[key] for key in dunkinKeys]
    for entry in record['data']
]

    

In [None]:
import pandas as pd

# from_records treats each inner list as one row; `columns` supplies the labels.
# Note: this rebinds df, which previously held the student-performance data.
df = pd.DataFrame.from_records(data=dunkinRecords, columns=dunkinKeys)
df.head()

In [None]:
# The plain DataFrame constructor accepts the same list of row lists.
df = pd.DataFrame(data=dunkinRecords, columns=dunkinKeys)
df

## For more resources: https://pandas.pydata.org/docs/user_guide/index.html