# Data Loading, Storage & File Formats

### Loading into Pandas DataFrame

In [1]:
import pandas as pd
import numpy as np

In [2]:
# a 'csv' file names 'sample' is loaded into Pandas DataFrame
# 'sample.csv' file contains a tabular data with column-names

sample_df = pd.read_csv("examples/sample.csv")

print(sample_df)

  studentid         name   chimestry   physics   english   math
0       A01     arif ali          20        30        40     50
1       A02  kamran khan          30        40        50     60
2       A03   imran ali           40        50        60     70
3       A04  kashif khan          50        60        70     80


In [3]:
# we can change the column names while loading data
# Here, customized column-names are passed to the loaded data but row-heading of 'sample.csv' file are treated as row index-0
# Which is technically not desirable

sample_df = pd.read_csv("examples/sample.csv",
                       names = ['StdID', 'StdName', 'Chem', 'Phy', 'Eng', 'Math'])

print(sample_df)

       StdID      StdName        Chem       Phy       Eng   Math
0  studentid         name   chimestry   physics   english   math
1        A01     arif ali          20        30        40     50
2        A02  kamran khan          30        40        50     60
3        A03   imran ali           40        50        60     70
4        A04  kashif khan          50        60        70     80


In [4]:
# To skip row-heading saved in the file, Pandas parameter 'skiprows' is used
# 'skiprows=1' is used to skip top-row of 'sample.csv' file

sample_df = pd.read_csv("examples/sample.csv", skiprows=1,
                       names = ['StdID', 'StdName', 'Chem', 'Phy', 'Eng', 'Math'])

print(sample_df)

  StdID      StdName  Chem  Phy  Eng  Math
0   A01     arif ali    20   30   40    50
1   A02  kamran khan    30   40   50    60
2   A03   imran ali     40   50   60    70
3   A04  kashif khan    50   60   70    80


In [5]:
# In case, a csv file has not headers in it.
# Then, during loading process we will provide a paramter 'header = None', so that Pandas itself add by-default column-indexes
# For example, a file 'sample1.csv' has no headers, so pandas will add column-indexes as under:

sample_df = pd.read_csv("examples/sample1.csv", header=None)

print(sample_df)

     0            1   2   3   4   5
0  A01     arif ali  20  30  40  50
1  A02  kamran khan  30  40  50  60
2  A03   imran ali   40  50  60  70
3  A04  kashif khan  50  60  70  80


### Hierarchal Index

***Hierachal Index means that at a time two columns are being used as 'index'***

In [6]:
# 'sample2.csv' file is load using default method, without passing any header / skiprows parameters

sample2_df = pd.read_csv("examples/sample2.csv")

print(sample2_df)

  key1 key2  value1  value2
0  one    a       1       2
1  one    b       3       4
2  one    c       5       6
3  one    d       7       8
4  two    a       9      10
5  two    b      11      12
6  two    c      13      14
7  two    d      15      16


In [7]:
# This csv file has four columns, first two are for keys, and next two are for values
# Now, we may adjust the keys coloumns separately and values columns separately
# We will specify that which column shoud be used as 'index' of rows
# To perform this action, we will use 'index_col' to specify our keys
# Now, it will be a more understandable format
# Observe it, that no defualt 'row-index' are displayed by Pandas

parsed = pd.read_csv("examples/sample2.csv",
                        index_col = ['key1', 'key2'])

print(parsed)

           value1  value2
key1 key2                
one  a          1       2
     b          3       4
     c          5       6
     d          7       8
two  a          9      10
     b         11      12
     c         13      14
     d         15      16


### Use of other than comma (,) delimiters are also possible:
    - In some cases, a table might not have a fixed delimiter, using whitespace or some other pattern to separate fields

In [8]:
# In this example, a file 'ex3.txt' is loaded into 'ex3_df' DataFrame

ex3_df = pd.read_csv("examples/ex3.txt")

print(ex3_df)

               A         B         C
0  aaa -0.264438 -1.026059 -0.619500
1  bbb  0.927272  0.302904 -0.032399
2  ccc -0.264273 -0.386314 -0.217601
3  ddd -0.871858 -0.348382  1.100491


In [9]:
# Here, customized delimiter is used i.e. " sep=\'s+' "

ex3_df = pd.read_csv("examples/ex3.txt", sep='\s+')

print(ex3_df)

            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491


### Handling Missing Values

In [10]:
# 'sample3.csv' file is loaded into 'sample3_df'
# Some, NA and some other missing fields/indexes will be loaded as 'NaN' into loaded DataFrame

sample3_df = pd.read_csv("examples/sample3.csv")

print(sample3_df)

  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo


In [11]:
# In case, the source data-file may have some fields like 'None', 'Non', 'N/A', 'Not Applicable' or 'nil' etc
# While loading into DataFrame, these fields are required to be converted in a symetric way
# To perform this action, we use    <  na_values=['NULL']  >
# Here, in place of 'NULL', we passed the possible values to be replaced with "NaN" through creating a 'sentinel' dictionery
# Thus, under heading 'message' word 'foo' has been replaced with 'NaN'

sentinels = {'message': ['foo', 'NA']}

new_sample3_df = pd.read_csv("examples/sample3.csv", na_values=sentinels)

print(new_sample3_df)

  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     NaN


#### Handling missing values is an important and frequently nuanced part of the file parsing process. Missing data is usually either not present (empty string) or marked by some sentinel value. By default, pandas uses a set of commonly occurring sentinels, such as NA and NULL

In [12]:
# Extracting selected rows from a large dataset

pd.read_csv("datasets/train.csv", nrows=200)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1526,0,2.1,0,1,1,23,0.2,117,7,...,718,751,2227,18,10,3,1,1,0,2
196,1989,0,2.5,1,0,1,41,0.8,94,3,...,1100,1497,1665,17,9,12,1,1,1,2
197,1308,0,1.9,0,0,1,61,0.7,106,3,...,59,1215,3355,15,2,4,1,0,1,3
198,609,0,0.5,0,3,0,26,0.3,93,4,...,938,1948,1866,11,10,14,1,1,1,1


In [13]:
# Created chunks to read a large dataset
# This will return an object type
# To read this object type, we have to create a loop

chunks = pd.read_csv("datasets/train.csv", chunksize=200)
# print(type(chunks), print(chunks))

chunkList = []
for chunk in chunks: 
#     each chunk will work as DataFrame
#     print(type(chunk), len(chunk))
#     print(chunk.head())    # this will print top-five rows of each chunk
    chunkList.append(chunk)
    
chunkList[0].head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


### Reading Text Files

In [14]:
# A text file is loaded

txt_data = list(open("examples/ex3.txt"))
txt_data

['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']

In [15]:
# use spaces (one or more ) as delimiter to load the text data in more clean way
print("* * without delimiter * *")
print(txt_data)
print()

print("* * with Using delimiter * *")
txt_df = pd.read_table("examples/ex3.txt", sep='\s+')
print(txt_df)

* * without delimiter * *
['            A         B         C\n', 'aaa -0.264438 -1.026059 -0.619500\n', 'bbb  0.927272  0.302904 -0.032399\n', 'ccc -0.264273 -0.386314 -0.217601\n', 'ddd -0.871858 -0.348382  1.100491\n']

* * with Using delimiter * *
            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491


In [16]:
# Another data exercise is loaded to perform exercises

print("* * * Complete Ex4 Dataset * * *")
ex4_df = pd.read_table("examples/ex4.csv")
print(ex4_df)
print()
# Another data is loaded to perform customized selection
print("* * * Customized Row-Indexes of Ex4 Dataset * * *")
sub_ex4_df = pd.read_table("examples/ex4.csv", skiprows=[0, 2, 3])

print(sub_ex4_df)

* * * Complete Ex4 Dataset * * *
                                              # hey!
0                                    a,b,c,d,message
1  # just wanted to make things more difficult fo...
2      # who reads CSV files with computers, anyway?
3                                      1,2,3,4,hello
4                                      5,6,7,8,world
5                                     9,10,11,12,foo

* * * Customized Row-Indexes of Ex4 Dataset * * *
  a,b,c,d,message
0   1,2,3,4,hello
1   5,6,7,8,world
2  9,10,11,12,foo


### Writing Data to Text Format

In [17]:
# 'sample3.csv' file again loaded into DataFram i.e. 'samples3_df'

samples3_df = pd.read_csv("examples/sample3.csv")
print(samples3_df)


  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo


In [18]:
# Data is cleaned by removing word 'foo' from 'message' column

sentinels = {'message': ['foo', 'NA']}

new_sample3_df = pd.read_csv("examples/sample3.csv", na_values=sentinels)
print(new_sample3_df)
print()

  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     NaN



In [19]:
# An action to write data to text format is performed after cleaning the data

new_sample3_df.to_csv("examples/Cleaned Data.csv")

In [20]:
import numpy as np

dates = pd.date_range('1/1/2000', periods = 7)
# print(dates)
ts = pd.Series(np.arange(7), index = dates)
print(ts)

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
Freq: D, dtype: int32


### Working with Delimited Formats

In [21]:
# Loaded dataset

import csv

myFile = open("examples/ex7.csv")
readFile = csv.reader(myFile)

for line in readFile:
    print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']


### JSON Data

In [22]:
# This object is similar to Python Dictionery, where 'key:value' pairs exist

obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
{"name": "Katie", "age": 38,
 "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""
print(obj)


{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
{"name": "Katie", "age": 38,
 "pets": ["Sixes", "Stache", "Cisco"]}]
}



In [23]:
# JSON file loaded

json_data = pd.read_json("examples/example.json")
print(json_data)

   a  b  c
0  1  2  3
1  4  5  6
2  7  8  9


In [24]:
import json

result = json.loads(obj)    # json.load() function is used to convert json file into python dictionery format

print(result)

{'name': 'Wes', 'places_lived': ['United States', 'Spain', 'Germany'], 'pet': None, 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']}, {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}


In [25]:
# json.dumps() is used to converts a Python object back to JSON

json_result = json.dumps(result)
print(result)

{'name': 'Wes', 'places_lived': ['United States', 'Spain', 'Germany'], 'pet': None, 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']}, {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}


#### To Convert a JSON Object or List of Objects to a DataFrame. You can pass a list of dicts (which were previously JSON objects) to the DataFrame constructor and select a sub‐set of the data fields

In [26]:
sibling = pd.DataFrame(result['siblings'], columns = ['name', 'age'])
print(sibling)

    name  age
0  Scott   30
1  Katie   38


In [27]:
placesLived = pd.DataFrame(result['places_lived'], columns =['Country'])
print(placesLived)

         Country
0  United States
1          Spain
2        Germany


### XML and HTML: Web data loading into DataFrame

In [28]:
# Importing tabular data from an html page
# pip install lxml
# pip install beautifulsoup4
# pip install html5lib

tables = pd.read_html("examples/fdic_failed_bank_list.html")
print(type(tables))

<class 'list'>


In [29]:
# This will display total_no_of_tables at html page

print(len(tables))

1


In [30]:
# created a DataFrame 'tailures_df' of table at index-0

failures_df = tables[0]
print(type(failures_df))    # this will display generator function showing type of DataFrame i.e. 'DataFrame'

<class 'pandas.core.frame.DataFrame'>


In [31]:
# This will display shape of DataFrame
# It shows no_of_rows and no_of_columns

print(failures_df.shape)

(547, 7)


In [32]:
# This will display column-names of DataFrame

print(failures_df.columns)

Index(['Bank Name', 'City', 'ST', 'CERT', 'Acquiring Institution',
       'Closing Date', 'Updated Date'],
      dtype='object')


In [33]:
# This will display row-indexes of table
# Total_no_of_rows = 547

len(failures_df)

547

In [34]:
# This will select top-five rows of 'city' column

print(failures_df["City"].head())

0           Mulberry
1           Woodbury
2    King of Prussia
3            Memphis
4          Milwaukee
Name: City, dtype: object


In [35]:
# This selects top-five rows of column-name 'Bank Name'

print(failures_df["Bank Name"].head())

0                     Allied Bank
1    The Woodbury Banking Company
2          First CornerStone Bank
3              Trust Company Bank
4      North Milwaukee State Bank
Name: Bank Name, dtype: object


In [36]:
# This will display top-five rows of 'failure-df' DataFrame with all seven columns of the table

print(failures_df.head())

                      Bank Name             City  ST   CERT  \
0                   Allied Bank         Mulberry  AR     91   
1  The Woodbury Banking Company         Woodbury  GA  11297   
2        First CornerStone Bank  King of Prussia  PA  35312   
3            Trust Company Bank          Memphis  TN   9956   
4    North Milwaukee State Bank        Milwaukee  WI  20364   

                 Acquiring Institution        Closing Date       Updated Date  
0                         Today's Bank  September 23, 2016  November 17, 2016  
1                          United Bank     August 19, 2016  November 17, 2016  
2  First-Citizens Bank & Trust Company         May 6, 2016  September 6, 2016  
3           The Bank of Fayette County      April 29, 2016  September 6, 2016  
4  First-Citizens Bank & Trust Company      March 11, 2016      June 16, 2016  


In [37]:
# for large and complicated dataframe, try to extract data as series for review and analysis
# here we converted a column into a new series of 'date' type

close_timestamps = pd.to_datetime(failures_df['Closing Date']) # 'to_datetime()' is used to make series i.e. close_timestamps 

print(type(close_timestamps), close_timestamps)  # it will first return type of 'close_timestamp' which will be 'Series' 
                                                 # and then will display top-five & bottom-five rows of 'closing date' column

<class 'pandas.core.series.Series'> 0     2016-09-23
1     2016-08-19
2     2016-05-06
3     2016-04-29
4     2016-03-11
         ...    
542   2001-07-27
543   2001-05-03
544   2001-02-02
545   2000-12-14
546   2000-10-13
Name: Closing Date, Length: 547, dtype: datetime64[ns]


### Binary Data Format

In [38]:
# First of all, 'binary_df' has been created by loading 'ex1.csv' file
# Then, DataFrame is converted into 'Binary Data Format' using Pandas 'to_pickle()' attribute with the name i.e 'frame_pickle'
# Lastly, binary file 'frame_pickle' is loaded to read contents using Pandas 'read_pickle' attribute

binary_df = pd.read_csv('examples/ex1.csv')
binary_df.to_pickle('examples/frame_pickle')
pd.read_pickle('examples/frame_pickle')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


### Using HDF Format (Hierarchical Data Format)

- HDF5 is a well-regarded file format intended for storing large quantities of scientific array data
- The “HDF” in HDF5 stands for hierarchical data format
- Each HDF5 file can store multiple datasets and support‐ing metadata
- Compared with simpler formats, HDF5 supports on-the-fly compres‐sion with a variety of compression modes, enabling data with repeated patterns to be stored more efficiently
- HDF5 can be a good choice for working with very large data‐sets that don’t fit into memory, as you can efficiently read and write small sections of much larger arrays
- Pandas provides a high-level interface that simplifies storing Series and DataFrame object
- The HDFStore class works like a dict and handles the low-level details

In [39]:
# Created Pandas DataFrame i.e. 'dFrame'
# This will store 100 random number generated by Numpy's random.randn function 
# and will automatically generate row-indexes from 0 till 99

dFrame = pd.DataFrame({'a': np.random.randn(100)})
dFrame

Unnamed: 0,a
0,0.026239
1,-3.154829
2,-1.383818
3,-1.596040
4,2.101755
...,...
95,0.168515
96,-0.365147
97,0.372911
98,1.663883


In [40]:
# Created a data-store using 'HDFStore()' function with the name 'mydata.h5' whose type is 'h5'

# pip install tables      # this command is used to install HDF tables

store = pd.HDFStore('mydata.h5')
store

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5

In [41]:
# Created an obj with the name 'obj1' to store the data of Pandas DataFrame i.e. 'dFrame'

store['obj1'] = dFrame

In [42]:
# Created another obj i.e. 'obj1_col' to store only a column of Pandas DataFrame i.e. 'dFrame'

store['obj1_col'] = dFrame['a']

In [43]:
# This will print the store which is our data-store in 'h5' format and carries 'obj1' & 'obj1_col'

print(store)

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5



In [44]:
x=store.obj1         # 'obj1' of 'store' is assigned to a variable 'x'
print(type(x))       # will display type of variable 'x'
print(x.head())      # will print (select) top-five rows of variable 'x' which is in fact 'obj1' of 'store'

<class 'pandas.core.frame.DataFrame'>
          a
0  0.026239
1 -3.154829
2 -1.383818
3 -1.596040
4  2.101755


In [45]:
# Here, similar to assingming Pandas DataFrame 'dFrame' to our HDF 'store', another type of assignment is done.
# In this case, rather than creating an 'object', a table format is created in the data-store with the data of 'dFrame'
# which will be in 'table' format
# 'table' format data-store support on-the-fly data-manipulation, filtering and sorting options
# Otherwise, like other csv file, first file is loaded and then manipulated
# This is great option provided by HDF 'table' format

store.put('obj2', dFrame, format='table')
print(store)

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5



In [46]:
# This will select from newly created 'obj2' all records whose index number are >= 10 and <= 15
# This will load only the selected records and not the whole database

store.select('obj2', where=['index >= 10 and index <= 15'])

Unnamed: 0,a
10,0.397975
11,-1.701793
12,-0.192751
13,1.041977
14,-0.209715
15,-1.764043


In [47]:
# This will save another object 'obj3' with 'table' format with DataFrame 'dFrame'

dFrame.to_hdf('mydata.h5', 'obj3', format='table')

In [52]:
# This will select all rows from newly saved 'obj3' WHERE row-index is less than 5

dFrame = pd.read_hdf('mydata.h5', 'obj3', where=['index < 5'])
print(dFrame)

          a
0  0.026239
1 -3.154829
2 -1.383818
3 -1.596040
4  2.101755


In [54]:
# Closing an opened data is essential, otherwise your data may be corrupted

store.close()

### Reading Microsoft Excel Data

In [55]:
# Reading
# 'ex1.xlsx' file is stored in variable 'xlsx' which is a Pandas DataFrame created using command 'pd.ExcelFile'
# 'pd.read_excel' command is used to read 'sheet1' of DataFrame 'xlsx'

# pip install xlrd      # this will install dependency to load / read MS Excel files in Pandas DataFrame

xlsx = pd.ExcelFile('examples/ex1.xlsx')
pd.read_excel(xlsx, 'Sheet1')

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [56]:
# MS Excel file 'ex1.xlsx' is read and stored in DataFrame i.e 'frame'

frame = pd.read_excel('examples/ex1.xlsx', 'Sheet1')
print(frame)

   Unnamed: 0  a   b   c   d message
0           0  1   2   3   4   hello
1           1  5   6   7   8   world
2           2  9  10  11  12     foo


In [57]:
# Writing to a MS Excel file format
# 'pd.ExcelWriter' function is used to write MS Excel file 'ex2.xlsx' and stored in variable 'writer'
# 'to_excel()' function is used to write 'frame' DataFrame contents to 'writer'

# pip install openpyxl

writer = pd.ExcelWriter('examples/ex2.xlsx')
frame.to_excel(writer, 'Sheet1')
writer.save()

In [58]:
# alternative option for single sheet

frame.to_excel('examples/ex2.xlsx')
frame

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


### Interacting with Web APIs

In [59]:
import requests
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
response = requests.get(url)

# print(response)
data = response.json()

In [60]:
# This will display the type of variable 'data' which is 'list'

print(type(data))

<class 'list'>


In [61]:
# print(data)  # this will print the whole data fetched through API from the github API Link

data_dict = data[0]      #  this will create a data_dict from index-0 of variable 'data'

data_dict    # this will print the whole dictionery

{'url': 'https://api.github.com/repos/pandas-dev/pandas/issues/35643',
 'repository_url': 'https://api.github.com/repos/pandas-dev/pandas',
 'labels_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/35643/labels{/name}',
 'comments_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/35643/comments',
 'events_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/35643/events',
 'html_url': 'https://github.com/pandas-dev/pandas/pull/35643',
 'id': 675693396,
 'node_id': 'MDExOlB1bGxSZXF1ZXN0NDY1MTQ3NDQz',
 'number': 35643,
 'title': 'ENH: Styler tooltips feature',
 'user': {'login': 'attack68',
  'id': 24256554,
  'node_id': 'MDQ6VXNlcjI0MjU2NTU0',
  'avatar_url': 'https://avatars0.githubusercontent.com/u/24256554?v=4',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/attack68',
  'html_url': 'https://github.com/attack68',
  'followers_url': 'https://api.github.com/users/attack68/followers',
  'following_url': 'https://api.github.com/users/attack68/f

In [62]:
# This will print 'title' column of 'data_dict'

data_dict['title']

'ENH: Styler tooltips feature'

In [63]:
# this will print the 'keys' of 'data_dict'

print(data_dict.keys())

dict_keys(['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'performed_via_github_app'])


In [64]:
# A DataFrame 'issues' is created from data (JSON object) with required columns

issues = pd.DataFrame(data, 
                    columns=['number', 'title','labels', 'state'])

issues.head()   # This will print top-five rows of DataFrame i.e. 'issues'

Unnamed: 0,number,title,labels,state
0,35643,ENH: Styler tooltips feature,[],open
1,35642,BUG:Resample with groupby & agg(),"[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
2,35641,REF/PERF: Move MultiIndex._tuples to MultiInde...,[],open
3,35640,DOC: Add specific Visual Studio Installer inst...,[],open
4,35639,BUG: RollingGroupby with closed and column sel...,"[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open


In [65]:
# This DataFrame i.e. 'issues2' is created without specifying custom column-names

issues2 = pd.DataFrame(data)

issues2['body'].head()   # this will print top-five rows of 'body' column of DataFrame i.e 'issues2'

0    - [ ] closes #21266 \r\n- [ ] tests added / pa...
1    - [ X] I have checked that this issue has not ...
2    Currently, the heavy-to-calculate ``MultiIndex...
3    - [ ] further improves #28316 - Currently the ...
4    - [x] closes #35549\r\n- [x] tests added / pas...
Name: body, dtype: object

In [66]:
# This will print top-five rows of only three columns of DataFrame i.e 'issues2'

print(issues2[['number', 'title','state']].head())

   number                                              title state
0   35643                       ENH: Styler tooltips feature  open
1   35642                  BUG:Resample with groupby & agg()  open
2   35641  REF/PERF: Move MultiIndex._tuples to MultiInde...  open
3   35640  DOC: Add specific Visual Studio Installer inst...  open
4   35639  BUG: RollingGroupby with closed and column sel...  open


### Interacting with Databases

In [67]:
import sqlite3

# creating database file
con = sqlite3.connect('mydata.sqlite')

In [None]:
# creating table
query = """ CREATE TABLE test(a VARCHAR(20), 
        b VARCHAR(20),
        c REAL,
        d INTEGER);  """

# running query
con.execute(query)



In [69]:
# table created command commit

con.commit()

# done

In [70]:
# inserting data

# preparing data
data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)
       ]

In [71]:
print(data)

[('Atlanta', 'Georgia', 1.25, 6), ('Tallahassee', 'Florida', 2.6, 3), ('Sacramento', 'California', 1.7, 5)]


In [72]:
# preparing insert statement

insStatement = "INSERT INTO test VALUES(?, ?, ?, ?)"

In [73]:
# running insert statement

con.executemany(insStatement, data)

<sqlite3.Cursor at 0x8ff1286c00>

In [74]:
# finalizing data saving

con.commit()

In [79]:
# selecting / extracting data

cursor = con.execute('select * from test')

rows = cursor.fetchall()

print(type(rows), rows)

<class 'list'> [('Atlanta', 'Georgia', 1.25, 6), ('Tallahassee', 'Florida', 2.6, 3), ('Sacramento', 'California', 1.7, 5), ('Atlanta', 'Georgia', 1.25, 6), ('Tallahassee', 'Florida', 2.6, 3), ('Sacramento', 'California', 1.7, 5)]


In [80]:
print(cursor.description)

(('a', None, None, None, None, None, None), ('b', None, None, None, None, None, None), ('c', None, None, None, None, None, None), ('d', None, None, None, None, None, None))


In [81]:
data_df = pd.DataFrame(rows, columns=[x[0] for x in cursor.description])

data_df

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5
3,Atlanta,Georgia,1.25,6
4,Tallahassee,Florida,2.6,3
5,Sacramento,California,1.7,5


In [83]:
# pip install sqlalchemy
# conda install -c anaconda sqlalchemy 
# using sqlalchemy

import sqlalchemy as sqla

db = sqla.create_engine('sqlite:///mydata.sqlite')

data_df =pd.read_sql('select * from test', db)

data_df

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5
3,Atlanta,Georgia,1.25,6
4,Tallahassee,Florida,2.6,3
5,Sacramento,California,1.7,5


#### At this point, Practice of Pandas Lesson-3 of PIAIC Student Portal is completed.