In [2]:
### Reading data from various sources using pandas
import pandas as pd

In [3]:
# Build a small JSON document in memory (no file on disk is involved yet)
from io import StringIO

# Adjacent string literals are joined by Python into one string, so this is
# the same single-line JSON text, just easier to read piece by piece
data = (
    '{"employee_name":"James",'
    '"email":"james@gmail.com",'
    '"job_profile":[{"title1":"Team Lead","title2":"Senior Developer"}]}'
)
print(type(data))

<class 'str'>


In [4]:
# Wrap the JSON string in an in-memory text buffer (StringIO); the JSON text itself
# is not converted or changed -- the cell only displays the type of the wrapper object
type(StringIO(data))

_io.StringIO

In [9]:
# Parse the JSON text into a DataFrame; the string is first wrapped in a
# StringIO buffer so that read_json receives a file-like object
json_buffer = StringIO(data)
df = pd.read_json(json_buffer)
df

Unnamed: 0,employee_name,email,job_profile
0,James,james@gmail.com,"{'title1': 'Team Lead', 'title2': 'Senior Deve..."


In [11]:
# converting back to JSON
# with the default orient='columns' the result is laid out as
# {"column_name": {"row_index": value}} -- one inner object per column
df.to_json()

'{"employee_name":{"0":"James"},"email":{"0":"james@gmail.com"},"job_profile":{"0":{"title1":"Team Lead","title2":"Senior Developer"}}}'

In [12]:
# orient='records' serializes each row as its own JSON object (dictionary)
# and returns all of those row objects together inside a single JSON list
df.to_json(orient='records')

'[{"employee_name":"James","email":"james@gmail.com","job_profile":{"title1":"Team Lead","title2":"Senior Developer"}}]'

In [15]:
# Load the wine dataset directly from its URL; the remote file is plain
# CSV (comma-separated values) text.
# header=None is needed here: if it is left out, the first row of values
# will incorrectly be taken as the column titles
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
    header=None,
)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [None]:
# Convert the DataFrame back into a CSV file.
# index=False stops the row labels (0, 1, 2, ...) from being written as an extra first column,
# and header=False skips the placeholder column labels (0..13) that header=None produced on read,
# so wine.csv has the same layout as the source file and can be re-read with
# pd.read_csv("wine.csv", header=None)
df.to_csv("wine.csv", index=False, header=False)

In [20]:
# reading tables from a website or an HTML file
# libraries used by pd.read_html -> lxml, html5lib, beautifulsoup4

In [None]:
# page that publishes the FDIC failed-bank list as an HTML table
url = "https://www.fdic.gov/bank-failures/failed-bank-list"
# read_html fetches the page and returns a list of DataFrames (the tables it found),
# so despite its name `df` holds a list here, not a single DataFrame
df = pd.read_html(url)

In [None]:
df  
# read_html returned a list, so the table is displayed wrapped inside a list;
# to access the table itself, index into the list with df[0]

[                               Bank Name           City         State   Cert  \
 0           The Santa Anna National Bank     Santa Anna         Texas   5520   
 1                   Pulaski Savings Bank        Chicago      Illinois  28611   
 2     The First National Bank of Lindsay        Lindsay      Oklahoma   4134   
 3  Republic First Bank dba Republic Bank   Philadelphia  Pennsylvania  27332   
 4                          Citizens Bank       Sac City          Iowa   8758   
 5               Heartland Tri-State Bank        Elkhart        Kansas  25851   
 6                    First Republic Bank  San Francisco    California  59017   
 7                         Signature Bank       New York      New York  57053   
 8                    Silicon Valley Bank    Santa Clara    California  24735   
 9                      Almena State Bank         Almena        Kansas  15426   
 
                  Acquiring Institution      Closing Date  Fund  Sort ascending  
 0            Coleman Cou

In [24]:
df[0] # index 0 picks the DataFrame out of the list, so the table is shown on its own

Unnamed: 0,Bank Name,City,State,Cert,Acquiring Institution,Closing Date,Fund Sort ascending
0,The Santa Anna National Bank,Santa Anna,Texas,5520,Coleman County State Bank,"June 27, 2025",10549
1,Pulaski Savings Bank,Chicago,Illinois,28611,Millennium Bank,"January 17, 2025",10548
2,The First National Bank of Lindsay,Lindsay,Oklahoma,4134,"First Bank & Trust Co., Duncan, OK","October 18, 2024",10547
3,Republic First Bank dba Republic Bank,Philadelphia,Pennsylvania,27332,"Fulton Bank, National Association","April 26, 2024",10546
4,Citizens Bank,Sac City,Iowa,8758,Iowa Trust & Savings Bank,"November 3, 2023",10545
5,Heartland Tri-State Bank,Elkhart,Kansas,25851,"Dream First Bank, N.A.","July 28, 2023",10544
6,First Republic Bank,San Francisco,California,59017,"JPMorgan Chase Bank, N.A.","May 1, 2023",10543
7,Signature Bank,New York,New York,57053,"Flagstar Bank, N.A.","March 12, 2023",10540
8,Silicon Valley Bank,Santa Clara,California,24735,First Citizens Bank & Trust Company,"March 10, 2023",10539
9,Almena State Bank,Almena,Kansas,15426,Equity Bank,"October 23, 2020",10538


In [25]:
# reading from a Wikipedia page on which a number of tables are present
url = "https://en.wikipedia.org/wiki/Mobile_country_code"
# to select one of them, pass match="..." to read_html with some text
# (for example a column name) that appears in the table you want to extract

In [None]:
# keep only the tables whose text matches "Country" -- the table to extract
# has a column with that name; the result is still a list of DataFrames
df = pd.read_html(url, match="Country")

In [29]:
# first table (index 0) in the list returned by read_html
df[0]

Unnamed: 0,Mobile country code,Country,ISO 3166,Mobile network codes,National MNC authority,Remarks
0,289,A Abkhazia,GE-AB,List of mobile network codes in Abkhazia,,MCC is not listed by ITU
1,412,Afghanistan,AF,List of mobile network codes in Afghanistan,,
2,276,Albania,AL,List of mobile network codes in Albania,,
3,603,Algeria,DZ,List of mobile network codes in Algeria,,
4,544,American Samoa (United States of America),AS,List of mobile network codes in American Samoa,,
...,...,...,...,...,...,...
247,452,Vietnam,VN,List of mobile network codes in the Vietnam,,
248,543,W Wallis and Futuna,WF,List of mobile network codes in Wallis and Futuna,,
249,421,Y Yemen,YE,List of mobile network codes in the Yemen,,
250,645,Z Zambia,ZM,List of mobile network codes in Zambia,,


In [None]:
# a different table can be selected the same way, by matching on a keyword that appears in it
df2 = pd.read_html(url, match="Operator")
df2  # a list of all the tables that contain the 'Operator' keyword

[   MCC  MNC Brand      Operator       Status Bands (MHz)  \
 0    1    1  TEST  Test network  Operational         any   
 1    1    1  TEST  Test network  Operational         any   
 2  999   99   NaN  Internal use  Operational         any   
 3  999  999   NaN  Internal use  Operational         any   
 
                               References and notes  
 0                                              NaN  
 1                                              NaN  
 2  Internal use in private networks, no roaming[6]  
 3  Internal use in private networks, no roaming[6]  ,
      MCC  MNC     Brand                  Operator           Status  \
 0    901    1       NaN                   Webbing          Unknown   
 1    901    2       NaN            GlobalmatiX AG          Unknown   
 2    901    3   Iridium                       NaN      Operational   
 3    901    4       NaN  BBIX Singapore Pte. Ltd.          Unknown   
 4    901    5       NaN      Thuraya RMSS Network      Operational

In [36]:
df2[0] # index 0 -> the first DataFrame in the list of tables that matched 'Operator'

Unnamed: 0,MCC,MNC,Brand,Operator,Status,Bands (MHz),References and notes
0,1,1,TEST,Test network,Operational,any,
1,1,1,TEST,Test network,Operational,any,
2,999,99,,Internal use,Operational,any,"Internal use in private networks, no roaming[6]"
3,999,999,,Internal use,Operational,any,"Internal use in private networks, no roaming[6]"


In [37]:
# index 1 -> the second DataFrame in the list of tables that matched 'Operator'
df2[1]

Unnamed: 0,MCC,MNC,Brand,Operator,Status,Bands (MHz),References and notes
0,901,1,,Webbing,Unknown,MVNO,Former ICO Satellite Management[51][52]
1,901,2,,GlobalmatiX AG,Unknown,Unknown,Former Sense Communications International; veh...
2,901,3,Iridium,,Operational,Satellite,
3,901,4,,BBIX Singapore Pte. Ltd.,Unknown,Unknown,Former Globalstar[54]
4,901,5,,Thuraya RMSS Network,Operational,Satellite,
...,...,...,...,...,...,...,...
99,902,1,,MulteFire Alliance,Operational,LTE,[6][126]
100,991,1,,World's Global Telecom,Not operational,Unknown,temporarily assigned until 15 January 2021[104...
101,991,2,5G Croco,Orange S.A.,Not operational,5G,temporarily assigned until 6 August 2022[128][...
102,991,3,,Halys SAS,Not operational,Unknown,temporary assignment for trial until 5 April 2...


In [38]:
# reading data from Excel files
# note that Excel workbooks use the .xlsx extension,
# while CSV files use the .csv extension
# library required to read .xlsx files -> openpyxl

In [43]:
# read the Excel workbook from the working directory and load it into a DataFrame
excel_df = pd.read_excel("excel_data.xlsx")
excel_df

Unnamed: 0,Name,Age
0,Aadrish,21
1,Toddy,23
2,Jack,30
3,John,35


In [44]:
# converting into a pickle file
# pickle serializes a Python object into a file so that it can later be deserialized (loaded back)

In [46]:
# serialize the DataFrame that was read from Excel into a pickle file on disk
# NOTE(review): only load pickle files back from sources you trust -- unpickling can execute arbitrary code
excel_df.to_pickle("pickle1") # a file named 'pickle1' will be created (the name is used as given, no extension is added)