# Python Pandas library part - 2

In this part we are going to learn about:

1. StringIO
2. Pandas read_csv

In [4]:
### Reading Different Data sources with the help of pandas ###
from io import StringIO # In-memory file object

In [5]:
import pandas as pd

In [7]:
# Load the whole CSV file into a DataFrame (path is relative to the working directory)
df = pd.read_csv('mercedesbenz.csv')

In [8]:
# Preview the first five rows
df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Small CSV payload kept in a plain string: one header row followed by three records
data = '\n'.join([
    'col1,col2,col3',
    'x,y,1',
    'a,b,2',
    'c,d,3',
])

In [13]:
# Confirm that the payload is an ordinary Python string
type(data)

str

In [15]:
# Wrap the string in an in-memory, file-like text buffer
StringIO(data)

<_io.StringIO at 0x21d65bd3550>

In [17]:
# read_csv accepts a file-like object, so the in-memory buffer is parsed just like a CSV file
pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,x,y,1
1,a,b,2
2,c,d,3


In [18]:
# Load only the 'col1' and 'col2' columns; 'col3' is dropped while parsing
pd.read_csv(StringIO(data), usecols=['col1', 'col2'])

Unnamed: 0,col1,col2
0,x,y
1,a,b
2,c,d


In [23]:
import pandas as pd

# Load only columns X0 through X5 from the CSV, then preview the first five rows
df = pd.read_csv(
    'mercedesbenz.csv',
    usecols=['X0', 'X1', 'X2', 'X3', 'X4', 'X5'],
)
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5
0,k,v,at,a,d,u
1,k,t,av,e,d,y
2,az,w,n,c,d,x
3,az,t,n,f,d,x
4,az,v,n,f,d,h


In [24]:
# Save the DataFrame to a CSV file.
# By default to_csv also writes the row index as the first column.
df.to_csv('test.csv')

In [25]:
# Save again, this time leaving the row index out of the CSV file
df.to_csv('test_without_index.csv', index=False)

In [26]:
# Sample CSV for exploring dtypes.
# The last record has no value for column 'd', so that cell is parsed as missing.
data = '\n'.join([
    'a,b,c,d',
    '1,2,3,4',
    '5,6,7,8',
    '9,10,11',
])

In [27]:
# Parse the sample and let pandas infer a dtype for each column
df = pd.read_csv(StringIO(data))

In [29]:
# Summarize the column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       3 non-null      int64  
 1   b       3 non-null      int64  
 2   c       3 non-null      int64  
 3   d       2 non-null      float64
dtypes: float64(1), int64(3)
memory usage: 224.0 bytes


In [34]:
# Force every column to the object dtype while reading.
# With dtype='object' the parsed values are kept as Python strings (e.g. '1') instead of numbers.
df = pd.read_csv(StringIO(data), dtype='object')

In [35]:
# Check the dtypes again: every column should now be reported as object
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   a       3 non-null      object
 1   b       3 non-null      object
 2   c       3 non-null      object
 3   d       2 non-null      object
dtypes: object(4)
memory usage: 224.0+ bytes


In [36]:
# Preview the first rows of the frame
df.head()

Unnamed: 0,a,b,c,d
0,1,2,3,4.0
1,5,6,7,8.0
2,9,10,11,


In [38]:
# Count the missing values in each column
df.isnull().sum()

a    0
b    0
c    0
d    1
dtype: int64

In [40]:
# Select column 'a' (the first column); the result is a Series
df['a']

0    1
1    5
2    9
Name: a, dtype: object

In [41]:
# Select the value of column 'a' in the row labelled 0 (the first row).
# A single .loc[row, column] lookup avoids chained indexing (df['a'][0]).
df.loc[0, 'a']

'1'

In [42]:
# Same sample CSV as before, rebuilt for the per-column dtype example.
# The final record is one field short, so column 'd' has a missing value.
rows = ('a,b,c,d', '1,2,3,4', '5,6,7,8', '9,10,11')
data = '\n'.join(rows)
del rows  # keep the notebook namespace unchanged apart from `data`

In [48]:
# Assign a dtype per column while parsing: 'a' and 'c' as int, 'b' as float.
# Column 'd' is not listed, so its dtype is still inferred by pandas.
# NOTE(review): the builtin `int` presumably maps to the platform's default integer width - confirm.
df = pd.read_csv(
    StringIO(data),
    dtype={
        'a': int,
        'b': float,
        'c': int,
    },
)

In [49]:
# Display the resulting DataFrame
df

Unnamed: 0,a,b,c,d
0,1,2.0,3,4.0
1,5,6.0,7,8.0
2,9,10.0,11,


In [50]:
# Inspect the resulting dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       3 non-null      int32  
 1   b       3 non-null      float64
 2   c       3 non-null      int32  
 3   d       2 non-null      float64
dtypes: float64(2), int32(2)
memory usage: 200.0 bytes


In [51]:
# Show the dtype of each column
df.dtypes

a      int32
b    float64
c      int32
d    float64
dtype: object

In [56]:
# Sample CSV whose first column (named 'index') holds the row labels
data = '\n'.join([
    'index,a,b,c',
    '4,apple,bat,5.7',
    '8,orange,cow,10',
])

In [59]:
# Use the first column of the CSV as the row index instead of the default 0..n-1 range
df = pd.read_csv(StringIO(data), index_col=0)

In [60]:
# Display the resulting DataFrame
df

Unnamed: 0_level_0,a,b,c
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,apple,bat,5.7
8,orange,cow,10.0


In [61]:
# Sample CSV for combining index_col with usecols: a header row and two records
data = '\n'.join([
    'a,b,c',
    'apple,bat,5.7',
    'orange,cow,10',
])


In [71]:
# Note: index_col must refer to a column that is also listed in usecols.
# Here column 'b' becomes the row index, leaving 'a' and 'c' as data columns.
df = pd.read_csv(StringIO(data), usecols=['a', 'b', 'c'], index_col='b')

In [73]:
# Display the resulting DataFrame
df

Unnamed: 0_level_0,a,c
b,Unnamed: 1_level_1,Unnamed: 2_level_1
bat,apple,5.7
cow,orange,10.0


In [74]:
# index_col can also be given by position: 0 selects the first of the
# columns kept by usecols, so 'a' becomes the row index here.
df = pd.read_csv(StringIO(data), usecols=['a', 'b', 'c'], index_col=0)

In [75]:
# Display the resulting DataFrame
df

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
apple,bat,5.7
orange,cow,10.0


In [81]:
# Read a tab-separated file straight from a URL.
# Without an explicit User-Agent header this server answers HTTP 403 (Forbidden).
# For HTTP(S) URLs pandas (1.3+) forwards storage_options as request headers.
# NOTE(review): the exact User-Agent the server accepts is an assumption - confirm,
# and prefer a value that identifies you if the site's access policy asks for one.
pd.read_csv(
    'https://download.bls.gov/pub/time.series/cu/cu.item',
    sep='\t',
    storage_options={'User-Agent': 'Mozilla/5.0'},
)

HTTPError: HTTP Error 403: Forbidden

In [None]:
# Download the tab-separated file from the URL and save it locally as result.csv.
# Without an explicit User-Agent header this server answers HTTP 403 (Forbidden).
# For HTTP(S) URLs pandas (1.3+) forwards storage_options as request headers.
# NOTE(review): the exact User-Agent the server accepts is an assumption - confirm.
pd.read_csv(
    'https://download.bls.gov/pub/time.series/cu/cu.item',
    sep='\t',
    storage_options={'User-Agent': 'Mozilla/5.0'},
).to_csv('result.csv')