### 1. StringIO
### 2. Pandas read_csv

In [1]:
from io import StringIO
import pandas as pd

In [2]:
# Load the whole Mercedes-Benz CSV from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='mercedesbenz.csv')
df.head(n=5)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Check what kind of object read_csv returned.
type(df)

pandas.core.frame.DataFrame

In [12]:
# CSV text kept in an ordinary Python string: one header row followed by three records.
data = "".join([
    "col1,col2,col3\n",
    "x,y,1\n",
    "a,b,2\n",
    "c,d,3\n",
])

In [13]:
# The CSV content is just a plain str at this point, not a file.
type(data)

str

In [14]:
## Wrap the string in an in-memory, file-like text buffer
StringIO(data)

<_io.StringIO at 0x2300b4c32e0>

In [18]:
# Convert a string into a DataFrame: wrap it in a StringIO buffer and pass that to pd.read_csv()
csv_buffer = StringIO(data)
pd.read_csv(csv_buffer)

Unnamed: 0,col1,col2,col3
0,x,y,1
1,a,b,2
2,c,d,3


In [19]:
# Keep only the listed columns while parsing; col3 is never loaded.
wanted_columns = ['col1', 'col2']
pd.read_csv(StringIO(data), usecols=wanted_columns)

Unnamed: 0,col1,col2
0,x,y
1,a,b
2,c,d


### Selecting required columns while reading data

In [20]:
# Re-read the file, loading only columns X0-X6 and X8 instead of the full width.
feature_columns = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']
df = pd.read_csv('mercedesbenz.csv', usecols=feature_columns)
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,k,v,at,a,d,u,j,o
1,k,t,av,e,d,y,l,o
2,az,w,n,c,d,x,j,x
3,az,t,n,f,d,x,l,e
4,az,v,n,f,d,h,d,n


In [23]:
## Save CSV
## Pass index=False so the row index is not written out as an extra column
df.to_csv(path_or_buf='test.csv', index=False)

In [24]:
## Datatypes in csv
## The last record has only three fields, so column d has no value in that row.
data = "\n".join([
    "a,b,c,d",
    "1,2,3,4",
    "5,6,7,8",
    "9,10,11",
]) + "\n"

In [27]:
# Let pandas infer the column dtypes, then summarise them with info().
inferred_buffer = StringIO(data)
df = pd.read_csv(inferred_buffer)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       3 non-null      int64  
 1   b       3 non-null      int64  
 2   c       3 non-null      int64  
 3   d       2 non-null      float64
dtypes: float64(1), int64(3)
memory usage: 228.0 bytes


### Changing datatype for entire data while reading data

In [30]:
# A single dtype applies to every column: read everything as generic Python objects.
object_buffer = StringIO(data)
df = pd.read_csv(object_buffer, dtype=object)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   a       3 non-null      object
 1   b       3 non-null      object
 2   c       3 non-null      object
 3   d       2 non-null      object
dtypes: object(4)
memory usage: 228.0+ bytes


In [31]:
# Preview the frame that was read with dtype='object'.
df.head()

Unnamed: 0,a,b,c,d
0,1,2,3,4.0
1,5,6,7,8.0
2,9,10,11,


In [32]:
# Count the missing values in each column.
missing_mask = df.isna()
missing_mask.sum()

a    0
b    0
c    0
d    1
dtype: int64

In [33]:
# Fetch the value at row label 1 of column 'a' with a single .loc lookup
# rather than chained indexing (df['a'][1]). The result is the string '5'
# because the frame was read with dtype='object'.
df.loc[1, 'a']

'5'

In [35]:
## Datatypes in csv
## Same sample as before: four columns, with the last record missing its d value.
data1 = "".join((
    "a,b,c,d\n",
    "1,2,3,4\n",
    "5,6,7,8\n",
    "9,10,11\n",
))

### Changing datatypes for individual columns while reading data

In [36]:

# Map column name -> dtype; column d is not listed, so its dtype is still inferred.
column_dtypes = {'a': int, 'b': float, 'c': int}
df = pd.read_csv(StringIO(data1), dtype=column_dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       3 non-null      int32  
 1   b       3 non-null      float64
 2   c       3 non-null      int32  
 3   d       2 non-null      float64
dtypes: float64(2), int32(2)
memory usage: 204.0 bytes


In [48]:
## Sample CSV for the index examples: numeric column a plus two text columns
data2 = "\n".join([
    "a,b,c",
    "4,apple,bat",
    "8,orange,cow",
]) + "\n"

### Selecting an index from our data while reading data

In [49]:
# index_col=0 turns the first column (a) into the row index instead of a data column.
index_buffer = StringIO(data2)
pd.read_csv(index_buffer, index_col=0)

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
4,apple,bat
8,orange,cow



### Using index_col and usecols together

In [53]:
# Restrict the parsed columns with usecols, then use the first of them (a) as the index.
selected_columns = ['a', 'b', 'c']
pd.read_csv(StringIO(data2), usecols=selected_columns, index_col=0)

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
4,apple,bat
8,orange,cow


In [2]:
## Reading data straight from a URL. sep='\t' is needed because the file is tab-separated.
## A bare pd.read_csv(url, sep='\t') failed here with "HTTP Error 403: Forbidden".
## NOTE(review): presumably the server rejects urllib's default User-Agent, so an
## explicit one is sent via storage_options (for HTTP(S) URLs pandas >= 1.3 forwards
## these key-value pairs as request headers) - confirm the server accepts this value.
import pandas as pd
pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item',
            sep='\t',
            storage_options={'User-Agent': 'Mozilla/5.0'})

HTTPError: HTTP Error 403: Forbidden