# CSV files in Pandas

In [38]:
import pandas as pd

In [39]:
# Build a small example DataFrame (7 rows, 3 columns) used by every cell below
my_dict = dict(
    name=list("abcdefg"),
    age=[20, 27, 35, 55, 18, 21, 35],
    designation=["VP", "CEO", "CFO", "VP", "VP", "CEO", "MD"],
)

df = pd.DataFrame(data=my_dict)
df

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


### Writing the dataframe to a csv file

In [40]:
# Write the frame to disk; the index is written as the first (unnamed) column
df.to_csv(path_or_buf="csv_example_with_index", index=True)

The resulting file
___

,name,age,designation  
0,a,20,VP  
1,b,27,CEO  
2,c,35,CFO  
3,d,55,VP  
4,e,18,VP  
5,f,21,CEO  
6,g,35,MD  

Exclude writing the index with `index=False`

In [41]:
# Write only the data columns; index=False leaves the row index out of the file
df.to_csv(path_or_buf="csv_example", index=False)

The resulting file  
___

name,age,designation  
a,20,VP  
b,27,CEO  
c,35,CFO  
d,55,VP  
e,18,VP  
f,21,CEO  
g,35,MD  

### Creating dataframe from csv

In [42]:
# Load the index-free file; pandas supplies a fresh default integer index
df = pd.read_csv(filepath_or_buffer="csv_example")
df

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


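> Pandas automatically adds an index when reading a CSV.  
> If the CSV file already contains an index column (as `csv_example_with_index` does), reading it this way doubles it up: the saved index comes back as an extra `Unnamed: 0` column next to the new one.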

By setting `index_col` we can tell pandas to use the index column in the csv file and NOT create a new one

In [43]:
# index_col=0 uses the first column of the file as the index
# instead of generating a new one
df = pd.read_csv(filepath_or_buffer="csv_example_with_index", index_col=0)
df

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


### Setting the header with `header=`

In [44]:
# header=None: no line is treated as a header, so the file's header line
# becomes the first data row and the columns are labelled 0, 1, 2
df = pd.read_csv(filepath_or_buffer="csv_example", header=None)
df

Unnamed: 0,0,1,2
0,name,age,designation
1,a,20,VP
2,b,27,CEO
3,c,35,CFO
4,d,55,VP
5,e,18,VP
6,f,21,CEO
7,g,35,MD


In [45]:
# header=1: the second line of the file (0-based line 1) supplies the column
# names; the line before it is dropped
df = pd.read_csv(filepath_or_buffer="csv_example", header=1)
df

Unnamed: 0,a,20,VP
0,b,27,CEO
1,c,35,CFO
2,d,55,VP
3,e,18,VP
4,f,21,CEO
5,g,35,MD


In [46]:
# A list of line numbers builds multi-level column labels from lines 0, 1 and 3;
# the unlisted line in between (line 2) is skipped, and data starts after line 3
header_rows = [0, 1, 3]
df = pd.read_csv(filepath_or_buffer="csv_example", header=header_rows)
df

Unnamed: 0_level_0,name,age,designation
Unnamed: 0_level_1,a,20,VP
Unnamed: 0_level_2,c,35,CFO
0,d,55,VP
1,e,18,VP
2,f,21,CEO
3,g,35,MD


### Assigning column names

In [47]:
# Supplying names without header= keeps the file's own header line as a data row
column_names = ['a', 'b', 'c']
df = pd.read_csv(filepath_or_buffer='csv_example', names=column_names)
df

Unnamed: 0,a,b,c
0,name,age,designation
1,a,20,VP
2,b,27,CEO
3,c,35,CFO
4,d,55,VP
5,e,18,VP
6,f,21,CEO
7,g,35,MD


In [48]:
# header=0 marks the first line as the header, so names= replaces it
# rather than leaving it in the data
column_names = ['a', 'b', 'c']
df = pd.read_csv(filepath_or_buffer='csv_example', names=column_names, header=0)
df

Unnamed: 0,a,b,c
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


> Use the `sep` parameter to specify the delimiter of the csv file

### Setting the index with `set_index`

In [49]:
# Display the current frame (columns a, b, c) before changing its index
df

Unnamed: 0,a,b,c
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [50]:
# Show a copy of the frame indexed by column "a"; the result is not assigned,
# so df keeps its original integer index
df.set_index(keys="a")

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
a,20,VP
b,27,CEO
c,35,CFO
d,55,VP
e,18,VP
f,21,CEO
g,35,MD


Using `nrows` to read only the first N rows of the file

In [57]:
# Read just the first 3 data rows (the header line is not counted)
rows_to_read = 3
df_csv = pd.read_csv(filepath_or_buffer='csv_example', nrows=rows_to_read)

df_csv

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO


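Keeping blank lines with `skip_blank_lines=False`: blank lines in the csv file are read as rows of `NaN` instead of being skipped (this example file has no blank lines, so the result is unchanged)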

In [56]:
# skip_blank_lines=False keeps blank lines as rows instead of dropping them;
# csv_example has no blank lines, so the result matches a plain read
df_csv = pd.read_csv(filepath_or_buffer='csv_example', skip_blank_lines=False)
df_csv

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD
