# Working with CSV Files

In [1]:
import pandas as pd

### Reading CSV Files

In [2]:
# Default read: show the raw file, then load it with read_csv's defaults
# (comma-separated fields, first line used as the column header).
!cat file1.csv
df = pd.read_csv('file1.csv')
print(df)

Rank,Language,L1 speakers,Total
1,Mandarin Chinese,899000000,1051000000
2,English,500000000,840000000
3,Spanish,500000000,570000000
4,Hindi,370000000,490000000
5,Arabic,290000000,422000000
   Rank          Language  L1 speakers       Total
0     1  Mandarin Chinese    899000000  1051000000
1     2           English    500000000   840000000
2     3           Spanish    500000000   570000000
3     4             Hindi    370000000   490000000
4     5            Arabic    290000000   422000000


In [3]:
# Tab-delimited file: sep='\t' makes read_csv split fields on tab characters
# instead of the default comma.
!cat file2.txt
df = pd.read_csv('file2.txt', sep='\t')
print(df)

Rank	Language	L1 speakers	Total
1	Mandarin Chinese	899000000	1051000000
2	English	500000000	840000000
3	Spanish	500000000	570000000
4	Hindi	370000000	490000000
5	Arabic	290000000	422000000
   Rank          Language  L1 speakers       Total
0     1  Mandarin Chinese    899000000  1051000000
1     2           English    500000000   840000000
2     3           Spanish    500000000   570000000
3     4             Hindi    370000000   490000000
4     5            Arabic    290000000   422000000


In [4]:
# CSV file with no header row: header=None keeps the first line as data and
# labels the columns with integers (0, 1, 2, ...) instead.
!cat file3.csv
df = pd.read_csv('file3.csv', header=None)
print(df)

1,Mandarin Chinese,899000000,1051000000
2,English,500000000,840000000
3,Spanish,500000000,570000000
4,Hindi,370000000,490000000
5,Arabic,290000000,422000000
   0                 1          2           3
0  1  Mandarin Chinese  899000000  1051000000
1  2           English  500000000   840000000
2  3           Spanish  500000000   570000000
3  4             Hindi  370000000   490000000
4  5            Arabic  290000000   422000000


In [5]:
# Add column names to a CSV file that has no header row: when names= is given
# without header=, the first line of the file is treated as data.
!cat file3.csv
df = pd.read_csv('file3.csv', 
                 names=['Rank', 'Language', 'L1 speakers', 'Total'])
print(df)

1,Mandarin Chinese,899000000,1051000000
2,English,500000000,840000000
3,Spanish,500000000,570000000
4,Hindi,370000000,490000000
5,Arabic,290000000,422000000
   Rank          Language  L1 speakers       Total
0     1  Mandarin Chinese    899000000  1051000000
1     2           English    500000000   840000000
2     3           Spanish    500000000   570000000
3     4             Hindi    370000000   490000000
4     5            Arabic    290000000   422000000


In [6]:
# Set one CSV column as the DataFrame index: index_col='Rank' uses the Rank
# column as the row labels instead of the default 0..n-1 integer index.
!cat file1.csv
df = pd.read_csv('file1.csv', index_col='Rank')
print(df)

Rank,Language,L1 speakers,Total
1,Mandarin Chinese,899000000,1051000000
2,English,500000000,840000000
3,Spanish,500000000,570000000
4,Hindi,370000000,490000000
5,Arabic,290000000,422000000
              Language  L1 speakers       Total
Rank                                           
1     Mandarin Chinese    899000000  1051000000
2              English    500000000   840000000
3              Spanish    500000000   570000000
4                Hindi    370000000   490000000
5               Arabic    290000000   422000000


In [7]:
# Skip leading rows of a CSV file: skiprows=2 discards the first two lines
# (here a title and a source URL), so the third line becomes the header.
!cat file4.csv
df = pd.read_csv('file4.csv', skiprows=2)
print(df)

Top 5 languages by total number of speakers,,,
https://en.wikipedia.org/wiki/List_of_languages_by_total_number_of_speakers,,,
Rank,Language,L1 speakers,Total
1,Mandarin Chinese,899000000,1051000000
2,English,500000000,840000000
3,Spanish,500000000,570000000
4,Hindi,370000000,490000000
5,Arabic,290000000,422000000
   Rank          Language  L1 speakers       Total
0     1  Mandarin Chinese    899000000  1051000000
1     2           English    500000000   840000000
2     3           Spanish    500000000   570000000
3     4             Hindi    370000000   490000000
4     5            Arabic    290000000   422000000


In [8]:
# CSV with missing values: read_csv converts well-known "missing" markers
# (here N/A, NA and NULL) to NaN without any extra arguments.
!cat file5.csv
df = pd.read_csv('file5.csv')
print(df)
# Because NaN is a floating-point value, the 'L1 speakers' column is printed
# as float (290000000.0) rather than int.
# By default the following strings are interpreted as NaN:
# '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
# '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan'.
# NOTE: the exact list depends on the pandas version (newer releases recognise
# additional markers); see the na_values parameter in the read_csv docs.

Rank,Language,L1 speakers,Total
1,Mandarin Chinese,N/A,1051000000
2,English,NA,840000000
3,Spanish,NULL,570000000
4,Hindi,N/A,490000000
5,Arabic,290000000,422000000
   Rank          Language  L1 speakers       Total
0     1  Mandarin Chinese          NaN  1051000000
1     2           English          NaN   840000000
2     3           Spanish          NaN   570000000
3     4             Hindi          NaN   490000000
4     5            Arabic  290000000.0   422000000


In [10]:
# Read a small number of rows: nrows=3 stops after three data rows
# (the header line is not counted).
!cat file1.csv
df = pd.read_csv('file1.csv', nrows=3)
print(df)

Rank,Language,L1 speakers,Total
1,Mandarin Chinese,899000000,1051000000
2,English,500000000,840000000
3,Spanish,500000000,570000000
4,Hindi,370000000,490000000
5,Arabic,290000000,422000000
   Rank          Language  L1 speakers       Total
0     1  Mandarin Chinese    899000000  1051000000
1     2           English    500000000   840000000
2     3           Spanish    500000000   570000000


In [11]:
# Read in chunks for large CSV files
!cat file1.csv
reader = pd.read_csv('file1.csv', chunksize=2)
for chunck in reader:
    print(chunck)
# The return value is an iterable TextFileReader object.

Rank,Language,L1 speakers,Total
1,Mandarin Chinese,899000000,1051000000
2,English,500000000,840000000
3,Spanish,500000000,570000000
4,Hindi,370000000,490000000
5,Arabic,290000000,422000000
   Rank          Language  L1 speakers       Total
0     1  Mandarin Chinese    899000000  1051000000
1     2           English    500000000   840000000
   Rank Language  L1 speakers      Total
2     3  Spanish    500000000  570000000
3     4    Hindi    370000000  490000000
   Rank Language  L1 speakers      Total
4     5   Arabic    290000000  422000000


### Writing CSV Files

In [12]:
# With DataFrame index: by default to_csv writes the index as the first
# column; its header cell is empty because the index has no name.
df = pd.read_csv('file1.csv')
print(df)
df.to_csv('file_out1.csv')
!cat file_out1.csv

   Rank          Language  L1 speakers       Total
0     1  Mandarin Chinese    899000000  1051000000
1     2           English    500000000   840000000
2     3           Spanish    500000000   570000000
3     4             Hindi    370000000   490000000
4     5            Arabic    290000000   422000000
,Rank,Language,L1 speakers,Total
0,1,Mandarin Chinese,899000000,1051000000
1,2,English,500000000,840000000
2,3,Spanish,500000000,570000000
3,4,Hindi,370000000,490000000
4,5,Arabic,290000000,422000000


In [14]:
# No DataFrame index: index=False omits the index column, so the written
# file has the same columns as the file that was read.
df = pd.read_csv('file1.csv')
print(df)
df.to_csv('file_out2.csv', index=False)
!cat file_out2.csv

   Rank          Language  L1 speakers       Total
0     1  Mandarin Chinese    899000000  1051000000
1     2           English    500000000   840000000
2     3           Spanish    500000000   570000000
3     4             Hindi    370000000   490000000
4     5            Arabic    290000000   422000000
Rank,Language,L1 speakers,Total
1,Mandarin Chinese,899000000,1051000000
2,English,500000000,840000000
3,Spanish,500000000,570000000
4,Hindi,370000000,490000000
5,Arabic,290000000,422000000
