# Load a CSV file with pure Python

In [6]:
import csv

# Parse sample.csv into a list of rows, each row being a list of strings
# (header row first). csv.reader is used instead of str.split(",") so that
# quoted fields containing commas stay in one piece.
# newline="" lets the csv module handle line endings itself, and utf-8
# matches the default encoding pd.read_csv uses for the same file.
with open("sample.csv", newline="", encoding="utf-8") as f:
    csv_content = list(csv.reader(f))

In [7]:
# Display the parsed rows: a list of lists where every value, including
# the numbers, is still a string and the header is just the first row.
csv_content

[['name', 'age', 'city', 'occupation', 'salary'],
 ['Alice Johnson', '28', 'New York', 'Software Engineer', '85000'],
 ['Bob Smith', '34', 'San Francisco', 'Data Scientist', '95000'],
 ['Carol Davis', '29', 'Boston', 'Product Manager', '75000'],
 ['David Wilson', '31', 'Seattle', 'UX Designer', '70000'],
 ['Eva Brown', '26', 'Austin', 'Marketing Specialist', '60000']]

# Pandas basics

In [8]:
import pandas as pd

In [9]:
# Parse the same file with pandas: the first line becomes the column names
# and column types are inferred from the values.
csv_df = pd.read_csv("sample.csv")

In [10]:
# Show the whole frame; acceptable here because the sample has only five rows.
csv_df

Unnamed: 0,name,age,city,occupation,salary
0,Alice Johnson,28,New York,Software Engineer,85000
1,Bob Smith,34,San Francisco,Data Scientist,95000
2,Carol Davis,29,Boston,Product Manager,75000
3,David Wilson,31,Seattle,UX Designer,70000
4,Eva Brown,26,Austin,Marketing Specialist,60000


In [11]:
# Selecting a single column by label returns a Series; text columns are
# reported with dtype object.
csv_df['name']

0    Alice Johnson
1        Bob Smith
2      Carol Davis
3     David Wilson
4        Eva Brown
Name: name, dtype: object

In [13]:
# Unlike the pure-Python parse, numeric columns are converted automatically:
# age comes back as int64 rather than strings.
csv_df['age']

0    28
1    34
2    29
3    31
4    26
Name: age, dtype: int64

# Other separators

In [33]:
# Read a malformed file with default settings. No exception is raised in this
# run; instead the values are shifted relative to the header (names end up in
# the index, ages under "name", and so on), as the output below shows.
pd.read_csv("sample_broken.csv")

Unnamed: 0,name,age,city,occupation,salary
Alice Johnson,28,New York,Software Engineer,Data Scientist,85000.0
Bob Smith,34,San Francisco,Data Scientist,95000,
Davis,Carol,29,Boston,Product Manager,75000.0
David Wilson,31,Seattle,UX Designer,70000,
Eva Brown,26,Austin,Marketing Specialist,60000,


In [26]:
# Write the frame with "|" as the field separator. index=False keeps the row
# index out of the file; it is the documented boolean form of what the falsy
# index=None achieved.
csv_df.to_csv("sample_pipe.csv", sep="|", index=False)

In [27]:
# With the default comma separator no delimiter is found, so each line is
# read as one single column.
pd.read_csv("sample_pipe.csv")

Unnamed: 0,name|age|city|occupation|salary
0,Alice Johnson|28|New York|Software Engineer|85000
1,Bob Smith|34|San Francisco|Data Scientist|95000
2,Carol Davis|29|Boston|Product Manager|75000
3,David Wilson|31|Seattle|UX Designer|70000
4,Eva Brown|26|Austin|Marketing Specialist|60000


In [28]:
# Passing the separator the file was written with restores the five columns.
pd.read_csv("sample_pipe.csv", sep="|")

Unnamed: 0,name,age,city,occupation,salary
0,Alice Johnson,28,New York,Software Engineer,85000
1,Bob Smith,34,San Francisco,Data Scientist,95000
2,Carol Davis,29,Boston,Product Manager,75000
3,David Wilson,31,Seattle,UX Designer,70000
4,Eva Brown,26,Austin,Marketing Specialist,60000


In [29]:
# Write the frame as tab-separated values. index=False keeps the row index
# out of the file; it is the documented boolean form of what the falsy
# index=None achieved.
csv_df.to_csv("sample_t.tsv", sep='\t', index=False)

In [32]:
# Same pitfall as with the pipe file: the default comma separator leaves each
# tab-separated line as one single column (the tabs show up as \t).
pd.read_csv("sample_t.tsv")

Unnamed: 0,name\tage\tcity\toccupation\tsalary
0,Alice Johnson\t28\tNew York\tSoftware Engineer...
1,Bob Smith\t34\tSan Francisco\tData Scientist\t...
2,Carol Davis\t29\tBoston\tProduct Manager\t75000
3,David Wilson\t31\tSeattle\tUX Designer\t70000
4,Eva Brown\t26\tAustin\tMarketing Specialist\t6...


In [34]:
# sep="\t" tells the parser to split on tabs, which restores the columns.
pd.read_csv("sample_t.tsv", sep="\t")

Unnamed: 0,name,age,city,occupation,salary
0,Alice Johnson,28,New York,Software Engineer,85000
1,Bob Smith,34,San Francisco,Data Scientist,95000
2,Carol Davis,29,Boston,Product Manager,75000
3,David Wilson,31,Seattle,UX Designer,70000
4,Eva Brown,26,Austin,Marketing Specialist,60000


# Data format

In [38]:
# Read with default type inference: fips is parsed as an integer column,
# so codes written with a leading zero lose that zero.
fips_df = pd.read_csv("fips.csv")

In [41]:
# Inspect the inferred column types; fips is reported as int64.
fips_df.dtypes

state     object
county    object
fips       int64
dtype: object

In [42]:
# The codes display as 1001, 1003, ... because they are stored as integers.
fips_df

Unnamed: 0,state,county,fips
0,Alabama,Autauga,1001
1,Alabama,Baldwin,1003
2,Alabama,Barbour,1005
3,Alabama,Bibb,1007


In [44]:
# Force fips to be read as text so the codes are kept exactly as written
# in the file, leading zeros included.
fips_str_df = pd.read_csv("fips.csv", dtype={"fips": "str"})

In [45]:
# Confirm the override took effect: fips is now object (string) instead of int64.
fips_str_df.dtypes

state     object
county    object
fips      object
dtype: object

In [46]:
# Display the frame whose fips column was read as text.
fips_str_df

Unnamed: 0,state,county,fips
0,Alabama,Autauga,01001
1,Alabama,Baldwin,01003
2,Alabama,Barbour,01005
3,Alabama,Bibb,01007


# Gzip

In [49]:
# Write the frame as a gzip-compressed CSV. index=False keeps the row index
# out of the file; it is the documented boolean form of what the falsy
# index=None achieved.
fips_str_df.to_csv("fips.csv.gz", compression='gzip', index=False)

In [51]:
# Read the compressed CSV back. The dtype override is passed again so that
# fips is still parsed as text rather than re-inferred as an integer.
pd.read_csv("fips.csv.gz", compression='gzip', dtype={"fips": "str"})

Unnamed: 0,state,county,fips
0,Alabama,Autauga,01001
1,Alabama,Baldwin,01003
2,Alabama,Barbour,01005
3,Alabama,Bibb,01007


# Parquet

In [54]:
# Save the frame (with fips as text) in Parquet format.
fips_str_df.to_parquet("fips.parquet")

In [55]:
# Read the Parquet file back; note that no dtype argument is passed here,
# in contrast to the CSV reads of the same data.
pd.read_parquet("fips.parquet")

Unnamed: 0,state,county,fips
0,Alabama,Autauga,01001
1,Alabama,Baldwin,01003
2,Alabama,Barbour,01005
3,Alabama,Bibb,01007


# Tabular to JSON

In [61]:
# orient="records" emits one JSON object per row. to_json returns a string,
# so print() is used to render the indentation instead of showing escaped \n.
print(fips_str_df.to_json(orient="records", indent=2))

[
  {
    "state":"Alabama",
    "county":"Autauga",
    "fips":"01001"
  },
  {
    "state":"Alabama",
    "county":"Baldwin",
    "fips":"01003"
  },
  {
    "state":"Alabama",
    "county":"Barbour",
    "fips":"01005"
  },
  {
    "state":"Alabama",
    "county":"Bibb",
    "fips":"01007"
  }
]


# JSON to tabular

In [62]:
import json

In [63]:
# Load sample.json into plain Python objects (dicts, lists, strings, numbers).
# The encoding is given explicitly because JSON text is UTF-8, whereas open()
# would otherwise fall back to the platform's locale encoding.
with open("sample.json", encoding="utf-8") as f:
    sample_json = json.load(f)

In [64]:
# Display the parsed document: a dict containing nested dicts and lists,
# with JSON false/true/null mapped to False/True/None.
sample_json

{'name': 'Alice Johnson',
 'age': 28,
 'isStudent': False,
 'height': 5.6,
 'address': {'street': '123 Main St',
  'city': 'New York',
  'zipCode': '10001',
  'coordinates': {'latitude': 40.7128, 'longitude': -74.006}},
 'hobbies': ['reading', 'cycling', 'photography'],
 'favoriteColors': ['blue', 'green'],
 'spouse': None,
 'education': {'degree': 'Master of Science',
  'field': 'Computer Science',
  'university': 'MIT',
  'graduationYear': 2020,
  'gpa': 3.85},
 'socialMedia': {'twitter': '@alice_codes',
  'linkedin': 'linkedin.com/in/alicejohnson',
  'github': 'alice-dev'},
 'lastLogin': '2024-03-15T14:30:00Z',
 'isActive': True}