In [50]:
#list 4-1
import io
import pandas as pd

data = io.StringIO(
"""
id,age,height,weight
129237,32,5.4,126
123083,20,6.1,145
"""
)

# Corrected line to explicitly specify the separator
df = pd.read_csv(data, sep=",")

print(df)

       id  age  height  weight
0  129237   32     5.4     126
1  123083   20     6.1     145


In [52]:
#list 4-2
import io
import pandas as pd

data = io.StringIO(
"""
id| age| height| weight
129237| 32| 5.4| 126
123083| 20| 6.1| 145
"""
)

print(pd.read_csv(data, sep="|", skipinitialspace=True))

       id  age  height  weight
0  129237   32     5.4     126
1  123083   20     6.1     145


In [54]:
import io
import pandas as pd

data = io.StringIO(
    """
id,age,height,weight
129237,32,5.4,126
123083,20,6.1,145
"""
)

# Read CSV and specify columns to use
df = pd.read_csv(data, usecols=["height", "age"])
print(df)

   age  height
0   32     5.4
1   20     6.1


In [56]:
import io
import pandas as pd

data = io.StringIO(
    """
student_id, grade
1045,"a"
2391,"b"
8723,"c"
1092,"a"
"""
)

# Read CSV, skipping the last row and using the 'python' engine to handle skipfooter
try:
    grades = pd.read_csv(data, skipfooter=1, engine='python')
    print(grades)
except pd.errors.ParserError as e:
    print(f"ParserError: {e}")



   student_id  grade
0        1045      a
1        2391      b
2        8723      c


In [58]:
import io

import pandas as pd


data = io.StringIO(
    """
family,,nightshade,nightshade,nightshade
species,,tomatoe,deadly-nightshade,potato
family_id,species_id,,,
61248,129237,1,0,0
61248,123083,0,1,0
61248,123729,0,0,1
"""
)

print(pd.read_csv(data, header=[0, 1], index_col=[0, 1]))


family               nightshade                         
species                 tomatoe deadly-nightshade potato
family_id species_id                                    
61248     129237              1                 0      0
          123083              0                 1      0
          123729              0                 0      1


In [60]:
import io

import pandas as pd


site1 = io.StringIO(
    """
site1
129237
123083
"""
)

site2 = io.StringIO(
    """
129337
123583
"""
)

site_data = pd.read_csv(site1)
site_data["site2"] = pd.read_csv(site2, squeeze=True, header=None)
print(site_data)


    site1   site2
0  129237  129337
1  123083  123583


In [62]:
import io
import pandas as pd

data = io.StringIO(
    """
id,age,height,weight
129237,32,5.4,126
123083,20,6.1,145
"""
)

# Read the CSV data, setting the 'id' column as the index
df = pd.read_csv(data, index_col=[0])

# Print the DataFrame
print(df)

# Print the memory usage of the DataFrame
print(df.memory_usage(deep=True))

# Print the data types of each column
print(df.dtypes)

# Print the data type of the index
print(df.index.dtype)


        age  height  weight
id                         
129237   32     5.4     126
123083   20     6.1     145
Index     16
age       16
height    16
weight    16
dtype: int64
age         int64
height    float64
weight      int64
dtype: object
int64


In [64]:
import io

import pandas as pd
import numpy as np


data = io.StringIO(
    """
id,age,height,weight
129237,32,5.4,126
123083,20,6.1,145
"""
)

df = pd.read_csv(
    data,
    dtype={"id": np.int32, "age": np.int8, "height": np.float16, "weight": np.int16},
    index_col=[0],
)

print(df)
print(df.memory_usage(deep=True))
print(df.dtypes)
print(df.index.dtype)


        age    height  weight
id                           
129237   32  5.398438     126
123083   20  6.101562     145
Index     16
age        2
height     4
weight     4
dtype: int64
age          int8
height    float16
weight      int16
dtype: object
int64


In [66]:
import io

import pandas as pd


MEDICATIONS_MAPPER = {"atg": "atg", "aftg": "atg", "bta": "bta"}


def medication_converter(value):
    return MEDICATIONS_MAPPER[value.lower()]


data = io.StringIO(
"""
id,age,height,weight,med
129237,32,5.4,126,bta
123083,20,6.1,145,aftg
"""
)

print(pd.read_csv(data, converters={"med": medication_converter}))


       id  age  height  weight  med
0  129237   32     5.4     126  bta
1  123083   20     6.1     145  atg


In [68]:
import io

import pandas as pd


data = io.StringIO(
"""
student_id, grade
1045,"a"
2391,"b"
8723,"c"
1092,"a'
"""
)

grades = pd.read_csv(data, nrows=3)


In [70]:
import io

import pandas as pd


data_csv = io.StringIO(
"""
student_id, grade
1045,"a"
2391,"b"
8723,"c"
1092,"a"
1045,"a"
2391,"b"
8723,"c"
1092,"a"
1045,"a"
2391,"b"
8723,"c"
1092,"a"
1045,"a"
2391,"b"
8723,"c"
1092,"a"
"""
)

def process(data):
    """ Mocked process function. """
    data_csv.seek(0)
    return data

ROWS_PER_CHUNK = 1000

data = process(pd.read_csv(data_csv, nrows=ROWS_PER_CHUNK))
read_rows = len(data)

chunk = 1
while chunk * ROWS_PER_CHUNK == read_rows:
    chunk_data = process(
        pd.read_csv(
            data_csv,
            skiprows=chunk * ROWS_PER_CHUNK,
            nrows=ROWS_PER_CHUNK,
            header=None,
            names=data.columns,
        )
    )
    read_rows += len(chunk_data)
    data = data.append(process(chunk_data), ignore_index=True)


In [72]:
import io
import pandas as pd

data_csv = io.StringIO(
"""
student_id, grade
1045,"a"
2391,"b"
8723,"c"
1092,"a"
1045,"a"
2391,"b"
8723,"c"
1092,"a"
1045,"a"
2391,"b"
8723,"c"
1092,"a"
1045,"a"
2391,"b"
8723,"c"
1092,"a"
"""
)

def process(data):
    """ Mocked process function. """
    # For the purpose of this example, we'll just return the data unchanged
    return data

ROWS_PER_CHUNK = 5

# Read the first chunk of data
data = process(pd.read_csv(data_csv, nrows=ROWS_PER_CHUNK))
read_rows = len(data)

chunk = 1
while read_rows == chunk * ROWS_PER_CHUNK:
    # Read the next chunk of data
    chunk_data = process(
        pd.read_csv(
            data_csv,
            skiprows=chunk * ROWS_PER_CHUNK + 1,  # +1 to skip the header row
            nrows=ROWS_PER_CHUNK,
            header=None,
            names=data.columns,
        )
    )
    read_rows += len(chunk_data)
    data = data.append(chunk_data, ignore_index=True)
    chunk += 1

# Print the resulting DataFrame
print(data)



  student_id  grade
0       1045      a
1       2391      b
2       8723      c
3       1092      a
4       1045      a


In [74]:
import io

import pandas as pd


data = io.StringIO(
"""
student_id, grade
1045,"a"
2391,"b"
,"c"
1092,"a"
"""
)

grades = pd.read_csv(
    data, verbose=True, index_col="student_id", engine="python"
)
print(grades)
print(grades.index.dtype)


Filled 1 NA values in column student_id
            grade
student_id       
1045.0          a
2391.0          b
NaN             c
1092.0          a
float64


In [76]:
import io
import pandas as pd
import numpy as np

data = io.StringIO(
"""
id,age,height,weight
129237,32,5.4,126
123083,20,6.1,
123087,25,4.5,unknown
"""
)

# Read the CSV data, with error coercion for problematic data entries
df = pd.read_csv(data, dtype={"id": np.int32, "age": np.int8, "height": np.float16}, index_col=[0])

# Convert 'weight' column to numeric, coercing errors to NaN
df['weight'] = pd.to_numeric(df['weight'], errors='coerce')

print(df)
print(df.memory_usage(deep=True))
print(df.dtypes)
print(df.index.dtype)



        age    height  weight
id                           
129237   32  5.398438   126.0
123083   20  6.101562     NaN
123087   25  4.500000     NaN
Index     24
age        3
height     6
weight    24
dtype: int64
age          int8
height    float16
weight    float64
dtype: object
int64


In [78]:
import io

import pandas as pd
import numpy as np


data = io.StringIO(
"""
id,age,height,weight
129237,32,5.4,126
123083,20,6.1,
123087,25,4.5,unknown
"""
)

df = pd.read_csv(
    data,
    dtype={"id": np.int32, "age": np.int8, "height": np.float16, "weight": np.float16},
    na_values={"unknown"},
    index_col=[0],
)

print(df)
print(df.memory_usage(deep=True))
print(df.dtypes)
print(df.index.dtype)

df["weight"].fillna(0, inplace=True)
df["weight"] = df["weight"].astype(np.int16)

print(df)
print(df.memory_usage(deep=True))
print(df.dtypes)


        age    height  weight
id                           
129237   32  5.398438   126.0
123083   20  6.101562     NaN
123087   25  4.500000     NaN
Index     24
age        3
height     6
weight     6
dtype: int64
age          int8
height    float16
weight    float16
dtype: object
int64
        age    height  weight
id                           
129237   32  5.398438     126
123083   20  6.101562       0
123087   25  4.500000       0
Index     24
age        3
height     6
weight     6
dtype: int64
age          int8
height    float16
weight      int16
dtype: object


In [80]:
import io

import pandas as pd


data = io.StringIO(
"""
student_id,grade
1045,"a"
2391,"b"
8723,"c"
1092,"a"
1045,"a"
1045,"a"
1045,"a"
1045,"a"
1045,"a"
1045,"a"
1045,"a"
2391,"b"
8723,"c"
1092,"a"
2391,"b"
1045,"a"
1045,"a"
2391,"b"
8723,"c"
1092,"a"
2391,"b"
8723,"c"
1092,"a"
8723,"c"
1092,"a"
2391,"b"
8723,"c"
1092,"a"
2391,"b"
8723,"c"
1092,"a"
2391,"b"
8723,"c"
1092,"a"
2391,"b"
8723,"c"
1092,"a"
2391,"b"
8723,"c"
1092,"a"
"""
)

grades = pd.read_csv(data, verbose=True, engine="c")


Tokenization took: 0.01 ms
Type conversion took: 0.16 ms
Parser memory cleanup took: 0.00 ms


In [82]:
import io

import pandas as pd
import numpy as np


data = io.StringIO(
"""
id,birth,height,weight
129237,04/10/1999,5.4,126
123083,07/03/2000,6.1,150
123087,11/23/1989,4.5,111
"""
)

df = pd.read_csv(
    data,
    dtype={"id": np.int32, "height": np.float16, "weight": np.int16},
    parse_dates=["birth"],
    index_col=[0],
)

print(df)
print(df.memory_usage(deep=True))
print(df.dtypes)
print(df.index.dtype)

            birth    height  weight
id                                 
129237 1999-04-10  5.398438     126
123083 2000-07-03  6.101562     150
123087 1989-11-23  4.500000     111
Index     24
birth     24
height     6
weight     6
dtype: int64
birth     datetime64[ns]
height           float16
weight             int16
dtype: object
int64


In [84]:
import io
import pandas as pd
import numpy as np

data = io.StringIO(
"""
id,birth,height,weight
129237,04/10/1999,5.4,126
123083,unknown,6.1,150
123087,11/23/1989,4.5,111
"""
)

df = pd.read_csv(
    data,
    dtype={"id": np.int32, "height": np.float16, "weight": np.int16},
    parse_dates=["birth"],
    na_values=["unknown"],
    index_col=[0],
    encoding="utf-8"
)

print(df)
print(df.memory_usage(deep=True))
print(df.dtypes)
print(df.index.dtype)

            birth    height  weight
id                                 
129237 1999-04-10  5.398438     126
123083        NaT  6.101562     150
123087 1989-11-23  4.500000     111
Index     24
birth     24
height     6
weight     6
dtype: int64
birth     datetime64[ns]
height           float16
weight             int16
dtype: object
int64
