# Fix Data Types
- Helps With **memory optimization**

In [9]:
import pandas as pd

df = pd.read_csv('data_MPG.csv', encoding='ascii')

In [10]:
# Lets see 10 random samples

print(df.sample(10))

      mpg  cylinders  displacement  horsepower  weight  acceleration  \
246  32.8          4          78.0        52.0    1985          19.4   
115  15.0          8         350.0       145.0    4082          13.0   
284  20.6          6         225.0       110.0    3360          16.6   
214  13.0          8         302.0       130.0    3870          15.0   
204  32.0          4          85.0        70.0    1990          17.0   
217  30.0          4         111.0        80.0    2155          14.8   
146  28.0          4          90.0        75.0    2125          14.5   
122  24.0          4         121.0       110.0    2660          14.0   
210  19.0          6         156.0       108.0    2930          15.5   
276  21.6          4         121.0       115.0    2795          15.7   

     model_year  origin                     name  
246          78   japan         mazda glc deluxe  
115          73     usa  chevrolet monte carlo s  
284          79     usa            dodge aspen 6  
214

### This dataset is widely used in data science and machine learning for regression and exploratory data analysis (EDA).

---

### Columns in the dataset:

1. **mpg**
   * Miles per gallon → the fuel efficiency of the car.
   * Higher values = better fuel economy.
   * This is usually treated as the **target variable** in regression tasks.



2. **cylinders**
   * Number of engine cylinders.
   * Common values: 3, 4, 5, 6, 8.
   * Higher cylinders generally mean more power but worse fuel economy.



3. **displacement**
   * Engine displacement in cubic inches (ci).
   * Larger displacement = larger engine size.
   * Typically correlates with horsepower and weight.



4. **horsepower**
   * Horsepower of the car (engine power).
   * Higher horsepower = faster acceleration, lower mpg usually.


5. **weight**
   * Vehicle weight (in pounds).
   * Heavier cars generally consume more fuel (lower mpg).


6. **acceleration**
   * Time taken to go from 0 to 60 mph (in seconds).
   * Lower values = faster acceleration.
   * Influenced by horsepower and weight.


7. **model\_year**
   * Last two digits of the car’s model year (70 = 1970, 81 = 1981).
   * Useful for observing trends over time (fuel efficiency improved in later years).


8. **origin**
   * Region where the car was manufactured:

     * 1 or `"usa"` → American cars
     * 2 or `"europe"` → European cars
     * 3 or `"japan"` → Japanese cars
   * Often used to compare car design philosophies across regions.

9. **name**
   * The full car name (make + model).
   * Example: `"ford thunderbird"`, `"mazda 626"`.

---


## Any duplicates: If found remove it.

In [11]:
# 1) Check for duplicate rows in df

num_duplicates = df.duplicated().sum()

print(num_duplicates)

0


In [12]:
#2) If found verify that they are duplicates.
# The boolean mask identifies all rows that have duplicates elsewhere

duplicate_rows = df[df.duplicated(keep=False)]

print(duplicate_rows)

Empty DataFrame
Columns: [mpg, cylinders, displacement, horsepower, weight, acceleration, model_year, origin, name]
Index: []


In [13]:
# 3) Keep the last duplicate occurrence

df.drop_duplicates(keep='last', inplace=True)

# Missing values

In [14]:
# step1: null values ?

print(df.isnull().sum())

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64


In [15]:
# step2: show me count of rows, columns

print(df.shape)

(398, 9)


In [16]:
# step3: percentage of data missing:

(6/398) * 100 

1.507537688442211

### Observation: Less than 2% are missing...so we can drop these...but I would not

In [17]:
# step4: I want to see which values are missing

print(df[df["horsepower"].isnull()])

      mpg  cylinders  displacement  horsepower  weight  acceleration  \
32   25.0          4          98.0         NaN    2046          19.0   
126  21.0          6         200.0         NaN    2875          17.0   
330  40.9          4          85.0         NaN    1835          17.3   
336  23.6          4         140.0         NaN    2905          14.3   
354  34.5          4         100.0         NaN    2320          15.8   
374  23.0          4         151.0         NaN    3035          20.5   

     model_year  origin                  name  
32           71     usa            ford pinto  
126          74     usa         ford maverick  
330          80  europe  renault lecar deluxe  
336          80     usa    ford mustang cobra  
354          81  europe           renault 18i  
374          82     usa        amc concord dl  


In [18]:
# step5: I will fill in the missing value with mean 

mean = df['horsepower'].mean()
df['horsepower'] = df['horsepower'].fillna(mean) # mean

print(df.isnull().sum())

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64


In [19]:
# (OPTIONAL): Other options

# STEP2:
# options to handle null values:

# Removal:
# df = df.dropna() # option rows with null values

# drop columns with NaN, if this column contains lots of Null
# df = df.dropna(axis=1) 


# Imputation:
# df['horsepower'] = df['horsepower'].fillna(7) # fill with constant


# median = df['horsepower'].median()
# df['horsepower'] = df['horsepower'].fillna(median) # → median

# df['horsepower'] = df['horsepower'].fillna(df['horsepower'].mode()[0]) # → mode


# Forward/Backward fill: Not shown here
# Advanced: use sklearn.impute (KNN, IterativeImputer) not shown here

# Fix data types

In [314]:
print(df.dtypes)

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin           object
name             object
dtype: object


## observation
When pandas loads data,
- In the beginning, pandas will use big memory consuming data types: 
- columns with decimal values -> float64
- columns with integer values -> int64
- columns with text values -> object

## Lets look at  the min and max value of our data: i.e. range

In [315]:
# This allows us to choose the right data type

print(df.describe().loc[['min', 'max']].T)

                 min     max
mpg              9.0    46.6
cylinders        3.0     8.0
displacement    68.0   455.0
horsepower      46.0   230.0
weight        1613.0  5140.0
acceleration     8.0    24.8
model_year      70.0    82.0


### observation:
- **mpg** ranges from min=9 to max=46.6.


And so on.

## Choose data type based on range of values in columns

# Cheatsheet

## For integer (whole numbers)

| Value Range         | Recommended dtype | When to use                     |
| ------------------- | ----------------- | ------------------------------- |
| 0 to 255            | `uint8`           | Flags, counts, small categories |
| -128 to 127         | `int8`            | Small signed values             |
| 0 to 65,535         | `uint16`          | IDs, years                      |
| -32,768 to 32,767   | `int16`           | Medium-range integers           |
| 0 to 4 billion      | `uint32`          | Large counts                    |
| Very large integers | `int64` (default) | Safe default                    |
---

## For float (decimal numbers)

| Precision Needed | Recommended dtype   | Example                 |
| ---------------- | ------------------- | ----------------------- |
| Low / medium     | `float32`           | MPG, prices, scores     |
| High precision   | `float64` (default) | Scientific calculations |


### Rule of thumb
- Use float32 unless you need very high precision
---


## For category

| Data pattern         | Recommended dtype | Example        |
| -------------------- | ----------------- | -------------- |
| Few repeated strings | `category`        | origin, color |
| Many unique strings  | `string`          | names, emails  |
---

## For Boolean (only 2 values)

| Values present | Recommended dtype |
| -------------- | ----------------- |
| True / False   | `bool`            |
| 0 / 1          | `bool`            |
| Yes / No       | map → `bool`      |
| Male / Female  | map → `bool`      |
---


### IDs & CODES (IMPORTANT TRAP)

| Looks like   | Actually is | dtype                  |
| ------------ | ----------- | ---------------------- |
| Numeric ID   | Identifier  | `string` or `category` |
| Zip code     | Label       | `string`               |
| Phone number | Label       | `string`               |

- **Never use int for IDs**

----
### A simple guide
```
Is it numeric?
 ├── No → category / string
 └── Yes
     ├── Whole numbers → int / uint
     └── Decimals → float32
```

### Lets check memory usage before we change data type

In [316]:
# I want to see the info: Include deep memory usage (actual memory for object dtypes)

print(df.info(memory_usage="deep"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 74.0 KB
None


In [317]:
## Memory usage of each columns

print(df.memory_usage(deep=True))

Index             128
mpg              3184
cylinders        3184
displacement     3184
horsepower       3184
weight           3184
acceleration     3184
model_year       3184
origin          24248
name            29092
dtype: int64


### MPG

In [318]:
# 1: range check

df['mpg'].min(), df['mpg'].max()

(9.0, 46.6)

In [319]:
# 2: Memory usage in bytes for mpg BEFORE changing datatype

print(df.memory_usage(deep=True)['mpg'])
print(df['mpg'].dtype)

3184
float64


In [320]:
# 3: conversion: For MPG, the range of values are from min=9.0 to max=46.6. 
# I do not need high precison here. So we change the datatype from float64 -> float16

df['mpg'] = df['mpg'].astype('float16')

In [321]:
# 4: Memory usage in bytes for HorsePower AFTER changing datatype

df.memory_usage(deep=True)['mpg']

796

In [322]:
# 5: percent reduction ? 75%
(3184 - 796)/3184

0.75

In [323]:
# 6: I want to see the info: Include deep memory usage (actual memory for object dtypes)

print(df.info(memory_usage="deep"))

# obser: There is small drop in memory consumption: 74 -> 71.6 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float16
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float16(1), float64(3), int64(3), object(2)
memory usage: 71.6 KB
None


### cylinders

In [324]:
# 1) Now we focus on cylinders: It seems that this has finite numbers and is categorical
# Value counts of cylinder

print(df['cylinders'].value_counts())

cylinders
4    204
8    103
6     84
3      4
5      3
Name: count, dtype: int64


In [325]:
# 2) Memory usage in bytes for cylinders BEFORE changing datatype

print(df.memory_usage(deep=True)['cylinders'])
print(df['cylinders'].dtype)

3184
int64


In [326]:
# 3) For cylinders, the range of values are from min=3 to max=8 
# So change datatype from int64 -> uint8. 
# Do not change this to category, especially if you want to use ML model later.

df['cylinders'] = df['cylinders'].astype('uint8') # memory usage 3184 -> 398 bytes
# df['cylinders'] = df['cylinders'].astype('category') # memory usage 3184 -> 610 bytes 

In [327]:
# 4) Memory usage in bytes for HorsePower AFTER changing datatype

df.memory_usage(deep=True)['cylinders']

398

In [328]:
# How much reduction ? 87%
(3184-398)/3184

0.875

In [329]:
# I want to see the info: Include deep memory usage (actual memory for object dtypes)

print(df.info(memory_usage="deep"))

# obser: There is small drop in memory consumption
# Memory usage went down: 74 -> 68.9

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float16
 1   cylinders     398 non-null    uint8  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float16(1), float64(3), int64(2), object(2), uint8(1)
memory usage: 68.9 KB
None


### displacement

In [330]:
# 1: range check

df['displacement'].min(), df['displacement'].max()

(68.0, 455.0)

In [331]:
# 2: Memory usage in bytes for displacement BEFORE changing datatype

print(df.memory_usage(deep=True)['displacement'])
print(df['displacement'].dtype)

3184
float64


In [332]:
# 3: For displacement, the range of values are from min=68 to max=455. All values are whole numbers

df['displacement'] = df['displacement'].astype('int16')

In [333]:
# 4: Memory usage in bytes for displacement AFTER changing datatype

df.memory_usage(deep=True)['displacement']

796

In [334]:
# I want to see the info: Include deep memory usage (actual memory for object dtypes)

print(df.info(memory_usage="deep"))

# obser: There is small drop in memory consumption
# Memory usage went down: 74 -> 66.6

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float16
 1   cylinders     398 non-null    uint8  
 2   displacement  398 non-null    int16  
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float16(1), float64(2), int16(1), int64(2), object(2), uint8(1)
memory usage: 66.6 KB
None


### horsepower

In [335]:
# 1: range check

df['horsepower'].min(), df['horsepower'].max()

(46.0, 230.0)

In [336]:
# 2: Memory usage in bytes for HorsePower before changing datatype.

print(df.memory_usage(deep=True)['horsepower'])
print(df['horsepower'].dtype)

3184
float64


In [337]:
# 3: For horsepower, the range of values are from min=46 to max=230. All values are whole numbers
# I know from domain knowledge that values need would be whole numbers.

df['horsepower'] = df['horsepower'].astype("uint8") # 398 bytes
# df['horsepower'] = pd.to_numeric(df['horsepower'], downcast='integer') # This would not work here

In [338]:
# 4: Memory usage in bytes for HorsePower AFTER changing datatype

df.memory_usage(deep=True)['horsepower']

398

In [339]:
# I want to see the info: Include deep memory usage (actual memory for object dtypes)

print(df.info(memory_usage="deep"))

# obser: There is small drop in memory consumption
# Memory usage went down: -> 64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float16
 1   cylinders     398 non-null    uint8  
 2   displacement  398 non-null    int16  
 3   horsepower    398 non-null    uint8  
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float16(1), float64(1), int16(1), int64(2), object(2), uint8(2)
memory usage: 63.9 KB
None


### weight

In [340]:
# 1: range check

df['weight'].min(), df['weight'].max()

(1613, 5140)

In [341]:
# 2: Memory usage in bytes BEFORE changing datatype

df.memory_usage(deep=True)['weight']

3184

In [342]:
# 3: For weight, the range of values are from min=1613 to max=5140 

df['weight'] = df['weight'].astype('int16')

In [343]:
# 4: Memory usage in bytes AFTER changing datatype

df.memory_usage(deep=True)['weight']

796

In [344]:
# 5: I want to see the info: Include deep memory usage (actual memory for object dtypes)

print(df.info(memory_usage="deep"))

# obser: There is small drop in memory consumption
# Memory usage went down: -> 61.5 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float16
 1   cylinders     398 non-null    uint8  
 2   displacement  398 non-null    int16  
 3   horsepower    398 non-null    uint8  
 4   weight        398 non-null    int16  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float16(1), float64(1), int16(2), int64(1), object(2), uint8(2)
memory usage: 61.5 KB
None


### acceleration

In [345]:
# 1: Range check

df['acceleration'].min(), df['acceleration'].max()

(8.0, 24.8)

In [346]:
# 2: Memory usage in bytes BEFORE changing datatype

df.memory_usage(deep=True)['acceleration']

3184

In [347]:
# 3: For acceleration, the range of values are from min=8.0 to max=24.8
# change datatype to float16

df['acceleration'] = df['acceleration'].astype('float16')

In [348]:
# 4: Memory usage in bytes AFTER changing datatype

df.memory_usage(deep=True)['acceleration']

796

In [349]:
# 5: I want to see the info: Include deep memory usage (actual memory for object dtypes)

print(df.info(memory_usage="deep"))

# obser: There is small drop in memory consumption
# Memory usage went down: -> 59.2 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float16
 1   cylinders     398 non-null    uint8  
 2   displacement  398 non-null    int16  
 3   horsepower    398 non-null    uint8  
 4   weight        398 non-null    int16  
 5   acceleration  398 non-null    float16
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float16(2), int16(2), int64(1), object(2), uint8(2)
memory usage: 59.2 KB
None


### model_year

In [350]:
# 1: Range check

print(df['model_year'].min(), df['model_year'].max())

70 82


In [351]:
# 2: Memory usage in bytes BEFORE changing datatype

df.memory_usage(deep=True)['model_year']

3184

In [352]:
# 3: For model_year, the range of values are from min=70 to max=82

df['model_year'] = df['model_year'].astype('int8')

In [353]:
# 4: Memory usage in bytes AFTER changing datatype

df.memory_usage(deep=True)['model_year']

398

In [354]:
# 5: I want to see the info: Include deep memory usage (actual memory for object dtypes)

print(df.info(memory_usage="deep"))

# obser: There is small drop in memory consumption
# Memory usage went down: -> 56.5 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float16
 1   cylinders     398 non-null    uint8  
 2   displacement  398 non-null    int16  
 3   horsepower    398 non-null    uint8  
 4   weight        398 non-null    int16  
 5   acceleration  398 non-null    float16
 6   model_year    398 non-null    int8   
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float16(2), int16(2), int8(1), object(2), uint8(2)
memory usage: 56.5 KB
None


### origin

In [355]:
# 1: Fix data type for column origin. It is presently object.
# step1: Check value counts of origin 

print(df["origin"].value_counts())

origin
usa       249
japan      79
europe     70
Name: count, dtype: int64


In [356]:
# 2: Memory usage in bytes BEFORE changing datatype

df.memory_usage(deep=True)['origin']

24248

In [357]:
# 3: Change origin data type from object -> category

df['origin'] = df['origin'].astype('category')

In [358]:
# 4: Memory usage in bytes AFTER changing datatype

print(df.memory_usage(deep=True)['origin'])

691


In [359]:
# 5: I want to see the info: Include deep memory usage (actual memory for object dtypes)

print(df.info(memory_usage="deep"))

# obser: There is small drop in memory consumption
# Memory usage went down: -> 34 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   mpg           398 non-null    float16 
 1   cylinders     398 non-null    uint8   
 2   displacement  398 non-null    int16   
 3   horsepower    398 non-null    uint8   
 4   weight        398 non-null    int16   
 5   acceleration  398 non-null    float16 
 6   model_year    398 non-null    int8    
 7   origin        398 non-null    category
 8   name          398 non-null    object  
dtypes: category(1), float16(2), int16(2), int8(1), object(1), uint8(2)
memory usage: 33.5 KB
None


In [360]:
# How much saving ? 54 %
(74 - 33.5)/74

0.5472972972972973

# Side Note: This can be automated
- Here I can automate the process that converts all object columns to category if number of uniques values in that column is less than 50 % of total values.

In [371]:
# Convert object to Categorical Text Columns

obj_cols = df.select_dtypes(include=['object']).columns

for col in obj_cols:
    num_unique = df[col].nunique()
    num_total = len(df[col])
    if num_unique / num_total < 0.5:  # heuristic: if < 50% unique
        print(f"Converting {col} to datatype category")
        df[col] = df[col].astype('category')


# print(df.memory_usage(deep=True))
# print(f"Memory now: {df.memory_usage(deep=True).sum() / 1024:.2f} KB")

### name

In [362]:
print(df["name"].value_counts())

name
ford pinto             6
toyota corolla         5
amc matador            5
ford maverick          5
chevrolet chevette     4
                      ..
chevrolet monza 2+2    1
ford mustang ii        1
pontiac astro          1
amc pacer              1
chevy s-10             1
Name: count, Length: 305, dtype: int64


In [363]:
# 1: Drop the column name because it provides no useful information

df.drop(columns=['name'], inplace=True)

In [364]:
# 2: I want to see the info: Include deep memory usage (actual memory for object dtypes)

print(df.info(memory_usage="deep"))

# obser: There is small drop in memory consumption
# Memory usage went down: -> 5.1 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   mpg           398 non-null    float16 
 1   cylinders     398 non-null    uint8   
 2   displacement  398 non-null    int16   
 3   horsepower    398 non-null    uint8   
 4   weight        398 non-null    int16   
 5   acceleration  398 non-null    float16 
 6   model_year    398 non-null    int8    
 7   origin        398 non-null    category
dtypes: category(1), float16(2), int16(2), int8(1), uint8(2)
memory usage: 5.1 KB
None


In [365]:
print(df.sample(10))

           mpg  cylinders  displacement  horsepower  weight  acceleration  \
115  15.000000          8           350         145    4082     13.000000   
308  33.500000          4           151          90    2556     13.203125   
79   26.000000          4            96          69    2189     18.000000   
138  14.000000          8           318         150    4457     13.500000   
259  20.796875          6           200          85    3070     16.703125   
102  26.000000          4            97          46    1950     21.000000   
39   14.000000          8           400         175    4464     11.500000   
13   14.000000          8           455         225    3086     10.000000   
321  32.187500          4           108          75    2265     15.203125   
85   13.000000          8           350         175    4100     13.000000   

     model_year  origin  
115          73     usa  
308          79     usa  
79           72  europe  
138          74     usa  
259          78     us

### No boolean data type. So do not work on this.

In [366]:
# We are done: Lets check how much memory saved: 93 %

(74 - 5.1) / 74

0.9310810810810811

## Saving the dataframe with data types

In [367]:
# (WRONG WAY) I want to save all the data and also its data type.
# Following would not do this.

df.to_csv('test.csv') # This is WRONG

In [368]:
# (RIGHT WAY) I save this in pickle file so I can preserve the data structure, data types, etc
# I save the dataset in a pickle file and this way when I load it
# back again  for bivariate analysis, it would preserve the data types

df.to_pickle('data_MPG_clean.pkl')

In [369]:
# Now Later you can load the above pickle file. It would remember the data types.
# You can perform univariate, bivariate, etc analysis

df2 = pd.read_pickle('data_MPG_clean.pkl')

print(df2.info(memory_usage="deep"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   mpg           398 non-null    float16 
 1   cylinders     398 non-null    uint8   
 2   displacement  398 non-null    int16   
 3   horsepower    398 non-null    uint8   
 4   weight        398 non-null    int16   
 5   acceleration  398 non-null    float16 
 6   model_year    398 non-null    int8    
 7   origin        398 non-null    category
dtypes: category(1), float16(2), int16(2), int8(1), uint8(2)
memory usage: 5.0 KB
None


# STOP