In [1]:
# Why dtypes matter: "text" vs. "numbers"
# A very common data-science headache is a value that looks numeric but is stored as text.
#   What you see:          100    (looks like a number)
#   What the computer has: "100"  (a string, just like any other word)
# Arithmetic on strings does not do what you expect:
#   Numbers: 100 + 100     = 200
#   Strings: "100" + "100" = "100100"   (the two strings are simply glued together)
# .astype() is how you tell pandas: "treat this column as numbers, not words."

import pandas as pd

# Deliberately "wrong" data: every Price and Tax value is a quoted string.
data = dict(
    Product=["Laptop", "Mouse", "Keyboard"],
    Price=["50000", "1500", "3000"],
    Tax=["50.5", "10.2", "20.0"],
)
df = pd.DataFrame(data)

# Show the frame first, then the dtype pandas picked for each column.
print("--- Original Data ---", df, "\n--- Check the Types ---", df.dtypes, sep="\n")

--- Original Data ---
    Product  Price   Tax
0    Laptop  50000  50.5
1     Mouse   1500  10.2
2  Keyboard   3000  20.0

--- Check the Types ---
Product    object
Price      object
Tax        object
dtype: object


In [2]:
# Observation: in the df.dtypes output, Price and Tax are both `object`,
# which is how pandas reports text/string columns.
# Calling df["Price"].sum() on such a column will not add the prices up:
# it either fails or returns a nonsensical result.

# Converting to integer (int)
# Integers are whole numbers (no decimal part), which fits the "Price" column.
# Pattern: df["col"] = df["col"].astype(int)

# Swap the text column for its integer version.
df["Price"] = df["Price"].astype(int)
print(f"Price Type is now: {df['Price'].dtype}")

# With a numeric dtype, arithmetic behaves as expected.
print(f"Total Price: {df['Price'].sum()}")

Price Type is now: int64
Total Price: 54500


In [3]:
# Converting to float
# Floats are numbers with a decimal part. "Tax" holds values such as 50.5,
# so int is the wrong target (the .5 could not be kept); float is the right one.

# Swap the text column for its float version.
df["Tax"] = df["Tax"].astype(float)

# Report the new dtype, then a statistic that only makes sense on numbers.
print(f"Tax Type is now: {df['Tax'].dtype}")
print(f"Average Tax: {df['Tax'].mean()}")

Tax Type is now: float64
Average Tax: 26.900000000000002


In [4]:
# Converting to string (str)
# Sometimes you need the opposite direction. Take a "Year" column (2020, 2021, ...):
# an "average year" is meaningless, so the year is better handled as a
# label/category than as a quantity.

# Start from a numeric year column...
df["Year"] = list(range(2020, 2023))

# ...then turn it into text.
df["Year"] = df["Year"].astype(str)

# The dtype shows up as `object`, which is how pandas reports text columns here.
print(f"Year Type is now: {df['Year'].dtype}")

Year Type is now: object


In [5]:
# Crucial warning: the "dirty data" trap
# .astype() is strict, not magic. If any value in the column is not a clean number,
# .astype(int) raises an error instead of guessing.
# Example scenario: the price is stored as "$500" instead of "500".
#   You:    df_dirty["Cost"].astype(int)
#   pandas: ValueError: invalid literal for int() with base 10: '$500'
#           (it does not know how to turn the "$" symbol into a number)
# The fix: clean the text first with .str.replace(), THEN convert.

# Dirty data: every value carries a leading "$".
df_dirty = pd.DataFrame({"Cost": ["$500", "$200", "$100"]})

# 1. Remove the "$" symbol.
#    regex=False is spelled out on purpose: "$" is a regex metacharacter (end-of-string
#    anchor) and the default value of `regex` differs between pandas versions, so being
#    explicit guarantees a literal "$" is what gets removed.
df_dirty["Cost"] = df_dirty["Cost"].str.replace("$", "", regex=False)

# 2. NOW the strings are pure digits, so the integer conversion succeeds.
df_dirty["Cost"] = df_dirty["Cost"].astype(int)

print(df_dirty)
print(df_dirty.dtypes)

   Cost
0   500
1   200
2   100
Cost    int64
dtype: object
