## Making sense of columns

In [56]:
import seaborn as sns
import pandas as pd

### Rows & Columns

In [57]:
# List of lists without column names
data = [
    ['John', 25, 'NY'],
    ['Emma', 30, 'LA'],
    ['Alex', 22, 'SF'],
    ['Sarah', 35, 'CHI']
]

In [58]:
# Without columns parameter - pandas uses numbers as column names
df1 = pd.DataFrame(data)
print("Without columns parameter:")
print(f"Shape: {df1.shape} in the format: (rows, columns)")
print(df1)
print("\n")

Without columns parameter:
Shape: (4, 3) in the format: (rows, columns)
       0   1    2
0   John  25   NY
1   Emma  30   LA
2   Alex  22   SF
3  Sarah  35  CHI




In [59]:
# Same data WITH columns parameter
df2 = pd.DataFrame(data, columns=['Name', 'Age', 'City'])
print("With columns parameter:")
print(df2)
print("\n")

With columns parameter:
    Name  Age City
0   John   25   NY
1   Emma   30   LA
2   Alex   22   SF
3  Sarah   35  CHI




In [60]:
# Notice that columns only names the columns, not the index
print("The index is still numbers:")
print("Index:", df2.index)
print("Columns:", df2.columns)
print("\n")

The index is still numbers:
Index: RangeIndex(start=0, stop=4, step=1)
Columns: Index(['Name', 'Age', 'City'], dtype='object')




In [61]:
# If you want to change the index, that's separate:
df3 = pd.DataFrame(data, 
                  columns=['Name', 'Age', 'City'],
                  index=['A', 'B', 'C', 'D'])
print("With both custom columns and index:")
print(df3)
print("\n")

With both custom columns and index:
    Name  Age City
A   John   25   NY
B   Emma   30   LA
C   Alex   22   SF
D  Sarah   35  CHI




In [62]:
# When using dictionary method, columns parameter can reorder or select columns
dict_data = {
    'Name': ['John', 'Emma', 'Alex'],
    'Age': [25, 30, 22],
    'City': ['NY', 'LA', 'SF']
}

In [63]:
# Reorder columns
df4 = pd.DataFrame(dict_data, columns=['Age', 'Name', 'City'])
print("Reordered columns using columns parameter:")
print(df4)
print("\n")

Reordered columns using columns parameter:
   Age  Name City
0   25  John   NY
1   30  Emma   LA
2   22  Alex   SF




In [64]:
# Select only certain columns
df5 = pd.DataFrame(dict_data, columns=['Name', 'Age'])  # Omit 'City'
print("Selected subset of columns:")
print(df5)

Selected subset of columns:
   Name  Age
0  John   25
1  Emma   30
2  Alex   22


In [65]:
# Select just a single column will still create a DataFrame
df6 = pd.DataFrame(dict_data, columns=['Age'])  # Omit 'City'
print(type(df6))
print(df6.shape)
print("Selected subset of columns:")
print(df5)

<class 'pandas.core.frame.DataFrame'>
(3, 1)
Selected subset of columns:
   Name  Age
0  John   25
1  Emma   30
2  Alex   22


In [66]:
# ...but indexing with a column name will create a Series:
df7 = df5['Age']
print(type(df7))
print(df6.shape)
print(df7)

<class 'pandas.core.series.Series'>
(3, 1)
0    25
1    30
2    22
Name: Age, dtype: int64
