In [1]:
# From your virtual environment, install the pandas package for working with data frames:
# pip install pandas
# pandas is an open source Python library for data analysis
# A data frame allows us to read and access data stored in spreadsheet (.csv) format
# Good tutorial on pandas: http://pandas.pydata.org/pandas-docs/stable/10min.html
# Pandas Cookbook: http://pandas.pydata.org/pandas-docs/stable/tutorials.html
# Another good resource for pandas: http://chrisalbon.com/python/pandas_dataframe_importing_csv.htm

#Import the required packages
#Import package pandas for data analysis
import pandas as pd

# Import package numpy for numeric computing
import numpy as np

# Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt

#For showing plots directly in the notebook run the command below
%matplotlib inline

# For saving multiple plots into a single pdf file
from matplotlib.backends.backend_pdf import PdfPages

In [2]:
# Reading from a csv file, into a data frame.
# skipinitialspace=True skips spaces that directly follow a delimiter, so parsed
# fields (including the header names) do not start with stray whitespace.
# Read more about .read_csv() here: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
from pathlib import Path

# Locate the file in the current user's Downloads folder instead of hardcoding an
# absolute path that only exists on one account.
CENSUS_CSV_PATH = Path.home() / 'Downloads' / 'Census_Demographics_at_the_Neighborhood_Tabulation_Area__NTA__level_20250525.csv'
df = pd.read_csv(CENSUS_CSV_PATH, keep_default_na=True, delimiter=',', skipinitialspace=True)
# Display up to 100 rows before pandas truncates the rendered output
pd.set_option('display.max_rows', 100)
# Show the first 50 rows of the data frame
df.head(50)

Unnamed: 0,Geographic Area - Borough,Geographic Area - 2010 Census FIPS County Code,Geographic Area - Neighborhood Tabulation Area (NTA)* Code,Geographic Area - Neighborhood Tabulation Area (NTA)* Name,Total Population 2000 Number,Total Population 2010 Number,Total Population Change 2000-2010 Number,Total Population Change 2000-2010 Percent
0,Bronx,5.0,BX01,Claremont-Bathgate,28149.0,31078.0,2929.0,10.4
1,Bronx,5.0,BX03,Eastchester-Edenwald-Baychester,35422.0,34517.0,-905.0,-2.6
2,Bronx,5.0,BX05,Bedford Park-Fordham North,55329.0,54415.0,-914.0,-1.7
3,Bronx,5.0,BX06,Belmont,25967.0,27378.0,1411.0,5.4
4,Bronx,5.0,BX07,Bronxdale,34309.0,35538.0,1229.0,3.6
5,Bronx,5.0,BX08,West Farms-Bronx River,34542.0,35011.0,469.0,1.4
6,Bronx,5.0,BX09,Soundview-Castle Hill-Clason Point-Harding Park,50753.0,53686.0,2933.0,5.8
7,Bronx,5.0,BX10,Pelham Bay-Country Club-City Island,27140.0,26583.0,-557.0,-2.1
8,Bronx,5.0,BX13,Co-Op City,40676.0,43752.0,3076.0,7.6
9,Bronx,5.0,BX14,East Concourse-Concourse Village,58961.0,62284.0,3323.0,5.6


In [3]:
# Check how many rows and columns this dataframe has.
# .shape is a (number of rows, number of columns) tuple.
df.shape

(197, 8)

In [4]:
# Show the dtype pandas assigned to each column when reading the file
df.dtypes

Geographic Area - Borough                                      object
Geographic Area - 2010 Census FIPS County Code                float64
Geographic Area - Neighborhood Tabulation Area (NTA)* Code     object
Geographic Area - Neighborhood Tabulation Area (NTA)* Name     object
Total Population 2000 Number                                  float64
Total Population 2010 Number                                  float64
Total Population Change 2000-2010 Number                      float64
Total Population Change 2000-2010 Percent                     float64
dtype: object

In [5]:
# Another way to get a summary of the columns: .info() prints the non-null count
# and dtype of every column, which also reveals where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197 entries, 0 to 196
Data columns (total 8 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   Geographic Area - Borough                                   197 non-null    object 
 1   Geographic Area - 2010 Census FIPS County Code              195 non-null    float64
 2   Geographic Area - Neighborhood Tabulation Area (NTA)* Code  195 non-null    object 
 3   Geographic Area - Neighborhood Tabulation Area (NTA)* Name  195 non-null    object 
 4   Total Population 2000 Number                                195 non-null    float64
 5   Total Population 2010 Number                                195 non-null    float64
 6   Total Population Change 2000-2010 Number                    195 non-null    float64
 7   Total Population Change 2000-2010 Percent                   194 non-null    float64
dtypes

In [6]:
# Look at the column names. They contain spaces inside the name,
# e.g., 'Total Population 2000 Number'.
df.columns

Index(['Geographic Area - Borough',
       'Geographic Area - 2010 Census FIPS County Code',
       'Geographic Area - Neighborhood Tabulation Area (NTA)* Code',
       'Geographic Area - Neighborhood Tabulation Area (NTA)* Name',
       'Total Population 2000 Number', 'Total Population 2010 Number',
       'Total Population Change 2000-2010 Number',
       'Total Population Change 2000-2010 Percent'],
      dtype='object')

In [7]:
# Remove every space character from the column labels (leading, trailing and
# internal), e.g. 'Total Population 2000 Number' -> 'TotalPopulation2000Number'.
# An alternative would be to substitute '_' for ' ' to keep the words separated.
df.columns = [label.replace(' ', '') for label in df.columns]

In [8]:
# Display the column names again to confirm the spaces were removed
df.columns

Index(['GeographicArea-Borough', 'GeographicArea-2010CensusFIPSCountyCode',
       'GeographicArea-NeighborhoodTabulationArea(NTA)*Code',
       'GeographicArea-NeighborhoodTabulationArea(NTA)*Name',
       'TotalPopulation2000Number', 'TotalPopulation2010Number',
       'TotalPopulationChange2000-2010Number',
       'TotalPopulationChange2000-2010Percent'],
      dtype='object')

In [9]:
# Descriptive statistics for the numeric columns, transposed so that each
# data frame column becomes one row of the summary table.
summary_stats = df.describe()
summary_stats.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
GeographicArea-2010CensusFIPSCountyCode,195.0,54.712821,28.320689,5.0,47.0,61.0,81.0,85.0
TotalPopulation2000Number,195.0,41068.092308,22569.317082,0.0,25635.0,35181.0,54178.5,136954.0
TotalPopulation2010Number,195.0,41923.758974,22282.970575,0.0,26172.0,36891.0,53896.0,132378.0
TotalPopulationChange2000-2010Number,195.0,855.666667,3292.093029,-10329.0,-700.0,455.0,2088.5,19611.0
TotalPopulationChange2000-2010Percent,194.0,5.213918,33.484918,-100.0,-1.85,1.5,6.875,349.5


In [11]:
# Select the rows whose FIPS county code is at least 200000.
# NOTE(review): the describe() summary reports a maximum of 85 for this column,
# so no row can satisfy this threshold - confirm the intended column/threshold.
fips_code = df['GeographicArea-2010CensusFIPSCountyCode']
df.loc[fips_code >= 200000]

Unnamed: 0,GeographicArea-Borough,GeographicArea-2010CensusFIPSCountyCode,GeographicArea-NeighborhoodTabulationArea(NTA)*Code,GeographicArea-NeighborhoodTabulationArea(NTA)*Name,TotalPopulation2000Number,TotalPopulation2010Number,TotalPopulationChange2000-2010Number,TotalPopulationChange2000-2010Percent


In [12]:
# Collect the labels of the columns stored as 64-bit integers or 64-bit floats
numeric_dtypes = ['int64', 'float64']
numeric_columns = df.select_dtypes(include=numeric_dtypes).columns
numeric_columns

Index(['GeographicArea-2010CensusFIPSCountyCode', 'TotalPopulation2000Number',
       'TotalPopulation2010Number', 'TotalPopulationChange2000-2010Number',
       'TotalPopulationChange2000-2010Percent'],
      dtype='object')

In [18]:
from pathlib import Path

import pandas as pd

# Step 1: Reload the raw dataset.
# The file is looked up in the current user's Downloads folder instead of via a
# hardcoded absolute path that only exists on one account.
# NOTE: this rebinds `df`, so the space-free column names assigned earlier in the
# notebook are replaced by the original, spaced names from the CSV header.
# NOTE(review): unlike the first load, this call does not pass
# skipinitialspace=True - confirm the difference is intended.
CENSUS_CSV_PATH = Path.home() / "Downloads" / "Census_Demographics_at_the_Neighborhood_Tabulation_Area__NTA__level_20250525.csv"
df = pd.read_csv(CENSUS_CSV_PATH)

# Step 2: Trim leading/trailing whitespace from the column names.
# Spaces inside a name are kept, which the column lists below rely on.
df.columns = df.columns.str.strip()

# Step 3: (Optional) See all column names to double-check spelling
print("Column names in dataset:")
print(df.columns.tolist())

# Step 4: Define feature types after cleaning
categorical_cols = [
    'Geographic Area - Borough',
    'Geographic Area - Neighborhood Tabulation Area (NTA)* Name'
]

continuous_cols = [
    'Total Population 2000 Number',
    'Total Population 2010 Number',
    'Total Population Change 2000-2010 Number',
    'Total Population Change 2000-2010 Percent'
]

# Fail early, with a readable message, if the file does not have the expected
# columns (e.g. a different export of the dataset was downloaded).
missing_cols = [col for col in categorical_cols + continuous_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"Columns not found in dataset: {missing_cols}")

# Step 5: Print out data types of each group
print("\nCategorical Columns:")
print(df[categorical_cols].dtypes)

print("\nContinuous Columns:")
print(df[continuous_cols].dtypes)

Column names in dataset:
['Geographic Area - Borough', 'Geographic Area - 2010 Census FIPS County Code', 'Geographic Area - Neighborhood Tabulation Area (NTA)* Code', 'Geographic Area - Neighborhood Tabulation Area (NTA)* Name', 'Total Population 2000 Number', 'Total Population 2010 Number', 'Total Population Change 2000-2010 Number', 'Total Population Change 2000-2010 Percent']

Categorical Columns:
Geographic Area - Borough                                     object
Geographic Area - Neighborhood Tabulation Area (NTA)* Name    object
dtype: object

Continuous Columns:
Total Population 2000 Number                 float64
Total Population 2010 Number                 float64
Total Population Change 2000-2010 Number     float64
Total Population Change 2000-2010 Percent    float64
dtype: object
