In [None]:
#import the pandas library. I like to use the full names of my libraries in my code so it is easier to read

import pandas as pandas

#import the pandas.api -- We will use this in one of the regular expression cleans later on
from pandas.api import types as pdtypes

# Load your csv file into a dataframe. Use any variable name you like

your_file = pandas.read_csv(filepath_or_buffer='file_path')

## Tell pandas to show every column whenever a dataframe is displayed,
## for example when the .head() method is called

pandas.set_option('display.max_columns', None)

# Preview the first rows with .head() to confirm the file loaded properly

your_file.head()

## This code block removes rows that don't meet a specific requirement.
## In this example you will see that I am removing any row where the canvass status is NOT "contact"

In [None]:
# The row filter is built step by step:
#   1. Compare the "canvas_status" column to "contact" to get a True/False mask
#      that is True for every row whose status is NOT "contact"
#   2. Use the mask to select those rows and pull their index labels
#   3. Hand the labels to .drop(), which removes the matching rows

is_not_contact = your_file['canvas_status'] != 'contact'
labels_to_drop = your_file[is_not_contact].index
your_file = your_file.drop(labels_to_drop)


# Let's see if it worked
your_file.head()


Let's break down how this worked! The .drop() method in pandas expects row labels (index values) or column names as its argument, not a boolean mask.

--> *df['column_name'] == 'specific_value'* creates a boolean mask where True indicates rows that meet the condition.
--> *df[df['column_name'] == 'specific_value']* selects these rows from the DataFrame.
--> *.index* retrieves the index labels of these selected rows.
--> *.drop()* drops the rows whose index labels were selected by the boolean mask

In [None]:
# Drop the columns you don't need.
# Every column you want to drop has to be named in the list
# (each name only needs to appear once).

columns_to_drop = [
    'column1',
    'column2',
    'column3',
]
your_file = your_file.drop(columns=columns_to_drop)

# Let's see if it worked
your_file.head()

In [None]:
# Rename the columns so the dataframe is easier to read.
# The .rename() method takes a dictionary that maps each old name to its new name.
# EXAMPLE: df = df.rename(columns={'old_name1': 'new_name1', 'old_name2': 'new_name2'})

column_renames = {
    'old_column_name1': 'new_column_name1',
    'old_column_name2': 'new_column_name2',
}
your_file = your_file.rename(columns=column_renames)

# Let's see if it worked
your_file.head()

# Sometimes I need to keep track of the original column names. For example,
    # my column names might be survey questions that I want to replace with neater names.
# In that case I create a separate file that stores each original column name next to its new name. Like a key!
# That way I can reference the key in the future!

In [None]:
# Keep the same old-name -> new-name pairs used for the .rename() method
# in a variable of their own, adding one pair per line

your_file_dictionary = {}
your_file_dictionary['question1'] = 'shortened_column_name1'
your_file_dictionary['question2'] = 'shortened_column_name2'

# Print the dictionary to check it was saved properly
print(your_file_dictionary)


In [None]:
# Now turn the dictionary from before into a dataframe we can save to csv.
# orient='index' makes each dictionary key (the original column name) a row label
# and puts its value (the new column name) in a single column.
# Naming that column and the index gives the saved key file readable headers
# instead of a blank header and a column called 0.

your_file_dictionary_dataframe = pandas.DataFrame.from_dict(
    your_file_dictionary,
    orient='index',
    columns=['new_column_name'],
).rename_axis('original_column_name')

your_file_dictionary_dataframe.head()


In [None]:
# Create a file with the column names that you can use later as a reference if needed.
# Give this file its own path: the cleaned data is saved at the end of the notebook,
# and reusing one path for both would overwrite this key file.

your_file_dictionary_dataframe.to_csv('column_key_file_path')

# Now we want to iterate through the data in each column to make sure it's formatted properly.
# In this example, we will only be removing special characters from number columns
# 

In [None]:
# Build the list of column names to clean.
## "column_list" holds the names of the columns you want to work on
## (here: every column in the dataframe). It is reused in later cells!

column_list = your_file.columns.tolist()

# Print the list to confirm the columns are there
print(column_list)


In [None]:
# The regex cleaning below only works on text, so first cast every
# listed column to an object/string type.

for column in column_list:
    if column not in your_file.columns:
        continue  # names that are not in the dataframe are skipped
    your_file[column] = your_file[column].astype(str)

# Use the .info() method to make sure it worked!
your_file.info()

In [None]:
# Now, let's create a function that will clean our data!

# Create a function called "clean_column" that takes one argument "column" and
    # first tests if the datatype is string
        # and processes a str.replace() method on the argument "column"
    # then returns the output as "column"
    ## It will only do this on columns whose data type is "string" a.k.a. "object" in pandas

def clean_column(column):
    """Strip square brackets and quote characters from a string column.

    Args:
        column: A pandas Series (one dataframe column).

    Returns:
        A new Series with every ``[``, ``]``, ``'`` and ``"`` removed when
        ``column`` has a string dtype; otherwise ``column`` itself, unchanged.
    """
    if not pandas.api.types.is_string_dtype(column):
        # Report the column by its label; formatting the Series itself would
        # dump every value it holds into the message.
        column_label = getattr(column, "name", None)
        print(f"Column {column_label} is not string type, skipping")
        return column

    # Use any regex values you need to serve your project
    return column.str.replace(r"[\[\]'\"]", "", regex=True)

*pandas.api.types* is a submodule of pandas that provides a collection of data type-related functions and utilities. Here's a breakdown:

**Namespace:** It's a way to organize related functionality within the pandas library.

**Purpose:** This submodule contains functions for working with, checking, and manipulating data types in pandas.

**Common uses**:
-->Checking data types of Series or DataFrame columns
-->Determining if a data type belongs to a certain category (e.g., numeric, string, etc.)
-->Converting between different data types

**Some common functions in this module:**

-->*is_numeric_dtype()*: Checks if a dtype is numeric
-->*is_datetime64_any_dtype()*: Checks if a dtype is any kind of datetime64 dtype
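-->*is_categorical_dtype()*: Checks if a dtype is of the Categorical type (deprecated in pandas 2.x; newer code should use *isinstance(dtype, pandas.CategoricalDtype)* instead)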
-->*is_string_dtype()*: Checks if a dtype is a string type

In [None]:
# Run the "clean_column()" function on every column named in "column_list"
    # A name that is not in the dataframe is reported with a warning and skipped
    # Every other column is replaced by its cleaned version

for column in column_list:
    if column not in your_file.columns:
        print(f"Warning: Column '{column}' not found in the DataFrame")
        continue
    your_file[column] = clean_column(your_file[column])

# Let's see if it worked
your_file.head()

In [None]:
# Turn the columns that hold numeric data back into integer type,
# since we are not working with floats.

columns_to_convert_to_int = ['column1', 'column2', 'column3']

# Convert one column at a time
for column in columns_to_convert_to_int:
    # errors='coerce' turns anything that cannot be parsed as a number into a
    # missing value; the nullable 'Int64' dtype can hold those next to integers
    as_numbers = pandas.to_numeric(your_file[column], errors='coerce')
    your_file[column] = as_numbers.astype('Int64')

# Let's make sure it worked
your_file.info()

In [None]:

# Preview the first rows of the dataframe to make sure the previous code worked
your_file.head()

In [None]:

# Now save your cleaned file to a new path.
# index=False leaves out the dataframe's row labels, which would otherwise be
# written to the csv as an extra unnamed first column.
your_file.to_csv('new_file_path', index=False)